import dask
import logging
import numpy as np
from collections import OrderedDict
from ml4chem.atomistic.models.kernelridge import KernelRidge
from scipy.linalg import cholesky
logger = logging.getLogger()
[docs]class GaussianProcess(KernelRidge):
"""Gaussian Process Regression
This method is based on the KernelRidge regression class of ML4Chem.
Parameters
----------
sigma : float, list, or dict
Length scale of the Gaussian in the case of RBF, exponential, and
laplacian kernels. Default is 1. (float) and it computes isotropic
kernels. Pass a list if you would like to compute anisotropic kernels,
or a dictionary if you want sigmas for each model.
Example:
>>> sigma={'energy': {'H': value, 'O': value},
'forces': {'H': {0: value, 1: value, 2: value},
'O': {0: value, 1: value, 2: value}}}
`value` can be a float or a list.
kernel : str
Choose the kernel. Available kernels are: 'linear', 'rbf', 'laplacian',
and 'exponential'. Default is 'rbf'.
lamda : float, or dictionary
Strength of the regularization. If you pass a dictionary then force and
energy will have different regularization:
>>> lamda = {'energy': value, 'forces': value}
Dictionaries are only used when performing Cholesky factorization.
trainingimages : str
Path to Trajectory file containing the images in the training set. This
is useful for predicting new structures.
cholesky : bool
Whether or not we are using Cholesky decomposition to determine the
weights. This method returns an unique set of regression coefficients.
weights_independent : bool
Whether or not the weights are going to be split for energy and forces.
forcetraining : bool
Turn force training true.
nnpartition : str
Use per-atom energy partition from a neural network calculator.
You have to set the path to .amp file. Useful for energy training with
Cholesky factorization. Default is set to None.
scheduler : str
The scheduler to be used with the dask backend.
sum_rule : bool
Whether or not we sum of fingerprintprime elements over a given axis.
This applies np.sum(fingerprint_list, axis=0).
batch_size : int
Number of elements per batch in order to split computations. Useful
when number of local chemical environments is too large.
weights : dict
Dictionary of weights.
Notes
-----
This regressor applies the atomic decomposition Ansatz (ADA). For
more information check the Notes on the KernelRidge class.
"""
NAME = "GaussianProcess"
def __init__(
self,
sigma=1.0,
kernel="rbf",
scheduler="distributed",
lamda=1e-5,
trainingimages=None,
checkpoints=None,
cholesky=True,
weights_independent=True,
forcetraining=False,
nnpartition=None,
sum_rule=True,
batch_size=None,
weights=None,
):
super(KernelRidge, self).__init__()
np.set_printoptions(precision=30, threshold=999999999)
self.kernel = kernel
self.sigma = sigma
self.scheduler = scheduler
self.lamda = lamda
self.batch_size = batch_size
# Let's add parameters that are going to be stored in the .params json
# file.
self.params = OrderedDict()
self.params["name"] = self.name()
self.params["type"] = "svm"
self.params["class_name"] = self.__class__.__name__
# This is a very general way of not forgetting to save variables
_params = vars()
# Delete useless variables
delete = ["self", "__class__"]
for param in delete:
del _params[param]
for k, v in _params.items():
if v is not None:
self.params[k] = v
# Everything that is added here is not going to be part of the json
# params file
self.fingerprint_map = []
if weights is None:
self.weights = {}
else:
self.weights = weights
[docs] def get_potential_energy(self, features, reference_space, purpose):
"""Get potential energy with Kernel Ridge
Parameters
----------
features : dict
Dictionary with hash and features.
reference_space : array
Array with reference feature space.
purpose : str
Purpose of this function: 'training', 'inference'.
Returns
-------
energy, variance
Energy of a molecule and its respective variance.
"""
try:
reference_space = reference_space[b"reference_space"]
except KeyError:
reference_space = reference_space["reference_space"]
computations = self.get_kernel_matrix(features, reference_space, purpose)
kernel = np.array(dask.compute(*computations, scheduler=self.scheduler))
weights = np.array(self.weights["energy"])
dim = int(kernel.shape[0] / weights.shape[0])
kernel = kernel.reshape(dim, len(weights))
energy_per_atom = np.dot(kernel, weights)
energy = np.sum(energy_per_atom)
variance = self.get_variance(features, kernel, reference_space, purpose)
return energy, variance
[docs] def get_variance(self, features, ks, reference_space, purpose):
"""Compute predictive variance
Parameters
----------
features : dict
Dictionary with data point to be predicted.
ks : array
Variance between data point and reference space.
reference_space : list
Reference space used to compute kernel.
purpose : str
Purpose of this function: 'training', 'inference'.
Returns
-------
variance
Predictive variance.
"""
K = self.get_kernel_matrix(reference_space, reference_space, purpose)
K = np.array(dask.compute(*K, scheduler=self.scheduler))
dim = int(np.sqrt(len(K)))
K = K.reshape(dim, dim)
if isinstance(self.lamda, dict):
lamda = self.lamda["energy"]
else:
lamda = self.lamda
size = K.shape[0]
I_e = np.identity(size)
cholesky_U = cholesky((K + (lamda * I_e)))
betas = np.linalg.solve(cholesky_U.T, ks.T)
variance = ks.dot(np.linalg.solve(cholesky_U, betas))
kxx = self.get_kernel_matrix(features, features, purpose)
kxx = np.array(dask.compute(*kxx, scheduler=self.scheduler))
dim = int(np.sqrt(len(kxx)))
kxx = kxx.reshape(dim, dim)
variance = np.sum(kxx - variance)
return variance