Source code for ml4chem.atomistic.models.kernelridge

import dask
import dask.distributed
import datetime
import logging
import time
import numpy as np
from ml4chem.utils import convert_elapsed_time, get_chunks
from collections import OrderedDict
from scipy.linalg import cholesky

logger = logging.getLogger()


class KernelRidge(object):
    """Kernel Ridge Regression

    Parameters
    ----------
    sigma : float, list, or dict
        Length scale of the Gaussian in the case of RBF, exponential, and
        laplacian kernels. Default is 1. (float) and it computes isotropic
        kernels. Pass a list if you would like to compute anisotropic
        kernels, or a dictionary if you want sigmas for each model.

        Example:

        >>> sigma={'energy': {'H': value, 'O': value},
                   'forces': {'H': {0: value, 1: value, 2: value},
                              'O': {0: value, 1: value, 2: value}}}

        `value` can be a float or a list.
    kernel : str
        Choose the kernel. Available kernels are: 'linear', 'rbf',
        'laplacian', and 'exponential'. Default is 'rbf'.
    lamda : float, or dict
        Strength of the regularization. If you pass a dictionary, then
        forces and energy will have different regularization:

        >>> lamda = {'energy': value, 'forces': value}

        Dictionaries are only used when performing Cholesky factorization.
    trainingimages : str
        Path to the trajectory file containing the images in the training
        set. This is useful for predicting new structures.
    cholesky : bool
        Whether or not to use Cholesky decomposition to determine the
        weights. This method returns a unique set of regression
        coefficients.
    weights_independent : bool
        Whether or not the weights are going to be split for energy and
        forces.
    forcetraining : bool
        Set to True to turn on force training.
    nnpartition : str
        Use a per-atom energy partition from a neural network calculator.
        You have to set the path to the .amp file. Useful for energy
        training with Cholesky factorization. Default is None.
    scheduler : str
        The scheduler to be used with the dask backend.
    sum_rule : bool
        Whether or not we sum fingerprintprime elements over a given axis.
        This applies np.sum(fingerprint_list, axis=0).
    batch_size : int
        Number of elements per batch used to split computations. Useful
        when the number of local chemical environments is too large.
    weights : dict
        Dictionary of weights.

    Notes
    -----
    When training total energies, we need to apply either an atomic
    decomposition Ansatz (ADA) or an energy partition scheme to the
    training set. ADA can be achieved based on Ref. 1. For an explanation
    of what it does, see the Master thesis by Sonja Mathias:
    http://wissrech.ins.uni-bonn.de/teaching/master/masterthesis_mathias_revised.pdf

    ADA is the default way of training total energies in this KernelRidge
    class.

    An energy partition scheme for total energies can be obtained from an
    artificial neural network or from methods such as the interacting
    quantum atoms theory (IQA). The nnpartition mode lets users provide the
    path to a NN calculator, from which we take the per-atom energies with
    the function .calculate_atomic_energy(). The strategy would be to train
    the NN with a very tight convergence criterion (1e-6 RMSE); calling
    .calculate_atomic_energy() then gives the atomic energies for that
    training set.

    Forces are a different story, because we do know the derivative of the
    energy with respect to atom positions (a per-atom quantity). For them
    we rely on the algorithm shown by Rupp in Ref. 2.

    References
    ----------
    1. Bartók, A. P. & Csányi, G. Gaussian approximation potentials: A
       brief tutorial introduction. Int. J. Quantum Chem. 115, 1051–1057
       (2015).
    2. Rupp, M. Machine learning for quantum mechanics in a nutshell. Int.
       J. Quantum Chem. 115, 1058–1073 (2015).
    """

    NAME = "KernelRidge"
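    # A minimal usage sketch of this class; `features`, `references`,
    # `data_handler`, and `energies` are hypothetical objects produced
    # upstream by the ml4chem pipeline:
    #
    #     >>> model = KernelRidge(kernel="rbf", sigma=1.0, lamda=1e-5)
    #     >>> model.prepare_model(features, references, data=data_handler)
    #     >>> model.train(inputs=features, targets=energies)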
    @classmethod
    def name(cls):
        """Returns name of class"""
        return cls.NAME
    def __init__(
        self,
        sigma=1.0,
        kernel="rbf",
        scheduler="distributed",
        lamda=1e-5,
        trainingimages=None,
        checkpoints=None,
        cholesky=True,
        weights_independent=True,
        forcetraining=False,
        nnpartition=None,
        sum_rule=True,
        batch_size=None,
        weights=None,
        **kwargs
    ):
        np.set_printoptions(precision=30, threshold=999999999)
        self.kernel = kernel
        self.sigma = sigma
        self.scheduler = scheduler
        self.lamda = lamda
        self.batch_size = batch_size

        # Let's add parameters that are going to be stored in the .params
        # json file.
        self.params = OrderedDict()
        self.params["name"] = self.name()
        self.params["type"] = "svm"
        self.params["class_name"] = self.__class__.__name__

        # This is a very general way of not forgetting to save variables
        _params = vars()

        # Delete useless variables
        del _params["self"]

        for k, v in _params.items():
            if v is not None:
                self.params[k] = v

        # Everything that is added below is not going to be part of the json
        # params file.
        self.fingerprint_map = []

        if weights is None:
            self.weights = {}
        else:
            self.weights = weights
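    # For reference, `vars()` called with no argument inside a function
    # returns its local namespace, which is how every keyword argument above
    # lands in self.params without being listed one by one. A minimal,
    # self-contained sketch of the same trick (names here are made up):
    #
    #     >>> def f(a=1, b=None):
    #     ...     _params = vars()
    #     ...     return {k: v for k, v in _params.items() if v is not None}
    #     >>> f(a=2)
    #     {'a': 2}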
    def prepare_model(
        self, feature_space, reference_features, data=None, purpose="training"
    ):
        """Prepare the Kernel Ridge Regression model

        Parameters
        ----------
        feature_space : dict
            A dictionary with hash, fingerprint structure.
        reference_features : dict
            A dictionary with raveled tuples of symbol, atomic fingerprint.
        data : object
            Data object created from the handler.
        purpose : str
            Purpose of this model: 'training', 'inference'.

        Notes
        -----
        This method builds the atomic kernel matrices and the LT vectors
        needed to apply the atomic decomposition Ansatz.
        """
        if self.batch_size is None:
            self.batch_size = data.get_total_number_atoms()

        if purpose == "training":
            now = datetime.datetime.now()
            logger.info(" ")
            logger.info("Model")
            logger.info("=====")
            logger.info(
                "Module accessed on {}.".format(now.strftime("%Y-%m-%d %H:%M:%S"))
            )
            logger.info("Model name: {}.".format(self.name()))
            logger.info("Kernel parameters:")
            logger.info("    - Kernel function: {}.".format(self.kernel))
            logger.info("    - Sigma: {}.".format(self.sigma))
            logger.info("    - Lamda: {}.".format(self.lamda))
            logger.info(" ")

        dim = len(reference_features)

        """
        Atomic kernel matrices
        """
        logger.info("Computing Kernel Matrix...")

        # We start populating computations with delayed functions to operate
        # with dask's scheduler.
        kernel_matrix = self.get_kernel_matrix(
            feature_space, reference_features, purpose=purpose
        )

        # Reconstruct the full, symmetric kernel matrix from its upper
        # triangular part. FIXME probably not very efficient yet.
        # Found at https://stackoverflow.com/a/36250972/1995261
        _K = np.zeros([dim, dim])
        indices_upper = np.triu_indices(dim)
        _K[indices_upper] = kernel_matrix
        self.K = _K.T + _K
        np.fill_diagonal(self.K, np.diag(_K))
        del _K
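    # The symmetrization at the end of prepare_model can be checked in
    # isolation. A minimal, self-contained sketch (pure NumPy, values are
    # made up):
    #
    #     >>> import numpy as np
    #     >>> dim = 3
    #     >>> upper = [1.0, 0.5, 0.2, 1.0, 0.3, 1.0]  # K[i, j] for i <= j
    #     >>> _K = np.zeros([dim, dim])
    #     >>> _K[np.triu_indices(dim)] = upper
    #     >>> K = _K.T + _K
    #     >>> np.fill_diagonal(K, np.diag(_K))
    #     >>> np.allclose(K, K.T)
    #     True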
    def get_kernel_matrix(self, feature_space, reference_features, purpose):
        """Get kernel matrix delayed computations

        Parameters
        ----------
        feature_space : dict, list
            Dictionary with hash and features, or a list.
        reference_features : dict, list
            Reference feature space.
        purpose : str
            Purpose of this kernel matrix. Accepted arguments are
            'training', and 'inference'.

        Returns
        -------
        kernel_matrix : list
            List with kernel matrix values.

        Notes
        -----
        This class method expects feature_space and reference_features to
        be OrderedDict instances, but it turns out that when computing
        variances, feature_space might also be a list.
        """
        call = {"exponential": exponential, "laplacian": laplacian, "rbf": rbf}

        initial_time = time.time()

        if isinstance(reference_features, dict):
            # This is the case when the reference_features are a dictionary,
            # too. If that's true, we have to convert it to a list.
            reference_features = list(reference_features.values())[0]

        chunks = list(get_chunks(feature_space, self.batch_size))

        logger.info(
            "   The calculations are distributed in {} batches of {} "
            "atoms.".format(len(chunks), self.batch_size)
        )

        counter = 0
        kernel_matrix = []

        for c, chunk in enumerate(chunks):
            chunk_initial_time = time.time()
            logger.info("   Computing kernel functions for chunk {}...".format(c))

            intermediates = []

            if isinstance(feature_space, dict) and isinstance(
                reference_features, list
            ):
                if not isinstance(chunk, dict):
                    chunk = OrderedDict(chunk)

                reference_length = len(reference_features)

                for hash, _feature_space in chunk.items():
                    f_map = []

                    for i_symbol, i_afp in _feature_space:
                        i_symbol = decode(i_symbol)
                        f_map.append(1)

                        if purpose == "training":
                            for j in range(counter, reference_length):
                                j_symbol, j_afp = reference_features[j]
                                kernel = call[self.kernel](
                                    i_afp, j_afp, i_symbol, j_symbol, self.sigma
                                )
                                intermediates.append(kernel)
                            counter += 1
                        else:
                            for j_symbol, j_afp in reference_features:
                                j_symbol = decode(j_symbol)
                                kernel = call[self.kernel](
                                    i_afp, j_afp, i_symbol, j_symbol, self.sigma
                                )
                                intermediates.append(kernel)

                    self.fingerprint_map.append(f_map)

            elif isinstance(feature_space, list) and isinstance(
                reference_features, list
            ):
                for i_symbol, i_afp in chunk:
                    for j_symbol, j_afp in reference_features:
                        i_symbol = decode(i_symbol)
                        j_symbol = decode(j_symbol)
                        kernel = call[self.kernel](
                            i_afp, j_afp, i_symbol, j_symbol, self.sigma
                        )
                        intermediates.append(kernel)

            # Compute the delayed kernel functions accumulated above.
            kernel_matrix += dask.compute(intermediates, scheduler=self.scheduler)[0]
            del intermediates

            chunk_final_time = time.time() - chunk_initial_time
            h, m, s = convert_elapsed_time(chunk_final_time)

            logger.info(
                "   ...finished in {} hours {} minutes {:.2f} "
                "seconds.".format(h, m, s)
            )

        del reference_features

        build_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(build_time)
        logger.info(
            "Kernel matrix built in {} hours {} minutes {:.2f} "
            "seconds.".format(h, m, s)
        )

        """
        LT Vectors
        """
        # We build the LT matrix needed for ADA.
        if purpose == "training":
            self.LT = []
            logger.info("Building LT matrix")
            computations = []
            for index, _ in enumerate(feature_space.items()):
                computations.append(self.get_lt(index))

            computations = list(get_chunks(computations, self.batch_size))

            logger.info(
                "   The calculations are distributed in {} batches of {} "
                "molecules.".format(len(computations), self.batch_size)
            )

            for chunk in computations:
                self.LT += dask.compute(*chunk, scheduler=self.scheduler)

            self.LT = np.array(self.LT)
            del computations
            del chunk

            lt_time = time.time() - initial_time
            h, m, s = convert_elapsed_time(lt_time)
            logger.info(
                "LT matrix built in {} hours {} minutes {:.2f} "
                "seconds.".format(h, m, s)
            )

        return kernel_matrix
    def train(self, inputs, targets, data=None):
        """Train the model

        Parameters
        ----------
        inputs : dict
            Dictionary with hashed feature space.
        targets : list
            The expected values that the model has to learn aka y.
        data : object
            Data object created from the handler.
        """
        if isinstance(self.lamda, dict):
            lamda = self.lamda["energy"]
        else:
            lamda = self.lamda

        size = len(targets)
        I_e = np.identity(size)

        # Apply the atomic decomposition Ansatz: project the atomic kernel
        # onto per-molecule quantities.
        K = self.LT.dot(self.K).dot(self.LT.T)
        del self.LT
        logger.info("Size of the Kernel matrix is {}.".format(K.shape))
        logger.info("Starting Cholesky Factorization...")
        cholesky_U = cholesky(K + (lamda * I_e))
        logger.info("Cholesky Factorization finished...")
        betas = np.linalg.solve(cholesky_U.T, targets)
        _weights = np.linalg.solve(cholesky_U, betas)
        _weights = [
            w * g
            for index, w in enumerate(_weights)
            for g in self.fingerprint_map[index]
        ]
        self.weights["energy"] = _weights
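    # The two triangular solves in train() implement the closed form
    # w = (K' + lamda * I)^(-1) y of kernel ridge regression, where
    # K' = LT K LT^T, using the upper Cholesky factor U with
    # K' + lamda * I = U^T U. A minimal sketch on synthetic data:
    #
    #     >>> import numpy as np
    #     >>> from scipy.linalg import cholesky
    #     >>> A = np.array([[2.0, 0.5], [0.5, 2.0]]) + 1e-5 * np.identity(2)
    #     >>> y = np.array([1.0, -1.0])
    #     >>> U = cholesky(A)  # upper triangular, A = U.T @ U
    #     >>> w = np.linalg.solve(U, np.linalg.solve(U.T, y))
    #     >>> np.allclose(w, np.linalg.solve(A, y))
    #     True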
    def get_potential_energy(self, features, reference_space, purpose):
        """Get potential energy with Kernel Ridge

        Parameters
        ----------
        features : dict
            Dictionary with hash and features.
        reference_space : array
            Array with reference feature space.
        purpose : str
            Purpose of this function: 'training', 'inference'.

        Returns
        -------
        energy : float
            Energy of a molecule.
        """
        client = dask.distributed.get_client()

        try:
            reference_space = reference_space[b"reference_space"]
        except KeyError:
            reference_space = reference_space["reference_space"]

        futures = self.get_kernel_matrix(features, reference_space, purpose=purpose)
        kernel = np.array(client.gather(futures))
        weights = np.array(self.weights["energy"])
        dim = int(kernel.shape[0] / weights.shape[0])
        kernel = kernel.reshape(dim, len(weights))
        energy_per_atom = np.dot(kernel, weights)
        energy = np.sum(energy_per_atom)
        return energy
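    # The reshape in get_potential_energy assumes the flat kernel vector is
    # ordered atom-major: for each atom of the molecule, all kernels against
    # the reference space. Schematically (numbers are made up):
    #
    #     >>> kernel = np.arange(6.0)   # 2 atoms x 3 reference points
    #     >>> weights = np.ones(3)
    #     >>> kernel.reshape(2, 3).dot(weights)
    #     array([ 3., 12.])
    #     >>> float(np.sum(kernel.reshape(2, 3).dot(weights)))  # total energy
    #     15.0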
    @dask.delayed
    def get_lt(self, index):
        """Return LT vectors

        Parameters
        ----------
        index : int
            Index of image.

        Returns
        -------
        _LT : list
            Returns a list that maps atomic features in the images.
        """
        _LT = []

        for i, group in enumerate(self.fingerprint_map):
            if i == index:
                for _ in group:
                    _LT.append(1.0)
            else:
                for _ in group:
                    _LT.append(0.0)

        return _LT
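    # For intuition: with fingerprint_map = [[1, 1], [1, 1, 1]] (a 2-atom
    # molecule followed by a 3-atom molecule), get_lt(0) evaluates to
    # [1., 1., 0., 0., 0.]. Stacking these vectors for all images gives an
    # LT matrix whose product with per-atom quantities sums them into
    # per-molecule totals, which is the atomic decomposition Ansatz.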
    def get_sigma(self, sigma, forcetraining=False):
        """Function to build sigma

        Parameters
        ----------
        sigma : float, list or dict.
            This is user's raw input for sigma.
        forcetraining : bool
            Whether or not force training is set to true.

        Returns
        -------
        _sigma : dict
            Universal sigma dictionary for KernelRidge.
        """
        _sigma = {}

        if isinstance(sigma, (int, float)):
            # NOTE: `unique_element_symbols` is not defined in this scope; it
            # is expected to come from the data handler (e.g.
            # data.unique_element_symbols[purpose]).
            for symbol in unique_element_symbols:
                _sigma[symbol] = sigma

        return _sigma
""" Auxiliary functions """ @dask.delayed def linear(feature_i, feature_j, i_symbol=None, j_symbol=None): """ Compute a linear kernel Parameters ---------- feature_i : np.array Atomic fingerprint for central atom. feature_j : np.array Atomic fingerprint for j atom. i_symbol : str Chemical symbol for central atom. j_symbol : str Chemical symbol for j atom. Returns ------- linear :float Linear kernel. """ if i_symbol != j_symbol: return 0.0 else: linear = np.dot(feature_i, feature_j) return linear @dask.delayed def rbf(feature_i, feature_j, i_symbol=None, j_symbol=None, sigma=1.0): """ Compute the rbf (AKA Gaussian) kernel. Parameters ---------- feature_i : np.array Atomic fingerprint for central atom. feature_j : np.array Atomic fingerprint for j atom. i_symbol : str Chemical symbol for central atom. j_symbol : str Chemical symbol for j atom. sigma : float, or list. Gaussian width. If passed as a list or np.darray, kernel can become anisotropic. Returns ------- rbf :float RBF kernel. """ if i_symbol != j_symbol: return 0.0 else: if isinstance(sigma, list) or isinstance(sigma, np.ndarray): assert len(sigma) == len(feature_i) and len(sigma) == len(feature_j), ( "Length of sigma does not " "match atomic fingerprint " "length." ) sigma = np.array(sigma) anisotropic_rbf = np.exp( -( np.sum( np.divide( np.square(np.subtract(feature_i, feature_j)), (2.0 * np.square(sigma)), ) ) ) ) return anisotropic_rbf else: rbf = np.exp( -(np.linalg.norm(feature_i - feature_j) ** 2.0) / (2.0 * sigma ** 2.0) ) return rbf @dask.delayed def exponential(feature_i, feature_j, i_symbol=None, j_symbol=None, sigma=1.0): """ Compute the exponential kernel Parameters ---------- feature_i : np.array Atomic fingerprint for central atom. feature_j : np.array Atomic fingerprint for j atom. i_symbol : str Chemical symbol for central atom. j_symbol : str Chemical symbol for j atom. sigma : float, or list. Gaussian width. Returns ------- exponential : float Exponential kernel. """ if i_symbol != j_symbol: return 0.0 else: if isinstance(sigma, list) or isinstance(sigma, np.ndarray): assert len(sigma) == len(feature_i) and len(sigma) == len(feature_j), ( "Length of sigma does not " "match atomic fingerprint " "length." ) sigma = np.array(sigma) anisotropic_exp = np.exp( -( np.sqrt( np.sum( np.square( np.divide( np.subtract(feature_i, feature_j), (2.0 * np.square(sigma)), ) ) ) ) ) ) return anisotropic_exp else: exponential = np.exp( -(np.linalg.norm(feature_i - feature_j)) / (2.0 * sigma ** 2) ) return exponential @dask.delayed def laplacian(feature_i, feature_j, i_symbol=None, j_symbol=None, sigma=1.0): """ Compute the laplacian kernel Parameters ---------- feature_i : np.array Atomic fingerprint for central atom. feature_j : np.array Atomic fingerprint for j atom. i_symbol : str Chemical symbol for central atom. j_symbol : str Chemical symbol for j atom. sigma : float Gaussian width. Returns ------- laplacian : float Laplacian kernel. """ if i_symbol != j_symbol: return 0.0 else: if isinstance(sigma, list) or isinstance(sigma, np.ndarray): assert len(sigma) == len(feature_i) and len(sigma) == len(feature_j), ( "Length of sigma does not " "match atomic fingerprint " "length." ) sigma = np.array(sigma) sum_ij = np.sum( np.square(np.divide(np.subtract(feature_i, feature_j), sigma)) ) anisotropic_lap = np.exp(-(np.sqrt(sum_ij))) return anisotropic_lap else: laplacian = np.exp(-(np.linalg.norm(feature_i - feature_j)) / sigma) return laplacian
def decode(symbol):
    """Decode from binary to string

    Parameters
    ----------
    symbol : binary
        A string in binary form, e.g. b'hola'.

    Returns
    -------
    str
        Symbol as a string.
    """
    try:
        return symbol.decode()
    except AttributeError:
        return symbol