Source code for ml4chem.atomistic.potentials

import ase
import codecs
import copy
import json
import logging
import os
import torch
from ase.calculators.calculator import Calculator
from ml4chem.backends.available import available_backends
from ml4chem.data.handler import Data
from ml4chem.data.serialization import dump, load
from ml4chem.utils import get_header_message, dynamic_import


logger = logging.getLogger()


class Potentials(Calculator, object):
    """Atomistic Machine Learning Potentials

    This class is highly inspired by the Atomistic Machine-Learning package
    (Amp).

    Parameters
    ----------
    features : object
        Atomic feature vectors (local chemical environments) from any of the
        modules in the features package.
    model : object
        Machine learning algorithm to build a model.
    path : str
        Path to save files.
    label : str
        Name of files. Default ml4chem.
    preprocessor : str
        Path to load sklearn preprocessor object. Useful when doing inference.
    batch_size : int
        Number of data points per batch to use for training. Default is None.
    """

    # This is needed by ASE
    implemented_properties = ["energy", "forces"]

    # This is a good way to make attributes available to the class. They can
    # be accessed as Potentials.attribute
    svm_models = ["KernelRidge", "GaussianProcess"]
    autoencoders = ["AutoEncoder", "VAE"]

    module_names = {
        "PytorchPotentials": "neuralnetwork",
        "PytorchIonicPotentials": "ionic",
        "RetentionTimes": "rt",
        "KernelRidge": "kernelridge",
        "GaussianProcess": "gaussian_process",
        "VAE": "autoencoders",
    }

    def __init__(
        self,
        features=None,
        model=None,
        path=None,
        label="ml4chem",
        atoms=None,
        ml4chem_path=None,
        preprocessor=None,
        batch_size=None,
    ):
        Calculator.__init__(self, label=label, atoms=atoms)
        self.features = features
        self.available_backends = available_backends()
        self.path = path
        self.label = label
        self.model = model
        self.ml4chem_path = ml4chem_path
        self.preprocessor = preprocessor
        self.batch_size = batch_size

        logger.info(get_header_message())

        self.reference_space = None
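    # A minimal usage sketch (not part of the original module): building a
    # calculator from a features object and a model. ``Gaussian`` and
    # ``NeuralNetwork`` are assumed to be importable from ml4chem's features
    # and models packages; the constructor arguments shown are illustrative.
    #
    #     from ml4chem.atomistic.features.gaussian import Gaussian
    #     from ml4chem.atomistic.models.neuralnetwork import NeuralNetwork
    #
    #     calc = Potentials(
    #         features=Gaussian(cutoff=6.5, normalized=True),
    #         model=NeuralNetwork(hiddenlayers=(10, 10), activation="relu"),
    #         label="cu_training",
    #     )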
    @classmethod
    def load(Cls, model=None, params=None, preprocessor=None, **kwargs):
        """Load ML4Chem models

        Parameters
        ----------
        model : str
            The path to the .ml4c file from which to load the model for
            inference.
        params : str
            The path to the .params file with users' inputs.
        preprocessor : str
            The path to the file with the sklearn preprocessor object.
        """
        kwargs["ml4chem_path"] = model
        kwargs["preprocessor"] = preprocessor

        with open(params, "rb") as ml4chem_params:
            ml4chem_params = json.load(ml4chem_params)

        model_type = ml4chem_params["model"].get("type")

        model_params = ml4chem_params["model"]
        class_name = model_params["class_name"]
        module_name = Potentials.module_names[model_params["name"]]

        model_class = dynamic_import(
            class_name, "ml4chem.atomistic.models", alt_name=module_name
        )

        delete = ["name", "type", "class_name"]
        for param in delete:
            # Delete unneeded (key, value) pairs.
            del model_params[param]

        if model_type == "svm":
            weights = load(model)
            # TODO remove after de/serialization is fixed.
            try:
                weights = {
                    key.decode("utf-8"): value for key, value in weights.items()
                }
            except AttributeError:
                weights = {key: value for key, value in weights.items()}

            model_params.update({"weights": weights})
            model = model_class(**model_params)
        else:
            # Instantiate the model class
            model = model_class(**model_params)

        # Instantiation of the fingerprint class
        fingerprint_params = ml4chem_params.get("features", None)

        if fingerprint_params is None:
            features = None
        else:
            if "kwargs" in fingerprint_params.keys():
                update_dict_with = fingerprint_params.pop("kwargs")
                fingerprint_params.update(update_dict_with)

            name = fingerprint_params.get("name")
            del fingerprint_params["name"]

            features = dynamic_import(name, "ml4chem.atomistic.features")
            features = features(**fingerprint_params)

        calc = Cls(features=features, model=model, **kwargs)

        return calc
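    # Illustrative sketch of restoring a trained model for inference; the
    # file names below are hypothetical placeholders for the artifacts that
    # ``Potentials.save`` writes to disk.
    #
    #     calc = Potentials.load(
    #         model="cu_training.ml4c",
    #         params="cu_training.params",
    #         preprocessor="cu_training.scaler",
    #     )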
    @staticmethod
    def save(model=None, features=None, path=None, label="ml4chem"):
        """Save a model

        Parameters
        ----------
        model : obj
            The model to be saved.
        features : obj
            Features object.
        path : str
            The path where to save the model.
        label : str
            Name of files. Default ml4chem.
        """

        if path is None:
            path = "."

        if not os.path.isdir(path):
            os.makedirs(path)

        if path[-1] == "/":
            path += label
        else:
            path = path + "/" + label

        if model is not None:
            model_name = model.name()

            if model_name in Potentials.svm_models:
                params = {"model": model.params}

                # Save model weights to file
                dump(model.weights, path + ".ml4c")
            else:
                # FIXME a global class to save params?
                params = {
                    "model": {
                        "name": model_name,
                        "class_name": model.__class__.__name__,
                        "hiddenlayers": model.hiddenlayers,
                        "activation": model.activation,
                        "type": "nn",
                        "input_dimension": model.input_dimension,
                    }
                }

                torch.save(model.state_dict(), path + ".ml4c")

                if model_name in Potentials.autoencoders:
                    params["model"].update(
                        {
                            "output_dimension": model.output_dimension,
                            "variant": model.variant,
                            "one_for_all": model.one_for_all,
                        }
                    )
        else:
            params = {}

        if features is not None:
            # Adding features to the .params json file.
            params.update({"features": features.params})

        # Save parameters to file
        with open(path + ".params", "wb") as json_file:
            json.dump(
                params,
                codecs.getwriter("utf-8")(json_file),
                ensure_ascii=False,
                indent=4,
            )
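    # Sketch of saving a trained calculator by hand; note that ``train``
    # below already calls this method. The path and label are illustrative.
    #
    #     Potentials.save(
    #         model=calc.model,
    #         features=calc.features,
    #         path=".",
    #         label="cu_training",
    #     )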
    def train(
        self,
        training_set,
        epochs=100,
        lr=0.001,
        convergence=None,
        device="cpu",
        optimizer=(None, None),
        lossfxn=None,
        regularization=0.0,
        batch_size=None,
        **kwargs
    ):
        """Method to train models

        Parameters
        ----------
        training_set : object, list
            List containing the training set.
        epochs : int
            Number of full training cycles.
        lr : float
            Learning rate.
        convergence : dict
            Instead of using epochs, users can set a convergence criterion.
        device : str
            Calculation can be run on the cpu or cuda (gpu).
        optimizer : tuple
            The optimizer is a tuple with the structure:

            >>> ('adam', {'lr': float, 'weight_decay': float})

        lossfxn : object
            A loss function object.
        regularization : float
            This is the L2 regularization. It is not the same as weight decay.
        batch_size : int
            Number of data points per batch to use for training. Default is
            None.
        """

        purpose = "training"
        # Raw input and targets aka X, y
        data_handler = Data(training_set, purpose=purpose)
        training_set, targets = data_handler.get_data(purpose=purpose)

        # Now let's featurize
        # SVM models
        if self.model.name() in Potentials.svm_models:
            # Mapping raw positions into a feature space aka X
            feature_space, reference_features = self.features.calculate(
                training_set, data=data_handler, purpose=purpose, svm=True
            )

            self.model.prepare_model(
                feature_space, reference_features, data=data_handler
            )

            self.model.train(feature_space, targets)
        else:
            # Mapping raw positions into a feature space aka X
            feature_space = self.features.calculate(
                training_set, data=data_handler, purpose=purpose, svm=False
            )

            # Fixed fingerprint dimension
            input_dimension = len(list(feature_space.values())[0][0][-1])
            self.model.prepare_model(input_dimension, data=data_handler)

            # CUDA stuff
            if device == "cuda":
                logger.info("Checking if CUDA is available...")
                use_cuda = torch.cuda.is_available()

                if use_cuda:
                    count = torch.cuda.device_count()
                    logger.info(
                        "ML4Chem found {} CUDA devices available.".format(count)
                    )

                    for index in range(count):
                        device_name = torch.cuda.get_device_name(index)

                        if index == 0:
                            device_name += " (Default)"

                        logger.info("    - {}.".format(device_name))
                else:
                    logger.warning("No CUDA available. We will use CPU.")
                    device = "cpu"

            device_ = torch.device(device)

            self.model.to(device_)

            # This is something specific to pytorch.
            module = Potentials.module_names[self.model.name()]
            train = dynamic_import(
                "train", "ml4chem.atomistic.models", alt_name=module
            )

            # Let's train
            train(
                feature_space,
                targets,
                model=self.model,
                data=data_handler,
                optimizer=optimizer,
                regularization=regularization,
                epochs=epochs,
                convergence=convergence,
                lossfxn=lossfxn,
                device=device,
                batch_size=batch_size,
                **kwargs
            )

        self.save(self.model, features=self.features, path=self.path, label=self.label)
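    # Sketch of a training call under assumed inputs: ``images`` stands in
    # for a list of ASE Atoms objects, and the convergence dict shown is an
    # illustrative energy criterion, not a required schema.
    #
    #     from ase.io import read
    #
    #     images = read("cu_training.traj", index=":")
    #     calc.train(
    #         training_set=images,
    #         epochs=100,
    #         optimizer=("adam", {"lr": 1e-3, "weight_decay": 0.0}),
    #         convergence={"energy": 5e-3},
    #         device="cpu",
    #     )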
    def calculate(self, atoms, properties, system_changes):
        """Calculate things

        Parameters
        ----------
        atoms : object, list
            List of images in ASE format.
        properties : list
            List of properties to compute, e.g. ["energy"].
        system_changes : list
            Changes in the system since the last calculation (ASE
            bookkeeping).
        """
        purpose = "inference"
        # Calculator.calculate(self, atoms, properties, system_changes)
        model_name = self.model.name()

        # We convert the atoms into atomic features
        if isinstance(atoms, ase.atoms.Atoms):
            atoms = [atoms]

        data_handler = Data(atoms, purpose=purpose)
        atoms = data_handler.get_data(purpose=purpose)

        # We copy the loaded fingerprint class
        features = copy.deepcopy(self.features)
        kwargs = {"data": data_handler, "purpose": purpose}

        if model_name in Potentials.svm_models:
            kwargs.update({"svm": True})

        if features.name() == "LatentFeatures":
            features = features.calculate(atoms, **kwargs)
        else:
            features.batch_size = self.batch_size
            features.preprocessor = self.preprocessor
            features = features.calculate(atoms, **kwargs)

        if "energy" in properties:
            logger.info("Computing energy...")

            if model_name in Potentials.svm_models:
                try:
                    reference_space = load(self.reference_space)
                except Exception:
                    raise ValueError("This is not a database...")

                self.model.prepare_model(
                    None, None, data=data_handler, purpose=purpose
                )

                energy = self.model.get_potential_energy(
                    features, reference_space, purpose=purpose
                )
            else:
                input_dimension = len(list(features.values())[0][0][-1])
                model = copy.deepcopy(self.model)
                model.prepare_model(
                    input_dimension, data=data_handler, purpose=purpose
                )

                try:
                    model.load_state_dict(
                        torch.load(self.ml4chem_path), strict=True
                    )
                except RuntimeError:
                    logger.warning(
                        "Your image does not have some atoms present in the "
                        "loaded model.\n"
                    )
                    model.load_state_dict(
                        torch.load(self.ml4chem_path), strict=False
                    )

                model.eval()

                try:
                    # A single-point energy calculation
                    energy = model(features).item()
                except ValueError:
                    # A list of single-point energy calculations.
                    energy = model(features).tolist()

            # Populate ASE's self.results dict
            self.results["energy"] = energy
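# Because Potentials subclasses ase.calculators.calculator.Calculator, the
# ``calculate`` method above is driven by ASE's usual property interface
# rather than called directly. A minimal inference sketch, assuming a trained
# model on disk (file names are placeholders):
#
#     from ase.build import molecule
#
#     atoms = molecule("H2O")
#     atoms.calc = Potentials.load(
#         model="cu_training.ml4c", params="cu_training.params"
#     )
#     energy = atoms.get_potential_energy()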