import ase
import codecs
import copy
import json
import logging
import os
import torch
from ase.calculators.calculator import Calculator
from ml4chem.backends.available import available_backends
from ml4chem.data.handler import Data
from ml4chem.data.serialization import dump, load
from ml4chem.utils import get_header_message, dynamic_import
logger = logging.getLogger()


class Potentials(Calculator, object):
"""Atomistic Machine Learning Potentials
This class is highly inspired by the Atomistic Machine-Learning package
(Amp).
Parameters
----------
features : object
Atomic feature vectors (local chemical environments) from any of the
features module.
model : object
Machine learning algorithm to build a model.
path : str
Path to save files.
label : str
Name of files. Default ml4chem.
preprocessor : str
Path to load sklearn preprocessor object. Useful when doing inference.
batch_size : int
Number of data points per batch to use for training. Default is None.
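
    Examples
    --------
    A minimal sketch of the intended workflow; ``features``, ``model``, and
    ``images`` are placeholders for a configured features class, a model
    object, and a list of ASE Atoms, respectively:

    >>> calc = Potentials(features=features, model=model, label="ml4chem")
    >>> calc.train(training_set=images, epochs=100)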
"""
# This is needed by ASE
implemented_properties = ["energy", "forces"]
    # Class-level attributes; they can be accessed as Potentials.attribute.
svm_models = ["KernelRidge", "GaussianProcess"]
autoencoders = ["AutoEncoder", "VAE"]
module_names = {
"PytorchPotentials": "neuralnetwork",
"PytorchIonicPotentials": "ionic",
"RetentionTimes": "rt",
"KernelRidge": "kernelridge",
"GaussianProcess": "gaussian_process",
"VAE": "autoencoders",
}

    def __init__(
self,
features=None,
model=None,
path=None,
label="ml4chem",
atoms=None,
ml4chem_path=None,
preprocessor=None,
batch_size=None,
):
Calculator.__init__(self, label=label, atoms=atoms)
self.features = features
self.available_backends = available_backends()
self.path = path
self.label = label
self.model = model
self.ml4chem_path = ml4chem_path
self.preprocessor = preprocessor
self.batch_size = batch_size
logger.info(get_header_message())
self.reference_space = None

    @classmethod
    def load(cls, model=None, params=None, preprocessor=None, **kwargs):
"""Load ML4Chem models
Parameters
----------
model : str
The path to load the model from the .ml4c file for inference.
params : srt
The path to load .params file with users' inputs.
preprocessor : str
The path to load the file with the sklearn preprocessor object.
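
        Examples
        --------
        A minimal inference sketch; the file names below are placeholders for
        files previously written by Potentials.save():

        >>> calc = Potentials.load(
        ...     model="ml4chem.ml4c",
        ...     params="ml4chem.params",
        ...     preprocessor="model.scaler",
        ... )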
"""
kwargs["ml4chem_path"] = model
kwargs["preprocessor"] = preprocessor
        with open(params, "rb") as json_file:
            ml4chem_params = json.load(json_file)
model_type = ml4chem_params["model"].get("type")
model_params = ml4chem_params["model"]
class_name = model_params["class_name"]
module_name = Potentials.module_names[model_params["name"]]
model_class = dynamic_import(
class_name, "ml4chem.atomistic.models", alt_name=module_name
)
delete = ["name", "type", "class_name"]
for param in delete:
# delete unneeded (key, value) pairs.
del model_params[param]
if model_type == "svm":
weights = load(model)
# TODO remove after de/serialization is fixed.
try:
weights = {
key.decode("utf-8"): value for key, value in weights.items()
}
except AttributeError:
weights = {key: value for key, value in weights.items()}
model_params.update({"weights": weights})
model = model_class(**model_params)
else:
# Instantiate the model class
model = model_class(**model_params)
# Instantiation of fingerprint class
fingerprint_params = ml4chem_params.get("features", None)
        if fingerprint_params is None:
            features = None
        else:
            if "kwargs" in fingerprint_params.keys():
                update_dict_with = fingerprint_params.pop("kwargs")
                fingerprint_params.update(update_dict_with)

            name = fingerprint_params.pop("name")
            features = dynamic_import(name, "ml4chem.atomistic.features")
            features = features(**fingerprint_params)
        calc = cls(features=features, model=model, **kwargs)
return calc

    @staticmethod
def save(model=None, features=None, path=None, label="ml4chem"):
"""Save a model
Parameters
----------
model : obj
The model to be saved.
features : obj
Features object.
        path : str
            Path to the directory where the files are saved.
label : str
Name of files. Default ml4chem.
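
        Examples
        --------
        A minimal sketch; ``model`` and ``features`` are assumed to be trained
        or configured ml4chem objects:

        >>> Potentials.save(model, features=features, path="outputs", label="ml4chem")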
"""
        if path is None:
            path = "."

        if not os.path.isdir(path):
            os.makedirs(path)

        path = os.path.join(path, label)
if model is not None:
model_name = model.name()
if model_name in Potentials.svm_models:
params = {"model": model.params}
# Save model weights to file
dump(model.weights, path + ".ml4c")
else:
# FIXME a global class to save params?
params = {
"model": {
"name": model_name,
"class_name": model.__class__.__name__,
"hiddenlayers": model.hiddenlayers,
"activation": model.activation,
"type": "nn",
"input_dimension": model.input_dimension,
}
}
torch.save(model.state_dict(), path + ".ml4c")
if model_name in Potentials.autoencoders:
output_dimension = {"output_dimension": model.output_dimension}
params["model"].update(output_dimension)
variant = {"variant": model.variant}
params["model"].update(variant)
one_for_all = {"one_for_all": model.one_for_all}
params["model"].update(one_for_all)
else:
params = {}
if features is not None:
# Adding features to .params json file.
features = {"features": features.params}
params.update(features)
# Save parameters to file
with open(path + ".params", "wb") as json_file:
json.dump(
params,
codecs.getwriter("utf-8")(json_file),
ensure_ascii=False,
indent=4,
)

    def train(
self,
training_set,
epochs=100,
lr=0.001,
convergence=None,
device="cpu",
optimizer=(None, None),
lossfxn=None,
regularization=0.0,
batch_size=None,
**kwargs
):
"""Method to train models
Parameters
----------
training_set : object, list
List containing the training set.
epochs : int
Number of full training cycles.
lr : float
Learning rate.
convergence : dict
Instead of using epochs, users can set a convergence criterion.
        device : str
            Whether the calculation runs on "cpu" or "cuda" (GPU).
        optimizer : tuple
            The optimizer is a tuple with the structure:

            >>> ('adam', {'lr': float, 'weight_decay': float})

lossfxn : object
A loss function object.
regularization : float
This is the L2 regularization. It is not the same as weight decay.
batch_size : int
Number of data points per batch to use for training. Default is
None.
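
        Examples
        --------
        A minimal sketch; ``images`` is assumed to be a list of ASE Atoms
        objects, and the convergence dictionary shown is an assumed example:

        >>> calc.train(
        ...     training_set=images,
        ...     optimizer=("adam", {"lr": 1e-3, "weight_decay": 0.0}),
        ...     convergence={"energy": 5e-3},
        ... )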
"""
purpose = "training"
# Raw input and targets aka X, y
data_handler = Data(training_set, purpose=purpose)
training_set, targets = data_handler.get_data(purpose=purpose)
# Now let's featurize
# SVM models
if self.model.name() in Potentials.svm_models:
# Mapping raw positions into a feature space aka X
feature_space, reference_features = self.features.calculate(
training_set, data=data_handler, purpose=purpose, svm=True
)
self.model.prepare_model(
feature_space, reference_features, data=data_handler
)
self.model.train(feature_space, targets)
else:
# Mapping raw positions into a feature space aka X
feature_space = self.features.calculate(
training_set, data=data_handler, purpose=purpose, svm=False
)
# Fixed fingerprint dimension
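            # feature_space maps each image hash to a list of
            # (symbol, vector) tuples; the last element of the first entry is
            # an atomic feature vector whose length fixes the input dimension.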
input_dimension = len(list(feature_space.values())[0][0][-1])
self.model.prepare_model(input_dimension, data=data_handler)
# CUDA stuff
if device == "cuda":
logger.info("Checking if CUDA is available...")
use_cuda = torch.cuda.is_available()
if use_cuda:
count = torch.cuda.device_count()
logger.info(
"ML4Chem found {} CUDA devices available.".format(count)
)
for index in range(count):
device_name = torch.cuda.get_device_name(index)
if index == 0:
device_name += " (Default)"
logger.info(" - {}.".format(device_name))
else:
logger.warning("No CUDA available. We will use CPU.")
device = "cpu"
device_ = torch.device(device)
self.model.to(device_)
            # This is specific to PyTorch.
module = Potentials.module_names[self.model.name()]
train = dynamic_import("train", "ml4chem.atomistic.models", alt_name=module)
# Let's train
train(
feature_space,
targets,
model=self.model,
data=data_handler,
optimizer=optimizer,
regularization=regularization,
epochs=epochs,
convergence=convergence,
lossfxn=lossfxn,
device=device,
batch_size=batch_size,
**kwargs
)
self.save(self.model, features=self.features, path=self.path, label=self.label)

    def calculate(self, atoms, properties, system_changes):
"""Calculate things
Parameters
----------
atoms : object, list
List if images in ASE format.
properties :
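
        Examples
        --------
        This method is normally invoked indirectly through ASE; ``calc`` is
        assumed to be a trained or loaded Potentials instance:

        >>> atoms.set_calculator(calc)
        >>> atoms.get_potential_energy()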
"""
purpose = "inference"
# Calculator.calculate(self, atoms, properties, system_changes)
model_name = self.model.name()
# We convert the atoms in atomic features
if isinstance(atoms, ase.atoms.Atoms):
atoms = [atoms]
data_handler = Data(atoms, purpose=purpose)
atoms = data_handler.get_data(purpose=purpose)
# We copy the loaded fingerprint class
features = copy.deepcopy(self.features)
kwargs = {"data": data_handler, "purpose": purpose}
if model_name in Potentials.svm_models:
kwargs.update({"svm": True})
if features.name() == "LatentFeatures":
features = features.calculate(atoms, **kwargs)
else:
features.batch_size = self.batch_size
features.preprocessor = self.preprocessor
features = features.calculate(atoms, **kwargs)
if "energy" in properties:
logger.info("Computing energy...")
if model_name in Potentials.svm_models:
                try:
                    reference_space = load(self.reference_space)
                except Exception:
                    raise RuntimeError("This is not a database...")
self.model.prepare_model(None, None, data=data_handler, purpose=purpose)
energy = self.model.get_potential_energy(
features, reference_space, purpose=purpose
)
else:
input_dimension = len(list(features.values())[0][0][-1])
model = copy.deepcopy(self.model)
model.prepare_model(input_dimension, data=data_handler, purpose=purpose)
try:
model.load_state_dict(torch.load(self.ml4chem_path), strict=True)
except RuntimeError:
logger.warning(
"Your image does not have some atoms present in the loaded model.\n"
)
model.load_state_dict(torch.load(self.ml4chem_path), strict=False)
model.eval()
try:
# A single-point energy calculation
energy = model(features).item()
except ValueError:
# A list of single-point energy calculations.
energy = model(features).tolist()
# Populate ASE's self.results dict
self.results["energy"] = energy