from collections import OrderedDict
from ml4chem.data.utils import ase_to_xyz
from ml4chem.utils import get_hash
import datetime
import logging
import pandas as pd
# Module-level logger shared by everything in this file. getLogger() is called
# without a name, so this is the root logger.
logger = logging.getLogger()
class Data(object):
    """Container that arranges images (and targets) for ML4Chem models.

    An adequate data structure is very important to develop machine-learning
    models. In general a model receives a data set (X) and a target vector
    (y). This class arranges the images in an ordered dictionary keyed by
    their hash so that they can be consumed by the rest of the package.

    Parameters
    ----------
    images : list or object
        List of images. Supported format is from ASE.
    purpose : str
        Is this data for training or inference purposes? Supported strings
        are: "training" and "inference".

    Attributes
    ----------
    images : OrderedDict or None
        Unique images keyed by hash; ``None`` until images are prepared.
    targets : list or None
        Potential energies, only filled when purpose is "training".
    unique_element_symbols : dict or None
        Result of the last call to :meth:`get_unique_element_symbols`.
    """

    def __init__(self, images, purpose=None):
        self.images = None
        self.targets = None
        self.unique_element_symbols = None

        logger.info("\nData")
        logger.info("====")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(now.strftime("%Y-%m-%d %H:%M:%S")))

        # Anything that is not already a dictionary is hashed and arranged
        # into one by prepare_images().
        # NOTE(review): when `images` is already a dict nothing is prepared
        # and self.images stays None -- confirm this is intended.
        if not self.is_valid_structure(images):
            logger.warning("Data structure is not compatible with ML4Chem.")
            self.prepare_images(images, purpose=purpose)

    def prepare_images(self, images, purpose=None):
        """Hash the images and arrange them to operate with ML4Chem.

        Duplicated images (same hash) are stored only once. For training,
        the potential energy of each unique image is collected in
        ``self.targets`` and the per-atom maximum and minimum energies are
        stored in ``self.max_energy`` and ``self.min_energy``.

        Parameters
        ----------
        images : list or object
            Iterable of images.
        purpose : str
            The purpose of the data so that structure is prepared accordingly.
            Supported are: 'training', 'inference'
        """
        logger.info("Preparing images for {}...".format(purpose))

        training = purpose == "training"
        self.images = OrderedDict()
        self.atoms_per_image = []

        if training:
            self.targets = []

        # Atom counts aligned index-by-index with self.targets. Indexing the
        # input `images` with a position taken from self.targets is wrong as
        # soon as a duplicate is skipped or an image has no energy.
        atoms_per_target = []
        duplicates = 0

        for image in images:
            key = get_hash(image)

            if key in self.images:
                duplicates += 1
                continue

            self.images[key] = image
            number_of_atoms = len(image)

            if training:
                # When purpose is training then you also need targets and
                # number of atoms in each image
                try:
                    energy = image.get_potential_energy()
                except RuntimeError:  # Atoms object has no calculator
                    pass
                else:
                    self.targets.append(energy)
                    atoms_per_target.append(number_of_atoms)

            self.atoms_per_image.append(number_of_atoms)

        if training and self.targets:
            max_energy = max(self.targets)
            min_energy = min(self.targets)
            max_index = self.targets.index(max_energy)
            min_index = self.targets.index(min_energy)

            # Energies are stored per atom of the image they belong to.
            self.max_energy = max_energy / atoms_per_target[max_index]
            self.min_energy = min_energy / atoms_per_target[min_index]

        if duplicates:
            logger.info("{} duplicated images were skipped.".format(duplicates))

        logger.info("Images hashed and processed...\n")

        if training:
            logger.info(
                "There are {} atoms in your data set.".format(
                    self.get_total_number_atoms()
                )
            )

    def is_valid_structure(self, images):
        """Check if the data has a valid structure

        Parameters
        ----------
        images : list of atoms
            List of images.

        Returns
        -------
        valid : bool
            Whether or not the structure is valid, i.e. `images` is a
            dictionary.
        """
        return isinstance(images, dict)

    def get_unique_element_symbols(self, images=None, purpose=None):
        """Unique element symbols in data set

        Parameters
        ----------
        images : list or dict of images.
            ASE objects. Defaults to the images stored in this object.
        purpose : str
            The supported categories are: 'training', 'inference'.

        Returns
        -------
        unique_element_symbols : dict or None
            Dictionary mapping `purpose` to the sorted list of unique element
            symbols, or None when the purpose is not supported. The same
            value is stored in ``self.unique_element_symbols``.
        """
        if images is None:
            images = self.images

        supported_categories = ["training", "inference"]

        # FIXME make this parallel.
        if purpose in supported_categories:
            # Dictionaries hold the images as values; anything else is
            # iterated directly.
            iterable = images.values() if isinstance(images, dict) else images
            symbols = {
                purpose: sorted({atom.symbol for image in iterable for atom in image})
            }
        else:
            logger.warning("The requested purpose is not supported...")
            symbols = None

        self.unique_element_symbols = symbols
        return self.unique_element_symbols

    def get_data(self, purpose=None):
        """A method to get data

        Parameters
        ----------
        purpose : str
            The purpose of the data so that structure is prepared accordingly.
            Supported are: 'training', 'inference'

        Returns
        -------
        self.images : dict
            Ordered dictionary of images corresponding to order of
            self.targets list.
        self.targets : list
            Targets used for training the model. Only returned when purpose
            is 'training'.
        """
        if purpose == "training":
            return self.images, self.targets

        return self.images

    def get_total_number_atoms(self):
        """Get the total number of atoms

        Returns
        -------
        int
            Sum of the number of atoms over all prepared images.
        """
        return sum(self.atoms_per_image)

    def to_pandas(self):
        """Convert data to pandas DataFrame

        Returns
        -------
        df : pandas.DataFrame
            DataFrame indexed by image hash with an "xyz" column produced by
            ``ase_to_xyz`` and an "energy" column set from ``self.targets``.
        """
        images = OrderedDict()
        columns = ["xyz"]

        for key, atoms in self.images.items():
            images[key] = ase_to_xyz(atoms, file=False)

        df = pd.DataFrame.from_dict(images, orient="index", columns=columns)
        df["energy"] = self.targets

        return df