# Source code for ml4chem.data.handler

from collections import OrderedDict
from ml4chem.data.utils import ase_to_xyz
from ml4chem.utils import get_hash
import datetime
import logging
import pandas as pd

# Logger shared by everything in this module. getLogger() is called without a
# name, so this is the root logger rather than a module-specific one.
logger = logging.getLogger()


class Data(object):
    """A Data class

    An adequate data structure is very important to develop machine-learning
    models. In general a model receives a data set (X) and a target vector (y).
    This class should in principle arrange this in a format that can be
    vectorized and operate not only with neural networks but also with support
    vector machines.

    The central object here is the data set.

    Parameters
    ----------
    images : list or object
        List of images. Supported format is from ASE. A dictionary is
        considered to be already prepared and is stored without changes.
    purpose : str
        Is this data for training or inference purpose?. Supported strings are:
        "training", and "inference".

    Attributes
    ----------
    images : dict
        Unique images. When built by :meth:`prepare_images` this is an
        ``OrderedDict`` keyed by the value ``get_hash`` returns for each image.
    targets : list or None
        Potential energies collected when the purpose is "training".
    atoms_per_image : list
        Number of atoms of each unique image (filled for "training" only).
    unique_element_symbols : dict or None
        Result of the last call to :meth:`get_unique_element_symbols`.
    max_energy, min_energy : float
        Largest and smallest target, each divided by the number of atoms of
        the image it belongs to. Only set when the purpose is "training" and
        at least one target was collected.
    """

    def __init__(self, images, purpose=None):

        self.images = None
        self.targets = None
        self.unique_element_symbols = None
        # Defined up front so get_total_number_atoms() does not fail when
        # prepare_images() is never called (dictionary input).
        self.atoms_per_image = []
        logger.info("\nData")
        logger.info("====")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(now.strftime("%Y-%m-%d %H:%M:%S")))

        if self.is_valid_structure(images):
            # Already in the expected layout: keep it instead of dropping it.
            self.images = images
        else:
            logger.warning("Data structure is not compatible with ML4Chem.")
            self.prepare_images(images, purpose=purpose)

    def prepare_images(self, images, purpose=None):
        """Function to prepare images to operate with ML4Chem

        Images are stored in ``self.images`` keyed by their hash; images whose
        hash was already seen are skipped. For the "training" purpose the
        potential energy and the number of atoms of each unique image are
        collected as well, and the per-atom maximum and minimum energies are
        stored in ``self.max_energy`` and ``self.min_energy``.

        Parameters
        ----------
        images : list or object
            List of images. Any iterable works; it is traversed only once.
        purpose : str
            The purpose of the data so that structure is prepared accordingly.
            Supported are: 'training', 'inference'

        """
        logger.info("Preparing images for {}...".format(purpose))
        self.images = OrderedDict()
        self.atoms_per_image = []

        training = purpose == "training"
        if training:
            self.targets = []

        # Number of atoms of every image that contributed a target. It is kept
        # parallel to self.targets, which may be shorter than the input when
        # there are duplicates or images without a calculator.
        target_sizes = []
        duplicates = 0
        missing_targets = 0

        for image in images:
            key = get_hash(image)
            if key in self.images:
                duplicates += 1
                continue

            self.images[key] = image
            if not training:
                continue

            # When purpose is training then you also need targets and
            # number of atoms in each image
            number_of_atoms = len(image)
            try:
                self.targets.append(image.get_potential_energy())
            except RuntimeError:  # Atoms object has no calculator
                missing_targets += 1
            else:
                target_sizes.append(number_of_atoms)
            self.atoms_per_image.append(number_of_atoms)

        if training and self.targets:
            max_energy = max(self.targets)
            min_energy = min(self.targets)
            # Look the sizes up in the list aligned with self.targets; the
            # position in the input no longer matches once an image is skipped.
            max_size = target_sizes[self.targets.index(max_energy)]
            min_size = target_sizes[self.targets.index(min_energy)]

            self.max_energy = max_energy / max_size
            self.min_energy = min_energy / min_size

        if duplicates:
            logger.info("{} duplicated images were skipped.".format(duplicates))
        if missing_targets:
            logger.warning(
                "{} images have no potential energy and provide no target.".format(
                    missing_targets
                )
            )
        logger.info("Images hashed and processed...\n")

        if training:
            logger.info(
                "There are {} atoms in your data set.".format(
                    self.get_total_number_atoms()
                )
            )

    def is_valid_structure(self, images):
        """Check if the data has a valid structure

        Parameters
        ----------
        images : list of atoms
            List of images.

        Returns
        -------
        valid : bool
            Whether or not the structure is valid. Only dictionaries
            (including ``OrderedDict``) are considered valid.
        """
        return isinstance(images, dict)

    def get_unique_element_symbols(self, images=None, purpose=None):
        """Unique element symbol in data set

        Parameters
        ----------
        images : list of images.
            ASE object. Either an iterable of images or a dictionary whose
            values are images. Defaults to ``self.images``.
        purpose : str
            The supported categories are: 'training', 'inference'.

        Returns
        -------
        unique_element_symbols : dict or None
            Dictionary with ``purpose`` as the only key and the sorted list of
            unique symbols as value, or None when the purpose is not
            supported. The same object is stored in
            ``self.unique_element_symbols``.
        """

        if images is None:
            images = self.images

        supported_categories = ["training", "inference"]

        # FIXME make this parallel.
        if purpose in supported_categories:
            # Dictionaries hold the images as values; anything else is
            # iterated directly. Checking this explicitly avoids relying on an
            # AttributeError raised while iterating over the keys.
            atoms_objects = images.values() if hasattr(images, "items") else images
            symbols = {
                purpose: sorted(
                    {atom.symbol for image in atoms_objects for atom in image}
                )
            }
        else:
            logger.warning("The requested purpose is not supported...")
            symbols = None

        self.unique_element_symbols = symbols

        return self.unique_element_symbols

    def get_data(self, purpose=None):
        """A method to get data

        Parameters
        ----------
        purpose : str
            The purpose of the data so that structure is prepared accordingly.
            Supported are: 'training', 'inference'

        Returns
        -------
        self.images : dict
            Ordered dictionary of images corresponding to order of self.targets
            list.
        self.targets : list
            Targets used for training the model. Only returned, as the second
            element of a tuple, when ``purpose`` is "training".
        """

        if purpose == "training":
            return self.images, self.targets
        return self.images

    def get_total_number_atoms(self):
        """Get the total number of atoms"""
        return sum(self.atoms_per_image)

    def to_pandas(self):
        """Convert data to pandas DataFrame

        Returns
        -------
        df : pandas.DataFrame
            One row per entry of ``self.images`` indexed by its key, with an
            "xyz" column holding the output of ``ase_to_xyz`` and an "energy"
            column assigned from ``self.targets``.
        """
        images = OrderedDict(
            (key, ase_to_xyz(atoms, file=False)) for key, atoms in self.images.items()
        )

        df = pd.DataFrame.from_dict(images, orient="index", columns=["xyz"])
        df["energy"] = self.targets

        return df