Source code for ml4chem.active

import numpy as np
import logging
import itertools

# Module-level logger object
logger = logging.getLogger()


class ActiveLearning(object):
    """Active Learning

    Parameters
    ----------
    labeled : list
        List of labeled graphs or objects.
    unlabeled : list
        List of unlabeled graphs or objects.
    atomistic : bool, optional
        Use atomistic (node-level) similarities, by default True.
    """

    def __init__(self, labeled, unlabeled, atomistic=True):
        self.labeled = labeled
        self.unlabeled = unlabeled

        if not atomistic:
            raise NotImplementedError(
                "Non-atomistic similarities are not implemented yet."
            )

        self.atomistic = atomistic
    def run(self, kernel, max_variance=10, max_iter=None):
        """Run the active learning procedure

        Parameters
        ----------
        kernel : object
            A kernel to measure similarity.
        max_variance : float, optional
            Maximum variance allowed, by default 10. Currently unused; see
            the TODO below.
        max_iter : int, optional
            Maximum number of iterations allowed, by default None.
        """
        converged = False
        nodal = self.atomistic

        logger.info("Computing diagonal matrix of both labeled and unlabeled data...")
        self.D = kernel.diag(self.labeled + self.unlabeled, nodal=nodal)
        logger.info("Finished...\n")

        # Cumulative node counts delimit each graph's slice of global node
        # indices in the concatenated (labeled + unlabeled) node list.
        _indices = np.cumsum(
            [len(graph.nodes) for graph in self.labeled + self.unlabeled]
        ).tolist()

        l_indices = []
        u_indices = []

        for i, index in enumerate(_indices):
            if i == 0:
                u_indices.append(list(range(0, index)))
            else:
                u_indices.append(list(range(_indices[i - 1], index)))

        del _indices

        iterations = 0

        while not converged:
            iterations += 1
            self.variances = []
            logger.info("Labeled data points : {}.".format(len(self.labeled)))
            logger.info("Unlabeled data points : {}.".format(len(self.unlabeled)))

            self._rll = kernel(self.labeled, nodal=nodal)
            # Kernel among unlabeled graphs (stored but not used below).
            self._ruu = kernel(self.unlabeled, nodal=nodal)
            self._rlu = kernel(self.labeled, self.unlabeled, nodal=nodal)

            # On the first pass, move the node indices of the single
            # initially labeled graph out of the unlabeled pool.
            if len(self.labeled) == 1:
                l_indices += u_indices.pop(0)

            # Normalize the raw kernels by the diagonal so that every node
            # has unit self-similarity.
            Dl = np.take(self.D, l_indices) ** -0.5
            self.kll = Dl[None, :] * self._rll * Dl[:, None]

            _u_indices = list(itertools.chain.from_iterable(u_indices))
            Du = np.take(self.D, _u_indices) ** -0.5
            self.klu = Dl[:, None] * self._rlu * Du[None, :]

            k_inv = np.linalg.pinv(self.kll)

            # Posterior variance of each unlabeled node given the labeled
            # ones: vu = 1 - k_u^T K_ll^{-1} k_u.
            for u in range(len(_u_indices)):
                klu_u = self.klu[:, u]
                vu = 1.0 - klu_u.T.dot(k_inv).dot(klu_u)
                self.variances.append(vu)

            self.max_var_index = np.argmax(self.variances)
            self.current_var = self.variances[self.max_var_index]
            index_to_search = _u_indices[self.max_var_index]

            # Label the graph that contains the node with maximum variance.
            for index, graph in enumerate(u_indices):
                if index_to_search in graph:
                    add_to_labeled = self.unlabeled.pop(index)
                    self.labeled.append(add_to_labeled)
                    l_indices += graph
                    u_indices.pop(index)
                    break

            logger.info(
                "A graph with variance {} has been labeled.".format(self.current_var)
            )

            # TODO add max_variance as a criterion for converging.
            # if self.current_var > max_variance:
            #     add_to_labeled = self.unlabeled.pop(self.pop_index)
            #     self.labeled.append(add_to_labeled)
            # else:
            #     converged = True
            #     logger.info("Convergence reached")

            if len(self.unlabeled) == 0:
                logger.info("There are no more unlabeled data points...")
                break
            elif iterations == max_iter:
                logger.info("Total number of iterations was reached...")
                converged = True
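

# ---------------------------------------------------------------------------
# Usage sketch (not part of the ml4chem module): a minimal, self-contained
# illustration of how ActiveLearning can be driven. The kernel interface is
# an assumption inferred from the calls above -- a callable returning
# node-level similarity matrices via kernel(x, nodal=...) and
# kernel(x, y, nodal=...), plus a diag(graphs, nodal=...) method returning
# the self-similarity of every node. _Graph and _DotProductKernel below are
# hypothetical stand-ins, not ml4chem classes.


class _Graph(object):
    """Toy graph: ``nodes`` is a 2D array with one feature vector per node."""

    def __init__(self, nodes):
        self.nodes = nodes


class _DotProductKernel(object):
    """Toy node-level kernel; similarities are dot products of node features.
    Normalization by the diagonal happens inside ActiveLearning.run()."""

    @staticmethod
    def _stack(graphs):
        # Concatenate node features of all graphs into one matrix.
        return np.vstack([np.asarray(graph.nodes) for graph in graphs])

    def __call__(self, x, y=None, nodal=True):
        X = self._stack(x)
        Y = X if y is None else self._stack(y)
        return X.dot(Y.T)

    def diag(self, graphs, nodal=True):
        # Self-similarity of each node, i.e. the Gram matrix diagonal.
        X = self._stack(graphs)
        return np.einsum("ij,ij->i", X, X)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    rng = np.random.RandomState(0)

    # Four random graphs with 2-4 nodes and 3 features per node.
    graphs = [_Graph(rng.rand(rng.randint(2, 5), 3)) for _ in range(4)]

    # Start from one labeled graph and query the rest by maximum variance.
    learner = ActiveLearning(labeled=graphs[:1], unlabeled=graphs[1:])
    learner.run(_DotProductKernel(), max_iter=2)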