import logging
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
from ml4chem.data.serialization import load
logger = logging.getLogger()
def parity(predictions, true, scores=False, filename=None, **kwargs):
    """A parity plot function

    Plots model predictions against true values as red dots, together with
    the ideal ``y = x`` reference line.

    Parameters
    ----------
    predictions : list or ndarray
        Model predictions in a list.
    true : list or ndarray
        Targets or true values.
    scores : bool
        Print scores (R-squared, RMSE, MAE) in the parity plot.
    filename : str
        A name to save the plot to a file. If filename is non existent, we
        call plt.show().

    Notes
    -----
    kwargs accepts all valid keyword arguments for matplotlib.pyplot.savefig.
    """
    min_val = min(true)
    max_val = max(true)

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)
    # Predicted vs. true values, plus the ideal y = x line for reference.
    ax.plot(true, predictions, "r.")
    ax.plot([min_val, max_val], [min_val, max_val], "k-", lw=0.3)
    plt.xlabel("True Values")
    plt.ylabel("ML4Chem Predictions")

    if scores:
        # Annotate the plot with common regression metrics, anchored at the
        # top-left corner of the data range.
        rmse = np.sqrt(mean_squared_error(true, predictions))
        mae = mean_absolute_error(true, predictions)
        correlation = r2_score(true, predictions)
        plt.text(
            min_val,
            max_val,
            "R-squared = {:.2f} \n"
            "RMSE = {:.2f}\n"
            "MAE = {:.2f}\n".format(correlation, rmse, mae),
        )

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename, **kwargs)
def _parse_log_lines(lines, check, start, epochs, loss, training, test):
    """Parse raw log ``lines`` into the metric series, appending in place.

    Rows are only parsed once a line containing ``check`` (the table header)
    has been seen. Each data row is expected to hold the epoch number at
    token 0, the loss at token 3, the training error at token 4 and,
    optionally, the test error at token 6. Rows that fail to parse are
    skipped atomically so the four series never get out of sync (the
    original code could append an epoch without its loss, or append the
    same row twice when only the test column was missing).

    Returns the updated ``start`` flag so callers can resume parsing on the
    next chunk of a continuously-written file.
    """
    for line in lines:
        if check in line:
            start = True
        if not start:
            continue
        tokens = line.split()
        try:
            epoch = int(tokens[0])
            loss_value = float(tokens[3])
            training_value = float(tokens[4])
        except (ValueError, IndexError):
            # Header or otherwise non-data row; skip it entirely.
            continue
        epochs.append(epoch)
        loss.append(loss_value)
        training.append(training_value)
        try:
            # The test-error column is optional (e.g. no test set logged).
            test.append(float(tokens[6]))
        except (ValueError, IndexError):
            pass
    return start


def read_log(logfile, metric="loss", refresh=None, data_only=False):
    """Read the logfile

    Parameters
    ----------
    logfile : str
        Path to logfile.
    metric : str
        The keys,values of the dictionary are:

        - "loss": Loss function values.
        - "training": Training error.
        - "test": Test error.
        - "combined": training + test errors in same plot.
    refresh : float
        Interval in seconds before refreshing log file plot. When set, this
        function polls the logfile forever (live monitoring); interrupt the
        process to stop it.
    data_only : bool
        If set to True, this function returns only data in a dataframe with
        the following structure (no figure is created):

        >>> df.head()
           epochs      loss  training      test
        0       1  33779.46  815.6884  793.3943

    Returns
    -------
    pandas.DataFrame or matplotlib.pyplot object
        If data_only is true we return dataframe, otherwise a figure.
    """
    metric = metric.lower()
    check = "Epoch"
    epochs = []
    loss = []
    training = []
    test = []

    if refresh is not None:
        # Live-monitoring mode: keep the file handle open, poll for newly
        # appended lines every `refresh` seconds, and redraw the curves.
        plt.ion()
        figure = plt.figure()
        axes = figure.add_subplot(111)
        # This is for autoscale
        axes.set_autoscale_on(True)
        axes.autoscale_view(True, True, True)
        axes.set_xlabel("Epochs")
        annotation = axes.text(0, 0, "")
        plt.show(block=False)

        f = open(logfile, "r")
        try:
            start = False
            initialized = False
            curve = curve2 = None
            # Intentionally endless loop; stop with KeyboardInterrupt.
            while True:
                start = _parse_log_lines(
                    f.readlines(), check, start, epochs, loss, training, test
                )

                if not initialized:
                    if metric == "loss":
                        (curve,) = plt.plot(epochs, loss, label="Loss")
                    elif metric == "training":
                        (curve,) = plt.plot(epochs, training, label="Training")
                    elif metric == "test":
                        (curve,) = plt.plot(epochs, test, label="Test")
                    elif metric == "combined":
                        (curve,) = plt.plot(epochs, training, label="Training")
                        (curve2,) = plt.plot(epochs, test, label="Test")
                else:
                    # Subsequent passes only update the existing line data.
                    if metric == "loss":
                        curve.set_data(epochs, loss)
                    elif metric == "training":
                        curve.set_data(epochs, training)
                    elif metric == "test":
                        curve.set_data(epochs, test)
                    elif metric == "combined":
                        curve.set_data(epochs, training)
                        curve2.set_data(epochs, test)

                # Annotate the most recently reported value near the end of
                # the curve; skip until at least one row has been parsed.
                values = loss if metric == "loss" else training
                if values:
                    reported = values[-1]
                    annotation.set_text("{:.5f}".format(reported))
                    annotation.set_position(
                        (int(epochs[-1] * 0.9), float(reported * 1.3))
                    )

                plt.legend(loc="upper left")
                axes.relim()
                axes.autoscale_view(True, True, True)
                # Draw the plot and wait before polling again.
                plt.draw()
                plt.pause(refresh)
                initialized = True
        finally:
            f.close()
    else:
        # Static mode: parse the whole file once.
        with open(logfile, "r") as f:
            _parse_log_lines(
                f.readlines(), check, False, epochs, loss, training, test
            )

        if data_only:
            data = OrderedDict()
            columns = ["epochs", "loss", "training", "test"]
            arr = [epochs, loss, training, test]
            if metric != "combined":
                # Only the combined view carries the test column.
                columns.pop(-1)
                arr.pop(-1)
            for i, column in enumerate(columns):
                data[column] = arr[i]
            return pd.DataFrame.from_dict(data)

        if metric == "loss":
            plt.plot(epochs, loss, label="loss")
        elif metric == "training":
            plt.plot(epochs, training, label="Training")
        elif metric == "test":
            # Fixed: this curve was mislabeled "Training" in the original.
            plt.plot(epochs, test, label="Test")
        elif metric == "combined":
            plt.plot(epochs, training, label="Training")
            plt.plot(epochs, test, label="Test")
        plt.show(block=True)
def plot_atomic_features(
    latent_space,
    method="PCA",
    dimensions=2,
    backend="seaborn",
    data_only=False,
    preprocessor=None,
    backend_kwargs=None,
    **kwargs,
):
    """Plot high dimensional atomic feature vectors

    This function can take a feature space dictionary, or a database file
    and plot the atomic features using PCA or t-SNE.

        $ ml4chem --plot tsne --file path.db

    Parameters
    ----------
    latent_space : dict or str
        Dictionary of atomic features or path to database file.
    method : str, optional
        Dimensionality reduction method to employed, by default "PCA".
        Supported are: "PCA" and "TSNE".
    dimensions : int, optional
        Number of dimensions to reduce the high dimensional atomic feature
        vectors, by default 2.
    backend : str, optional
        Select the backend to plot features. Supported are "plotly" and
        "seaborn", by default "seaborn".
    data_only : bool
        If set to True, this function returns only the dataframe and the
        fitted dimensionality-reduction object.
    preprocessor : obj
        One of the preprocessors supported by sklearn e.g.: StandardScaler(),
        Normalizer().
    backend_kwargs : dict
        Dictionary with extra keyword arguments forwarded to the
        dimensionality-reduction estimator (sklearn PCA or TSNE).
        For more information see:

        - https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
        - https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html

    Returns
    -------
    (df, dim_reduction) when ``data_only`` is True, otherwise
    (plot, df, dim_reduction) where ``plot`` is a plotly figure or the
    matplotlib.pyplot module depending on the backend.

    Raises
    ------
    NotImplementedError
        If ``method`` is unsupported, ``dimensions`` exceeds 3, or a 3D
        plot is requested with the seaborn backend.
    """
    if backend_kwargs is None:
        backend_kwargs = {}

    method = method.lower()
    backend = backend.lower()
    dot_size = kwargs.get("dot_size", 2)

    if method not in ("pca", "tsne"):
        raise NotImplementedError

    if backend == "seaborn":
        # This hack is needed because it seems plotly import overwrite
        # everything.
        import matplotlib.pyplot as plt

    axis = ["x", "y", "z"]
    if dimensions > 3:
        raise NotImplementedError
    elif dimensions == 2:
        axis.pop(-1)

    if isinstance(latent_space, str):
        latent_space = load(latent_space)

    # This conditional is needed if you are passing an atomic feature database.
    if b"feature_space" in latent_space.keys():
        latent_space = latent_space[b"feature_space"]

    # Flatten the per-image feature spaces into parallel symbol/vector lists.
    full_ls = []
    full_symbols = []
    for image_hash, feature_space in latent_space.items():
        for symbol, feature_vector in feature_space:
            try:
                symbol = symbol.decode("utf-8")
            except AttributeError:
                pass

            if not isinstance(feature_vector, np.ndarray):
                # e.g. torch tensors provide .numpy() for conversion.
                feature_vector = feature_vector.numpy()

            full_symbols.append(symbol)
            full_ls.append(feature_vector)

    # Build the estimator and per-axis column labels for the chosen method.
    if method == "pca":
        from sklearn.decomposition import PCA

        labels = {str(axis[i]): "PCA-{}".format(i + 1) for i in range(len(axis))}
        dim_reduction = PCA(n_components=dimensions, **backend_kwargs)
    else:
        from sklearn import manifold

        labels = {str(axis[i]): "t-SNE-{}".format(i + 1) for i in range(len(axis))}
        dim_reduction = manifold.TSNE(n_components=dimensions, **backend_kwargs)

    if preprocessor is not None:
        logger.info(
            f"Creating pipeline with preprocessor {preprocessor.__class__.__name__}..."
        )
        dim_reduction = make_pipeline(preprocessor, dim_reduction)

    reduced = dim_reduction.fit_transform(full_ls)

    # One dataframe row per atom: symbol followed by its reduced coordinates.
    to_pandas = []
    for i, element in enumerate(reduced):
        entry = [full_symbols[i]]
        for d in range(dimensions):
            entry.append(element[d])
        to_pandas.append(entry)

    columns = ["Symbol"]
    args = {}
    for key in axis:
        columns.append(labels[key])
        args[key] = labels[key]

    df = pd.DataFrame(to_pandas, columns=columns)

    if backend == "plotly":
        args["color"] = "Symbol"
        if dimensions == 3:
            plt = px.scatter_3d(df, **args)
        else:
            plt = px.scatter(df, **args)
        plt.update_traces(marker=dict(size=dot_size))
    elif backend == "seaborn":
        if dimensions == 3:
            # The original raised a bare string here, which is itself a
            # TypeError at runtime; raise a real exception instead.
            raise NotImplementedError("This backend is for 2D visualization")
        sns.scatterplot(**labels, data=df, hue="Symbol")

    if data_only:
        return df, dim_reduction

    try:
        plt.show()
    except Exception:
        # Best effort: some backends/environments cannot display interactively.
        pass

    return plt, df, dim_reduction