import logging
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
# NOTE(review): the module path of this import was lost when the source was
# extracted; it is reconstructed as ML4Chem's serialization module -- confirm.
from ml4chem.data.serialization import load

# Module-level logger. Calling getLogger() without a name returns the root
# logger, so messages go to whatever handlers the application configured there.
logger = logging.getLogger()

def parity(predictions, true, scores=False, filename=None, **kwargs):
    """Draw a parity plot of model predictions against true values.

    Predictions are drawn as red dots over a thin black diagonal that spans
    the range of the true values.

    Parameters
    ----------
    predictions : list or ndarray
        Model predictions in a list.
    true : list or ndarray
        Targets or true values.
    scores : bool
        Print scores (R-squared, RMSE and MAE) in the parity plot.
    filename : str
        A name to save the plot to a file. If filename is None, the plot is
        displayed with plt.show() instead of being saved.

    Notes
    -----
    kwargs accepts all valid keyword arguments for matplotlib.pyplot.savefig.
    """
    min_val = min(true)
    max_val = max(true)

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)
    ax.plot(true, predictions, "r.")
    # Reference diagonal: a perfect model puts every point on this line.
    ax.plot([min_val, max_val], [min_val, max_val], "k-", lw=0.3)
    plt.xlabel("True Values")
    plt.ylabel("ML4Chem Predictions")

    if scores:
        rmse = np.sqrt(mean_squared_error(true, predictions))
        mae = mean_absolute_error(true, predictions)
        correlation = r2_score(true, predictions)
        # Anchor the score box at (min, max) of the true values.
        plt.text(
            min_val,
            max_val,
            "R-squared = {:.2f} \n"
            "RMSE = {:.2f}\n"
            "MAE = {:.2f}\n".format(correlation, rmse, mae),
        )

    if filename is None:
        # NOTE(review): this branch was empty in the extracted source; the
        # call is reconstructed from the docstring ("we call ...") -- confirm.
        plt.show()
    else:
        plt.savefig(filename, **kwargs)
def _parse_log_line(line):
    """Parse one log line into ``(epoch, loss, training, test)`` or None.

    None is returned when the line is not an epoch record (too few fields or
    non-numeric values). ``test`` is None when the test-error field is absent
    or not a number. Nothing is partially parsed, so callers can append the
    result without the columns getting out of step.
    """
    fields = line.split()
    try:
        record = (int(fields[0]), float(fields[3]), float(fields[4]))
    except (IndexError, ValueError):
        return None

    try:
        test_error = float(fields[6])
    except (IndexError, ValueError):
        test_error = None
    return record + (test_error,)


def _append_new_records(handle, started, columns):
    """Append records from the unread lines of `handle`; return `started`.

    Lines are ignored until one containing "Epoch" has been seen.
    """
    epochs, loss, training, test = columns
    for line in handle.readlines():
        if "Epoch" in line:
            started = True
        if not started:
            continue

        record = _parse_log_line(line)
        if record is None:
            continue

        epoch, loss_value, training_error, test_error = record
        epochs.append(epoch)
        loss.append(loss_value)
        training.append(training_error)
        if test_error is not None:
            test.append(test_error)
    return started


def _metric_curves(metric, loss, training, test):
    """Return the ``(label, values)`` pairs that are plotted for `metric`."""
    curves = {
        "loss": [("Loss", loss)],
        "training": [("Training", training)],
        "test": [("Test", test)],
        "combined": [("Training", training), ("Test", test)],
    }
    return curves.get(metric, [])


def _follow_log(handle, refresh, metric, columns):
    """Redraw an interactive plot every `refresh` seconds; never returns."""
    epochs, loss, training, test = columns

    # Interactive figure that is updated in place while the log grows.
    plt.ion()
    figure = plt.figure()
    axes = figure.add_subplot(111)
    # This is for autoscale
    axes.set_autoscale_on(True)
    axes.autoscale_view(True, True, True)
    axes.set_xlabel("Epochs")
    annotation = axes.text(0, 0, "")

    # Series whose latest value is written next to the curve.
    if metric == "loss":
        annotated = loss
    elif metric == "test":
        annotated = test
    else:
        annotated = training

    lines = {}
    started = False
    while True:
        # readlines() resumes where the previous pass stopped, so only lines
        # added to the file since then are parsed.
        started = _append_new_records(handle, started, columns)

        for label, values in _metric_curves(metric, loss, training, test):
            if label in lines:
                lines[label].set_data(epochs, values)
            else:
                (lines[label],) = axes.plot(epochs, values, label=label)

        # Updating annotation (skipped until there is something to report).
        if annotated:
            reported = annotated[-1]
            annotation.set_text("{:.5f}".format(reported))
            annotation.set_position(
                (int(epochs[-1] * 0.9), float(reported * 1.3))
            )

        plt.legend(loc="upper left")
        axes.relim()
        axes.autoscale_view(True, True, True)

        # Draw the plot
        plt.draw()
        plt.pause(refresh)


def read_log(logfile, metric="loss", refresh=None, data_only=False):
    """Read the logfile

    Lines are read once a line containing "Epoch" has been found. Each record
    is split on whitespace: field 0 is the epoch, field 3 the loss, field 4
    the training error and field 6 (optional) the test error. Lines that do
    not parse as a record are skipped.

    Parameters
    ----------
    logfile : str
        Path to logfile.
    metric : str
        Case-insensitive name of what to plot:

        - "loss": Loss function values.
        - "training": Training error.
        - "test": Test error.
        - "combined": training + test errors in same plot.
    refresh : float
        Interval in seconds before refreshing log file plot. When given, the
        function keeps re-reading the file and redrawing the plot until it is
        interrupted, so it never returns and `data_only` has no effect.
    data_only : bool
        If set to True, no plot is made and this function returns only data
        in a dataframe. The "test" column is included only when metric is
        "combined", e.g.:

        >>> df.head()
           epochs      loss  training      test
        0       1  33779.46  815.6884  793.3943

    Returns
    -------
    pandas.DataFrame or None
        The dataframe if data_only is true; otherwise the plot is shown and
        nothing is returned.
    """
    metric = metric.lower()
    epochs, loss, training, test = [], [], [], []
    columns = (epochs, loss, training, test)

    # The context manager closes the file on every exit path, including an
    # interrupt of the refresh loop.
    with open(logfile, "r") as handle:
        if refresh is not None:
            _follow_log(handle, refresh, metric, columns)
        _append_new_records(handle, False, columns)

    if data_only:
        names = ["epochs", "loss", "training", "test"]
        if metric != "combined":
            names.pop(-1)
        data = OrderedDict(zip(names, columns))
        return pd.DataFrame.from_dict(data)

    for label, values in _metric_curves(metric, loss, training, test):
        plt.plot(epochs, values, label=label)
    plt.legend(loc="upper left")
    # NOTE(review): the statement of this final branch was missing from the
    # extracted source; showing the figure is the reconstruction -- confirm.
    plt.show()
def _collect_atomic_features(latent_space):
    """Flatten a feature-space mapping into parallel symbol/vector lists."""
    # This conditional is needed if you are passing an atomic feature database.
    if b"feature_space" in latent_space:
        latent_space = latent_space[b"feature_space"]

    symbols = []
    vectors = []
    for feature_space in latent_space.values():
        for symbol, feature_vector in feature_space:
            try:
                symbol = symbol.decode("utf-8")
            except AttributeError:
                # No decode() method: the symbol is used as it is.
                pass

            if not isinstance(feature_vector, np.ndarray):
                # presumably a torch tensor -- anything exposing .numpy()
                feature_vector = feature_vector.numpy()

            symbols.append(symbol)
            vectors.append(feature_vector)
    return symbols, vectors


def plot_atomic_features(
    latent_space,
    method="PCA",
    dimensions=2,
    backend="seaborn",
    data_only=False,
    preprocessor=None,
    backend_kwargs=None,
    **kwargs,
):
    """Plot high dimensional atomic feature vectors

    This function can take a feature space dictionary, or a database file
    and plot the atomic features using PCA or t-SNE.

    $ ml4chem --plot tsne --file path.db

    Parameters
    ----------
    latent_space : dict or str
        Dictionary of atomic features or path to database file. Each value of
        the dictionary is iterated as (symbol, feature_vector) pairs.
    method : str, optional
        Dimensionality reduction method to be employed, by default "PCA".
        Supported are: "PCA" and "TSNE" (case-insensitive).
    dimensions : int, optional
        Number of dimensions to reduce the high dimensional atomic feature
        vectors, by default 2. Supported are 2 and 3.
    backend : str, optional
        Select the backend to plot features. Supported are "plotly" and
        "seaborn" (2D only), by default "seaborn".
    data_only : bool
        If set to True, nothing is plotted and only the data is returned: a
        dataframe with a "Symbol" column followed by one column per reduced
        dimension ("PCA-1", "PCA-2", ... or "t-SNE-1", ...).
    preprocessor : obj
        One of the preprocessors supported by sklearn e.g.: StandardScaler(),
        Normalizer(). When given, it is chained in front of the reduction
        with sklearn's make_pipeline.
    backend_kwargs : dict
        Dictionary with extra keyword arguments that are forwarded to the
        constructor of the dimensionality reduction class
        (sklearn.decomposition.PCA or sklearn.manifold.TSNE). See the
        scikit-learn documentation of those classes for the valid keys.
    **kwargs
        dot_size : marker size used by the plotly backend, by default 2.

    Returns
    -------
    tuple
        (df, dim_reduction) if data_only is True, otherwise
        (plot, df, dim_reduction) where plot is the plotly figure or, for
        the seaborn backend, the matplotlib.pyplot module.

    Raises
    ------
    NotImplementedError
        For an unsupported method or number of dimensions and, when
        plotting, for an unsupported backend or a 3D seaborn plot.
    """
    if backend_kwargs is None:
        backend_kwargs = {}

    method = method.lower()
    backend = backend.lower()
    dot_size = kwargs.get("dot_size", 2)

    # Validate everything up front so that bad arguments fail before the
    # (potentially slow) dimensionality reduction is run.
    if method not in ("pca", "tsne"):
        raise NotImplementedError(
            "Supported methods are 'PCA' and 'TSNE', got {!r}".format(method)
        )

    if dimensions not in (2, 3):
        raise NotImplementedError(
            "Only 2 or 3 dimensions are supported, got {!r}".format(dimensions)
        )

    if not data_only:
        if backend not in ("plotly", "seaborn"):
            raise NotImplementedError(
                "Supported backends are 'plotly' and 'seaborn', "
                "got {!r}".format(backend)
            )
        if backend == "seaborn" and dimensions == 3:
            raise NotImplementedError("This backend is for 2D visualization")

    if isinstance(latent_space, str):
        latent_space = load(latent_space)

    full_symbols, full_ls = _collect_atomic_features(latent_space)

    if method == "pca":
        from sklearn.decomposition import PCA

        prefix = "PCA"
        dim_reduction = PCA(n_components=dimensions, **backend_kwargs)
    else:
        from sklearn import manifold

        prefix = "t-SNE"
        dim_reduction = manifold.TSNE(n_components=dimensions, **backend_kwargs)

    if preprocessor is not None:
        logger.info(
            f"Creating pipeline with preprocessor {preprocessor.__class__.__name__}..."
        )
        dim_reduction = make_pipeline(preprocessor, dim_reduction)

    reduced = dim_reduction.fit_transform(full_ls)

    # Plot argument name ("x", "y", "z") -> dataframe column name.
    axis = ["x", "y", "z"][:dimensions]
    labels = {
        name: "{}-{}".format(prefix, index + 1) for index, name in enumerate(axis)
    }
    columns = ["Symbol"] + [labels[name] for name in axis]
    rows = [
        [symbol, *components[:dimensions]]
        for symbol, components in zip(full_symbols, reduced)
    ]
    df = pd.DataFrame(rows, columns=columns)

    if data_only:
        return df, dim_reduction

    if backend == "plotly":
        scatter = px.scatter_3d if dimensions == 3 else px.scatter
        plot = scatter(df, color="Symbol", **labels)
        plot.update_traces(marker=dict(size=dot_size))
    else:
        # This hack is needed because it seems plotly import overwrite
        # everything.
        import matplotlib.pyplot as plt

        sns.scatterplot(**labels, data=df, hue="Symbol")
        plot = plt

    # Showing is best effort: a failure to display must not prevent the
    # caller from getting the plot and the data back.
    # NOTE(review): the body of this try was missing from the extracted
    # source; showing the plot is the reconstruction -- confirm.
    try:
        plot.show()
    except Exception as error:
        logger.warning("Could not display the plot: %s", error)

    return plot, df, dim_reduction