import logging
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
from ml4chem.data.serialization import load
logger = logging.getLogger()
def parity(predictions, true, scores=False, filename=None, **kwargs):
    """A parity plot function

    Plots model predictions against true values as red dots, together with
    the ideal ``y = x`` reference line.

    Parameters
    ----------
    predictions : list or ndarray
        Model predictions in a list.
    true : list or ndarray
        Targets or true values.
    scores : bool
        Print scores (R-squared, RMSE, MAE) in the parity plot.
    filename : str
        A name to save the plot to a file. If filename is non existent, we
        call plt.show().

    Notes
    -----
    kwargs accepts all valid keyword arguments for matplotlib.pyplot.savefig.
    """
    min_val = min(true)
    max_val = max(true)

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)
    # Predicted vs. true values, plus the ideal y = x line for reference.
    ax.plot(true, predictions, "r.")
    ax.plot([min_val, max_val], [min_val, max_val], "k-", lw=0.3)
    plt.xlabel("True Values")
    plt.ylabel("ML4Chem Predictions")

    if scores:
        # Annotate the plot with common regression metrics, anchored at the
        # top-left corner of the data range.
        rmse = np.sqrt(mean_squared_error(true, predictions))
        mae = mean_absolute_error(true, predictions)
        correlation = r2_score(true, predictions)
        plt.text(
            min_val,
            max_val,
            "R-squared = {:.2f} \n"
            "RMSE = {:.2f}\n"
            "MAE = {:.2f}\n".format(correlation, rmse, mae),
        )

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename, **kwargs)
def _parse_log_lines(lines, check, start, epochs, loss, training, test):
    """Parse raw log ``lines`` into the metric series, appending in place.

    Rows are only parsed once a line containing ``check`` (the table header)
    has been seen. Each data row is expected to hold the epoch number at
    token 0, the loss at token 3, the training error at token 4 and,
    optionally, the test error at token 6. Rows that fail to parse are
    skipped atomically so the four series never get out of sync (the
    original code could append an epoch without its loss, or append the
    same row twice when only the test column was missing).

    Returns the updated ``start`` flag so callers can resume parsing on the
    next chunk of a continuously-written file.
    """
    for line in lines:
        if check in line:
            start = True
        if not start:
            continue
        tokens = line.split()
        try:
            epoch = int(tokens[0])
            loss_value = float(tokens[3])
            training_value = float(tokens[4])
        except (ValueError, IndexError):
            # Header or otherwise non-data row; skip it entirely.
            continue
        epochs.append(epoch)
        loss.append(loss_value)
        training.append(training_value)
        try:
            # The test-error column is optional (e.g. no test set logged).
            test.append(float(tokens[6]))
        except (ValueError, IndexError):
            pass
    return start


def read_log(logfile, metric="loss", refresh=None, data_only=False):
    """Read the logfile

    Parameters
    ----------
    logfile : str
        Path to logfile.
    metric : str
        The keys,values of the dictionary are:

        - "loss": Loss function values.
        - "training": Training error.
        - "test": Test error.
        - "combined": training + test errors in same plot.
    refresh : float
        Interval in seconds before refreshing log file plot. When set, this
        function polls the logfile forever (live monitoring); interrupt the
        process to stop it.
    data_only : bool
        If set to True, this function returns only data in a dataframe with
        the following structure (no figure is created):

        >>> df.head()
           epochs      loss  training      test
        0       1  33779.46  815.6884  793.3943

    Returns
    -------
    pandas.DataFrame or matplotlib.pyplot object
        If data_only is true we return dataframe, otherwise a figure.
    """
    metric = metric.lower()
    check = "Epoch"
    epochs = []
    loss = []
    training = []
    test = []

    if refresh is not None:
        # Live-monitoring mode: keep the file handle open, poll for newly
        # appended lines every `refresh` seconds, and redraw the curves.
        plt.ion()
        figure = plt.figure()
        axes = figure.add_subplot(111)
        # This is for autoscale
        axes.set_autoscale_on(True)
        axes.autoscale_view(True, True, True)
        axes.set_xlabel("Epochs")
        annotation = axes.text(0, 0, "")
        plt.show(block=False)

        f = open(logfile, "r")
        try:
            start = False
            initialized = False
            curve = curve2 = None
            # Intentionally endless loop; stop with KeyboardInterrupt.
            while True:
                start = _parse_log_lines(
                    f.readlines(), check, start, epochs, loss, training, test
                )

                if not initialized:
                    if metric == "loss":
                        (curve,) = plt.plot(epochs, loss, label="Loss")
                    elif metric == "training":
                        (curve,) = plt.plot(epochs, training, label="Training")
                    elif metric == "test":
                        (curve,) = plt.plot(epochs, test, label="Test")
                    elif metric == "combined":
                        (curve,) = plt.plot(epochs, training, label="Training")
                        (curve2,) = plt.plot(epochs, test, label="Test")
                else:
                    # Subsequent passes only update the existing line data.
                    if metric == "loss":
                        curve.set_data(epochs, loss)
                    elif metric == "training":
                        curve.set_data(epochs, training)
                    elif metric == "test":
                        curve.set_data(epochs, test)
                    elif metric == "combined":
                        curve.set_data(epochs, training)
                        curve2.set_data(epochs, test)

                # Annotate the most recently reported value near the end of
                # the curve; skip until at least one row has been parsed.
                values = loss if metric == "loss" else training
                if values:
                    reported = values[-1]
                    annotation.set_text("{:.5f}".format(reported))
                    annotation.set_position(
                        (int(epochs[-1] * 0.9), float(reported * 1.3))
                    )

                plt.legend(loc="upper left")
                axes.relim()
                axes.autoscale_view(True, True, True)
                # Draw the plot and wait before polling again.
                plt.draw()
                plt.pause(refresh)
                initialized = True
        finally:
            f.close()
    else:
        # Static mode: parse the whole file once.
        with open(logfile, "r") as f:
            _parse_log_lines(
                f.readlines(), check, False, epochs, loss, training, test
            )

        if data_only:
            data = OrderedDict()
            columns = ["epochs", "loss", "training", "test"]
            arr = [epochs, loss, training, test]
            if metric != "combined":
                # Only the combined view carries the test column.
                columns.pop(-1)
                arr.pop(-1)
            for i, column in enumerate(columns):
                data[column] = arr[i]
            return pd.DataFrame.from_dict(data)

        if metric == "loss":
            plt.plot(epochs, loss, label="loss")
        elif metric == "training":
            plt.plot(epochs, training, label="Training")
        elif metric == "test":
            # Fixed: this curve was mislabeled "Training" in the original.
            plt.plot(epochs, test, label="Test")
        elif metric == "combined":
            plt.plot(epochs, training, label="Training")
            plt.plot(epochs, test, label="Test")
        plt.show(block=True)
def plot_atomic_features(
    latent_space,
    method="PCA",
    dimensions=2,
    backend="seaborn",
    data_only=False,
    preprocessor=None,
    backend_kwargs=None,
    **kwargs,
):
    """Plot high dimensional atomic feature vectors

    This function can take a feature space dictionary, or a database file
    and plot the atomic features using PCA or t-SNE.

        $ ml4chem --plot tsne --file path.db

    Parameters
    ----------
    latent_space : dict or str
        Dictionary of atomic features or path to database file.
    method : str, optional
        Dimensionality reduction method to employed, by default "PCA".
        Supported are: "PCA" and "TSNE".
    dimensions : int, optional
        Number of dimensions to reduce the high dimensional atomic feature
        vectors, by default 2.
    backend : str, optional
        Select the backend to plot features. Supported are "plotly" and
        "seaborn", by default "seaborn".
    data_only : bool
        If set to True, this function returns only the dataframe and the
        fitted dimensionality-reduction object.
    preprocessor : obj
        One of the preprocessors supported by sklearn e.g.: StandardScaler(),
        Normalizer().
    backend_kwargs : dict
        Dictionary with extra keyword arguments forwarded to the
        dimensionality-reduction estimator (sklearn PCA or TSNE).
        For more information see:

        - https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
        - https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html

    Returns
    -------
    (df, dim_reduction) when ``data_only`` is True, otherwise
    (plot, df, dim_reduction) where ``plot`` is a plotly figure or the
    matplotlib.pyplot module depending on the backend.

    Raises
    ------
    NotImplementedError
        If ``method`` is unsupported, ``dimensions`` exceeds 3, or a 3D
        plot is requested with the seaborn backend.
    """
    if backend_kwargs is None:
        backend_kwargs = {}

    method = method.lower()
    backend = backend.lower()
    dot_size = kwargs.get("dot_size", 2)

    if method not in ("pca", "tsne"):
        raise NotImplementedError

    if backend == "seaborn":
        # This hack is needed because it seems plotly import overwrite
        # everything.
        import matplotlib.pyplot as plt

    axis = ["x", "y", "z"]
    if dimensions > 3:
        raise NotImplementedError
    elif dimensions == 2:
        axis.pop(-1)

    if isinstance(latent_space, str):
        latent_space = load(latent_space)

    # This conditional is needed if you are passing an atomic feature database.
    if b"feature_space" in latent_space.keys():
        latent_space = latent_space[b"feature_space"]

    # Flatten the per-image feature spaces into parallel symbol/vector lists.
    full_ls = []
    full_symbols = []
    for image_hash, feature_space in latent_space.items():
        for symbol, feature_vector in feature_space:
            try:
                symbol = symbol.decode("utf-8")
            except AttributeError:
                pass

            if not isinstance(feature_vector, np.ndarray):
                # e.g. torch tensors provide .numpy() for conversion.
                feature_vector = feature_vector.numpy()

            full_symbols.append(symbol)
            full_ls.append(feature_vector)

    # Build the estimator and per-axis column labels for the chosen method.
    if method == "pca":
        from sklearn.decomposition import PCA

        labels = {str(axis[i]): "PCA-{}".format(i + 1) for i in range(len(axis))}
        dim_reduction = PCA(n_components=dimensions, **backend_kwargs)
    else:
        from sklearn import manifold

        labels = {str(axis[i]): "t-SNE-{}".format(i + 1) for i in range(len(axis))}
        dim_reduction = manifold.TSNE(n_components=dimensions, **backend_kwargs)

    if preprocessor is not None:
        logger.info(
            f"Creating pipeline with preprocessor {preprocessor.__class__.__name__}..."
        )
        dim_reduction = make_pipeline(preprocessor, dim_reduction)

    reduced = dim_reduction.fit_transform(full_ls)

    # One dataframe row per atom: symbol followed by its reduced coordinates.
    to_pandas = []
    for i, element in enumerate(reduced):
        entry = [full_symbols[i]]
        for d in range(dimensions):
            entry.append(element[d])
        to_pandas.append(entry)

    columns = ["Symbol"]
    args = {}
    for key in axis:
        columns.append(labels[key])
        args[key] = labels[key]

    df = pd.DataFrame(to_pandas, columns=columns)

    if backend == "plotly":
        args["color"] = "Symbol"
        if dimensions == 3:
            plt = px.scatter_3d(df, **args)
        else:
            plt = px.scatter(df, **args)
        plt.update_traces(marker=dict(size=dot_size))
    elif backend == "seaborn":
        if dimensions == 3:
            # The original raised a bare string here, which is itself a
            # TypeError at runtime; raise a real exception instead.
            raise NotImplementedError("This backend is for 2D visualization")
        sns.scatterplot(**labels, data=df, hue="Symbol")

    if data_only:
        return df, dim_reduction

    try:
        plt.show()
    except Exception:
        # Best effort: some backends/environments cannot display interactively.
        pass

    return plt, df, dim_reduction