import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import torch
from deeptime.decomposition.deep import TAE
from deeptime.util.data import TrajectoryDataset
from deeptime.util.torch import MLP
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from torch.utils.data import DataLoader
from dylightful.utilities import make_name, parse_file_path
def find_states_kmeans(
    proj, prefix, save_path, num_cluster=15, tol=0.01, plotting=True
):
    """Cluster the projection with K-Means to obtain discretized states for the MSM.

    Fits K-Means for every cluster count k = 2 .. num_cluster - 1 and records
    the model score and the inertia (sum of squared distances) so the elbow
    method can be applied to pick the number of states.

    Args:
        proj: Low-dimensional projection of the trajectory (2-D array-like
            accepted by ``sklearn.cluster.KMeans``).
        prefix (str): Prefix for the output plot file names.
        save_path (str): Path to the output folder for the plots.
        num_cluster (int, optional): Upper bound (exclusive) on the number of
            clusters tried. Defaults to 15.
        tol (float, optional): Unused here; kept for a uniform clustering
            interface with the other discretizers. Defaults to 0.01.
        plotting (bool, optional): Whether to write elbow/score plots.
            Defaults to True.

    Returns:
        list: ``[scores, sum_of_squared_distances]`` — arrays of length
        ``num_cluster`` whose entries 0 and 1 stay zero (no fit for k < 2).
    """
    scores = np.zeros(num_cluster)
    sum_of_squared_distances = np.zeros(num_cluster)
    for i in range(2, num_cluster):
        clf = KMeans(n_clusters=i).fit(proj)
        scores[i] = clf.score(proj)
        sum_of_squared_distances[i] = clf.inertia_
    if plotting:
        plot_ellbow_kmeans(
            metric=sum_of_squared_distances, prefix=prefix, save_path=save_path
        )
        plot_scores_kmeans(metric=scores, prefix=prefix, save_path=save_path)
    print("K-Means Convergence")
    print(sum_of_squared_distances)
    return [scores, sum_of_squared_distances]
def tae_discretizer(
    time_ser,
    num_states,
    clustering=None,
    size=3,
    prefix=None,
    save_path=None,
    num_cluster=15,
    tol=0.01,
    plotting=True,
):
    """Project a trajectory to 1-D with a time-lagged autoencoder (Noé et al.).

    Trains a TAE (lag 1) on the time series, optionally plots the training
    curves and the learned projection, and optionally runs a clustering
    callback (e.g. ``find_states_kmeans``) on the projection.

    Args:
        time_ser: Dynophore trajectory converted by the parser; 2-D array with
            one row per frame.
        num_states (int): Requested number of MSM states; used to widen the
            cluster scan when a clustering callback is given.
        clustering (callable, optional): Callback with the signature
            ``clustering(proj=..., prefix=..., save_path=..., num_cluster=...,
            tol=..., plotting=...)``. Defaults to None (no clustering).
        size (int, optional): Hidden-layer width factor; the hidden layer has
            ``2 * size * n_features`` units. Defaults to 3.
        prefix (str, optional): Name prefix for output files. Defaults to None.
        save_path (str, optional): Path to output folder. Defaults to None.
        num_cluster (int, optional): Maximal number of MSM states to scan.
            Defaults to 15.
        tol (float, optional): Tolerance forwarded to the clustering callback.
            Defaults to 0.01.
        plotting (bool, optional): Whether to plot TAE training/transform.
            Defaults to True.

    Returns:
        numpy.ndarray: The 1-D TAE projection of ``time_ser``.
    """
    cluster = False
    if clustering is not None:
        # Scan at least twice as many clusters as requested states.
        if num_states > num_cluster // 2:
            num_cluster = 2 * num_states
        cluster = True
    save_path = parse_file_path(save_path)
    num_superfeatures = len(time_ser[0])
    # Time-lagged dataset with lag 1; 50/50 train/validation split.
    dataset = TrajectoryDataset(1, time_ser.astype(np.float32))
    n_val = int(len(dataset) * 0.5)
    train_data, val_data = torch.utils.data.random_split(
        dataset, [len(dataset) - n_val, n_val]
    )
    loader_train = DataLoader(train_data, batch_size=64, shuffle=True)
    loader_val = DataLoader(val_data, batch_size=len(val_data), shuffle=True)
    # Encoder layout: input -> hidden (2 * size * n_features) -> 1-D bottleneck.
    units = [num_superfeatures, 2 * size * num_superfeatures, 1]
    encoder = MLP(
        units,
        nonlinearity=torch.nn.ReLU,
        output_nonlinearity=torch.nn.Sigmoid,
        initial_batchnorm=False,
    )
    decoder = MLP(units[::-1], nonlinearity=torch.nn.ReLU, initial_batchnorm=False)
    tae = TAE(encoder, decoder, learning_rate=1e-3)
    tae.fit(loader_train, n_epochs=150, validation_loader=loader_val)
    tae_model = tae.fetch_model()
    proj = tae_model.transform(time_ser)
    if plotting:
        plot_tae_training(tae_model=tae, prefix=prefix, save_path=save_path)
        plot_tae_transform(proj=proj, prefix=prefix, save_path=save_path)
    # The clustering call was previously duplicated verbatim in both the
    # plotting and non-plotting branches; it runs regardless of `plotting`.
    # NOTE(review): plotting=True is hard-coded here (as in the original),
    # so the callback plots even when this function's plotting is False.
    if cluster:
        clustering(
            proj=proj,
            prefix=prefix,
            save_path=save_path,
            num_cluster=num_cluster,
            tol=tol,
            plotting=True,
        )
    return proj
def plot_tae_training(tae_model, prefix=None, save_path=None):
    """Plot the train/validation loss curves of the TAE training (log y-axis).

    Args:
        tae_model: Fitted TAE estimator exposing ``train_losses`` and
            ``validation_losses`` as (step, loss) arrays.
        prefix (str, optional): Name prefix for the output file. Defaults to None.
        save_path (str, optional): Path to output folder. Defaults to None.

    Returns:
        None
    """
    plt.clf()
    plt.cla()
    plt.tight_layout()
    name = "_tae_training.png"
    file_name = make_name(prefix=prefix, name=name, dir=save_path)
    plt.semilogy(*tae_model.train_losses.T, label="train")
    plt.semilogy(*tae_model.validation_losses.T, label="validation")
    mpl.rcParams["font.size"] = "50"
    plt.rc("axes", labelsize=50)
    plt.xlabel("Training Step", fontsize="50")
    plt.xticks(fontsize="50")
    plt.ylabel("Loss", fontsize="50")
    plt.yticks(fontsize="50")
    plt.legend(fontsize="50")
    plt.savefig(file_name, dpi=300, bbox_inches="tight")
    return None
def plot_scores_kmeans(metric, prefix=None, save_path=None):
    """Plot the K-Means scores over the number of clusters (k >= 2).

    NOTE(review): this definition is shadowed by a later, more general
    ``plot_scores_kmeans`` further down in this module, so it is dead code
    at import time.

    Args:
        metric: Array of scores indexed by cluster count; entries below
            index 2 are skipped.
        prefix (str, optional): Name prefix for the output file. Defaults to None.
        save_path (str, optional): Path to output folder. Defaults to None.

    Returns:
        None
    """
    plt.clf()
    plt.cla()
    mpl.rcParams["font.size"] = "50"
    plt.tight_layout()
    file_name = make_name(
        prefix=prefix, name="_scores_kmeans.png", dir=save_path
    )
    plt.xlabel("Number of cluster")
    plt.ylabel("Euclidean Norm $l^2$")
    cluster_counts = np.arange(2, len(metric), 1)
    values = metric[2:]
    plt.plot(cluster_counts, values)
    plt.scatter(cluster_counts, values)
    plt.savefig(file_name, dpi=300, bbox_inches="tight")
    return None
def plot_ellbow_kmeans(metric, prefix=None, save_path=None):
    """Plot the K-Means inertia over the number of clusters (elbow method).

    NOTE(review): this definition is shadowed by a later, more general
    ``plot_ellbow_kmeans`` further down in this module, so it is dead code
    at import time.

    Args:
        metric: Array of summed squared distances indexed by cluster count;
            entries below index 2 are skipped.
        prefix (str, optional): Name prefix for the output file. Defaults to None.
        save_path (str, optional): Path to output folder. Defaults to None.

    Returns:
        None
    """
    plt.clf()
    plt.cla()
    name = "_ellbow_kMeans.png"
    mpl.rcParams["font.size"] = "50"
    file_name = make_name(prefix=prefix, name=name, dir=save_path)
    plt.xlabel("Number of cluster")
    plt.ylabel("Sum of squared distances $R$")
    cluster_counts = np.arange(2, len(metric), 1)
    values = metric[2:]
    plt.plot(cluster_counts, values)
    plt.scatter(cluster_counts, values)
    plt.savefig(file_name, dpi=300, bbox_inches="tight")
    return None
def smooth_projection_k_means(arr, num_cluster):
    """Discretize ``arr`` into ``num_cluster`` states via K-Means labels.

    NOTE(review): this definition is shadowed by an identical later
    definition in this module, so it is dead code at import time.

    Args:
        arr: 2-D array-like accepted by ``sklearn.cluster.KMeans``.
        num_cluster (int): Number of clusters to fit.

    Returns:
        Array of integer cluster labels, one per row of ``arr``.
    """
    return KMeans(n_clusters=num_cluster).fit(arr).labels_
def plot_scores_kmeans(metric, prefix=None, save_path=None, name="_scores_kmeans.png"):
    """Plot the K-Means scores over the number of clusters (k >= 2).

    Args:
        metric: Array of scores indexed by cluster count; entries below
            index 2 are skipped.
        prefix (str, optional): Name prefix for the output file. Defaults to None.
        save_path (str, optional): Path to output folder. Defaults to None.
        name (str, optional): Output file name suffix.
            Defaults to "_scores_kmeans.png".

    Returns:
        None
    """
    plt.clf()
    plt.cla()
    file_name = make_name(prefix=prefix, name=name, dir=save_path)
    plt.xlabel("Number of cluster")
    plt.ylabel("Euclidean Norm $l^2$")
    plt.plot(np.arange(2, len(metric), 1), metric[2:])
    plt.scatter(np.arange(2, len(metric), 1), metric[2:])
    plt.savefig(file_name, dpi=300, bbox_inches="tight")
    return None
def plot_ellbow_kmeans(
    metric,
    prefix=None,
    save_path=None,
    name="_ellbow_kMeans.png",
    ylabel="Sum of squared distances $R$",
):
    """Plot a per-cluster-count metric for the visual elbow method.

    Also reused for GMM BIC/AIC curves via the ``name``/``ylabel`` parameters.

    Args:
        metric: Array of metric values indexed by cluster count; entries
            below index 2 are skipped.
        prefix (str, optional): Name prefix for the output file. Defaults to None.
        save_path (str, optional): Path to output folder. Defaults to None.
        name (str, optional): Output file name suffix.
            Defaults to "_ellbow_kMeans.png".
        ylabel (str, optional): Y-axis label.
            Defaults to "Sum of squared distances $R$".

    Returns:
        None
    """
    plt.clf()
    plt.cla()
    file_name = make_name(prefix=prefix, name=name, dir=save_path)
    plt.xlabel("Number of cluster")
    plt.ylabel(ylabel)
    plt.plot(np.arange(2, len(metric), 1), metric[2:])
    plt.scatter(np.arange(2, len(metric), 1), metric[2:])
    plt.savefig(file_name, dpi=300, bbox_inches="tight")
    print("Saved", file_name)
    return None
def smooth_projection_k_means(arr, num_cluster):
    """Discretize ``arr`` into ``num_cluster`` states via K-Means labels.

    Args:
        arr: 2-D array-like accepted by ``sklearn.cluster.KMeans``.
        num_cluster (int): Number of clusters to fit.

    Returns:
        Array of integer cluster labels, one per row of ``arr``.
    """
    clf = KMeans(n_clusters=num_cluster).fit(arr)
    return clf.labels_
def find_states_gaussian(proj, prefix, save_path, num_cluster=15, tol=0.01):
    """Cluster the projection with Gaussian mixtures to pick MSM state counts.

    Fits a GaussianMixture for every component count k = 2 .. num_cluster - 1
    and records the BIC and AIC so an optimal number of states can be read
    off the plotted curves.

    Args:
        proj: Low-dimensional projection of the trajectory (2-D array-like
            accepted by ``sklearn.mixture.GaussianMixture``).
        prefix (str): Prefix for the output plot file names.
        save_path (str): Path to the output folder for the plots.
        num_cluster (int, optional): Upper bound (exclusive) on the number of
            components tried. Defaults to 15.
        tol (float, optional): Unused here; kept for a uniform clustering
            interface with the other discretizers. Defaults to 0.01.

    Returns:
        list: ``[bic, aic]`` — arrays of length ``num_cluster`` whose
        entries 0 and 1 stay zero (no fit for k < 2).
    """
    bic = np.zeros(num_cluster)
    aic = np.zeros(num_cluster)
    print("finding ", num_cluster, " states")
    for i in range(2, num_cluster):
        clf = GaussianMixture(n_components=i).fit(proj)
        bic[i] = clf.bic(proj)
        aic[i] = clf.aic(proj)
    # Reuse the elbow plotter for the information criteria curves.
    plot_ellbow_kmeans(
        metric=bic,
        prefix=prefix,
        save_path=save_path,
        name="_bic_gmm_.png",
        ylabel="BIC",
    )
    plot_ellbow_kmeans(
        metric=aic,
        prefix=prefix,
        save_path=save_path,
        name="_aic_gmm_.png",
        ylabel="AIC",
    )
    return [bic, aic]
def smooth_projection_gaussian(arr, num_cluster):
    """Discretize ``arr`` into ``num_cluster`` states with a Gaussian mixture.

    Args:
        arr: 2-D array-like accepted by ``sklearn.mixture.GaussianMixture``.
        num_cluster (int): Number of mixture components to fit.

    Returns:
        Array of integer component labels, one per row of ``arr``.
    """
    labels = GaussianMixture(n_components=num_cluster).fit_predict(arr)
    return labels