diff --git a/.gitignore b/.gitignore
index 0518e12..2f3ce2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@
 # Python egg metadata, regenerated from source files by setuptools.
 /*.egg-info
 /*.egg
+/build/
diff --git a/scripts/sgdml_error_flattened_model.py b/scripts/sgdml_error_flattened_model.py
new file mode 100644
index 0000000..33bf088
--- /dev/null
+++ b/scripts/sgdml_error_flattened_model.py
@@ -0,0 +1,953 @@
+#!/usr/bin/python
+
+# MIT License
+#
+# Copyright (c) 2018 Stefan Chmiela
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import print_function
+
+import argparse
+import os
+import re
+import shutil
+import sys
+import time
+from functools import partial
+
+import numpy as np
+
+from sgdml import __version__
+from sgdml.predict import GDMLPredict
+from sgdml.train import GDMLTrain
+from sgdml.utils import io, ui
+from sgdml.cli import _print_splash
+
+from sgdml import cli
+
+from sklearn.cluster import KMeans
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics.pairwise import euclidean_distances
+
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+PACKAGE_NAME = 'sgdml'
+
+def dataset_r_to_desc(dataset):
+    '''Convert positions to the descriptor.
+
+    The descriptor is taken from sgdml (desc.r_to_desc).
+
+    Arguments:
+        dataset {npz} -- Dataset given by the user
+
+    Returns:
+        numpy array -- Descriptor for all samples in the given dataset
+    '''
+
+    from sgdml.utils import desc
+
+    R=dataset['R']
+    R_new=[]
+
+    for r_i in R:
+        pd=desc.pdist(r_i)  # pairwise distances of this geometry
+        R_new.append(desc.r_to_desc(r_i,pd))
+
+    return np.array(R_new)
+
+def dataset_extract_E(dataset):
+    return dataset['E']
+
+def dataset_extract_F_concat(dataset):
+    F=dataset['F']
+    n_samples,n_atoms,n_dim=F.shape
+    return np.reshape(F,(n_samples,n_atoms*n_dim))
+
+def dataset_extract_R_concat(dataset):
+    R=dataset['R']
+    n_samples,n_atoms,n_dim=R.shape
+    return np.reshape(R,(n_samples,n_atoms*n_dim))
+
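+# Editorial sketch (illustrative only, not called anywhere in this script): for
+# a single geometry r of shape (n_atoms, 3), the descriptor computed above is
+# essentially the vector of inverse pairwise distances, roughly equivalent to:
+def _example_descriptor_single(r):
+    from scipy.spatial.distance import pdist
+    return 1.0/pdist(r)  # condensed inverse-distance descriptor
+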
+parameters={
+    # These parameters control the settings for this script and serve
+    # as a way to easily adjust the algorithm to one's needs.
+    # When the input needed is 'func', it requires a string that
+    # corresponds to a function name in this file.
+
+    'var_funcs':{
+        # Functions to be called when loading the dataset.
+        # This is the time to compute e.g. descriptors.
+        # Entries are {index}:{func};
+        # the input given to each function is the dataset as provided by the user.
+        0:'dataset_r_to_desc',
+        1:'dataset_extract_E',
+        2:'dataset_extract_F_concat',
+        3:'dataset_extract_R_concat',
+    },
+
+    'clusters':{
+        # Information about clustering schemes
+        'init_cluster':[0,1], # array of indices*, defines the sequence of initial clustering algorithms for the entire dataset
+        'recluster':[2],      # array of indices*, defines the sequence of reclustering algorithms for learning subsets
+        # *indices (int): correspond to the clustering algorithms defined below
+
+        0:{
+            'type':'agglomerative_clustering', # func; input: initial dataset, subset indices, clustering index (as defined here); output: cluster indices
+            'n_clusters':10,
+            'initial_number':10000, # agglo. cl. first clusters a subset of {initial_number} points, then fills the clusters with the rest of the data
+            'distance_matrix_function':'distance_matrix_euclidean', # func; input: sample indices; output: pairwise distance matrix
+            'linkage':'complete', # string, defines linkage type
+            'cluster_choice_criterion':'smallest_max_distance_euclidean', # func; input: sample, clusters (in descriptor space); output: index of the cluster the sample belongs to
+            'var_index':0, # int: which descriptor from var_funcs to use for clustering
+        },
+
+        1:{
+            'type':'kmeans_clustering',
+            'n_clusters':5,
+            'var_index':1,
+        },
+
+        2:{
+            'type':'agglomerative_clustering',
+            'n_clusters':'auto',
+            #'n_clusters_args':[400],
+            'initial_number':10000, # values <= 1 are interpreted as a fraction of the subset (1 means the entire subset)
+            'distance_matrix_function':'distance_matrix_euclidean',
+            'linkage':'complete',
+            'cluster_choice_criterion':'smallest_max_distance_euclidean',
+            'var_index':0,
+        },
+    }, # end of 'clusters'
+
+    'predict_error':{
+        # Defines how to calculate prediction errors.
+        'predict':'forces', # string: 'forces' or 'energies'
+        'input_var_index':3, # int, which variable from var_funcs to use as input for the prediction function (i.e. R_concat)
+        'comparison_var_index':2, # int, what to compare the prediction to (i.e. forces)
+        'error_func':'mean_squared_error_sample_wise', # func; input: real values, predicted values; output: array of sample-wise errors
+    }, # end of 'predict_error'
+
+    'generate_subset':{
+        'func':'cluster_above_mse', # func; input: iterator, args below; output: array of all indices in the chosen clusters
+        'args':[1],
+    }, # end of 'generate_subset'
+
+    'generate_training_data':{
+        'func':'within_cluster_weighted_err_N', # func; input: iterator, number of points N; output: array of indices (length N)
+    },
+
+} # end of parameters
+
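+# Example (hypothetical values, for illustration only): to replace the two-stage
+# initial clustering above with a single 20-cluster KMeans pass over the raw
+# energies, one could set
+#
+#   parameters['clusters']['init_cluster'] = [1]
+#   parameters['clusters'][1]['n_clusters'] = 20
+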
+def smallest_max_distance_euclidean(sample,clusters):
+    '''
+    Finds the cluster that the given sample belongs to, using squared euclidean
+    distance. (The metric used should be the same as for the agglomerative
+    clustering.)
+
+    Parameters:
+        sample:
+            numpy array containing the positions of the atoms of one sample
+            Dimensions: (n_atoms, n_dimensions)
+
+        clusters:
+            numpy array containing the positions within each cluster
+            Dimensions: (n_clusters, n_atoms, n_dimensions)
+
+    Returns:
+        index of the cluster that the sample belongs to / closest cluster
+    '''
+
+    g=np.zeros(len(clusters))
+    for i in range(len(clusters)):
+        g[i]=np.max(np.sum(np.square(clusters[i]-sample),1)) # numpy broadcasting: clusters[i]-sample element-wise for each member
+    return np.argmin(g)
+
+def distance_matrix_euclidean(data):
+    return euclidean_distances(data,data)
+
+cluster_funcs={}
+
+def find_function(s):
+    s=str(s)
+    if s in globals() and callable(globals()[s]):
+        return globals()[s]
+
+    sys.exit(
+        ui.fail_str('[FAIL]')
+        + f' Could not find function of name: {s}'
+    )
+
+def agglomerative_clustering(data_set,indices,clustering_index):
+
+    para=data_set.para['clusters'][clustering_index]
+    f_name=para["distance_matrix_function"]
+    matrix_f=find_function(f_name)
+
+    data,N,ind_cluster,cluster_data=None,None,[],[]
+    n_clusters=para['n_clusters']
+
+    ## generate data
+    if not (indices is None):
+        data=data_set.vars[para['var_index']][indices]
+    else:
+        data=data_set.vars[para['var_index']]
+
+    ## get the number of points to cluster initially
+    if para["initial_number"]>1:
+        N=para["initial_number"]
+    else:
+        N=int(len(data)*para["initial_number"])
+
+    ui.progr_toggle(False,"Agglomerative clustering",f"(initial subset, {N} points)")
+
+    ## prepare agglomerative vars
+    ind_all=np.arange(len(data))
+    ind_init=np.random.permutation(ind_all)[:N]
+    data_init=data[ind_init]
+    ind_rest=np.delete(ind_all,ind_init)
+    data_rest=data[ind_rest]
+    M=matrix_f(data_init)
+
+    cinit_labels=AgglomerativeClustering(affinity="precomputed",n_clusters=n_clusters,linkage=para.get('linkage','complete')).fit_predict(M)
+
+    ui.progr_toggle(True,"Agglomerative clustering",f"(initial subset, {N} points)")
+
+    cluster_ind=[]
+    for i in range(n_clusters):
+        ind=np.concatenate(np.argwhere(cinit_labels==i))
+
+        # convert back to the initial set of indices
+        ind=ind_init[ind]
+
+        cluster_ind.append(ind.tolist())
+        cluster_data.append(np.array(data[cluster_ind[i]]))
+
+    # divide the rest of the data into the clusters,
+    # using para['cluster_choice_criterion']
+    choice_func=find_function(para["cluster_choice_criterion"])
+
+    len_data=len(data_rest)
+    for i in range(len_data):
+        c=choice_func(data_rest[i],cluster_data) # c is the cluster the sample belongs to
+        cluster_ind[c].append(ind_rest[i])
+        ui.progr_bar(i,len_data,"Agglomerative clustering",f"(rest of data, {len_data} points)")
+
+    ui.progr_bar(len_data,len_data,"Agglomerative clustering",f"(rest of data, {len_data} points)")
+
+    if indices is None:
+        return cluster_ind
+
+    # if needed, map the indices of every cluster back to the original data set
+    for cl in cluster_ind:
+        for i in range(len(cl)):
+            cl[i]=indices[cl[i]]
+    ui.progr_toggle(True,"Agglomerative clustering",None)
+    return cluster_ind
+
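+# Editorial sketch (toy data, not part of the pipeline): the fill-in loop above
+# assigns each remaining sample to the cluster with the smallest maximum squared
+# distance to the cluster's members, e.g.
+#
+#   clusters=[np.zeros((3,2)),np.ones((3,2))]
+#   smallest_max_distance_euclidean(np.full(2,0.9),clusters)  # -> 1
+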
+def kmeans_clustering(data_base,indices,clustering_index):
+
+    para=data_base.para['clusters'][clustering_index]
+    data,n_clusters,cluster_ind=None,para["n_clusters"],[]
+    if not (indices is None):
+        data=data_base.vars[para["var_index"]][indices]
+    else:
+        data=data_base.vars[para["var_index"]]
+
+    # len(data) works whether or not a subset was given
+    ui.progr_toggle(False,f"KMeans clustering ({len(data)} points)",None)
+
+    cluster_labels=KMeans(n_clusters=n_clusters,init="k-means++").fit_predict(data)
+
+    for i in range(n_clusters):
+        ind=np.concatenate(np.argwhere(cluster_labels==i).tolist())
+        # these labels already refer to the initial set of indices, no conversion needed here
+        cluster_ind.append(ind)
+
+    if indices is None:
+        ui.progr_toggle(True,f"KMeans clustering ({len(data)} points)",None)
+        return cluster_ind
+
+    # if needed, map the indices of every cluster back to the original data set
+    for cl in cluster_ind:
+        for i in range(len(cl)):
+            cl[i]=indices[cl[i]]
+    ui.progr_toggle(True,f"KMeans clustering ({len(data)} points)",None)
+
+    return cluster_ind
+
+def worst_N_clusters(self,N,*args):
+    mse=self.cluster_err
+    cl_ind=self.cluster_indices
+    sorted_ind=np.argsort(mse)
+    clusters=np.array(cl_ind)[sorted_ind[-N:]]
+    ind=np.concatenate(clusters)
+    return ind
+
+def mean_squared_error_sample_wise(x,y):
+    err=(np.array(x)-np.array(y))**2
+    return err.mean(axis=1)
+
+def weighted_distribution(N,weights):
+    # distribute N points according to the (normalized) weights: hand out the
+    # integer parts first, then the remaining points to the largest remainders
+    weights=np.array(weights)/np.sum(weights)
+    a=(weights*N)
+    b=a.astype(int)
+    c=a-b
+    s=np.sum(b)
+
+    for i in range(N-s):
+        arg=np.argmax(c)
+        c[arg]=0
+        b[arg]=b[arg]+1
+
+    return b
+
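+# Worked example: weighted_distribution(5,[0.5,0.25,0.25]) first takes the
+# integer parts of 5*[0.5,0.25,0.25]=[2.5,1.25,1.25] -> [2,1,1], then hands the
+# one remaining point to the largest remainder, giving [3,1,1].
+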
+def within_cluster_weighted_err_N(self,N):
+
+    cl_ind,err=self.recluster_indices,self.sample_err
+
+    new_ind=[]
+
+    # find cluster errors and populations
+    mean_error=np.array([np.mean(err[x]) for x in cl_ind])
+    pop=np.array([len(x) for x in cl_ind])
+
+    weights=(mean_error/np.sum(mean_error))*(pop/np.sum(pop))
+    Ns=weighted_distribution(N,weights)
+
+    for i in range(len(cl_ind)):
+        ind=np.array(cl_ind[i])
+        cl_err=err[ind]
+        ni=Ns[i]
+        argmax=np.argsort(-cl_err)[:ni]
+        new_ind.extend(ind[argmax])
+
+    return new_ind
+
+def cluster_above_mse(self,fact,*args):
+    mse=np.array(self.cluster_err)
+    mmse=np.mean(mse)
+    cl_ind=self.cluster_indices
+    cl_ind_new=np.concatenate(np.argwhere(mse>mmse*fact))
+    clusters=np.array(cl_ind)[cl_ind_new]
+    ind=np.concatenate(clusters)
+    return ind
+
+class iterator():
+
+    def __init__(self,args,para):
+        self.args=args
+        self.para=para
+        self.n_steps=args['n_steps']
+
+        # step size
+        self.step_size=(self.args['n_train_final']-self.args['n_train_init'])//self.args['n_steps']
+
+        # change n_clusters='auto' parameters to 2x step_size
+        for k,v in para['clusters'].items():
+            if (type(v) is dict) and ('n_clusters' in v) and (v['n_clusters']=='auto'):
+                v['n_clusters']=int(2*self.step_size)
+
+        self.create_storage_dir()
+
+    current_stage=0
+    current_step=0
+    stages=[
+        'load_dataset',
+        'train_initial_model',
+        'cluster_dataset',
+        'stop_or_loop',
+        'compute_prediction_errors',
+        'generate_subset',
+        'recluster_subset',
+        'generate_training_data',
+        'train_step_model',
+        'stop_or_loop',
+    ]
+
+    def create_storage_dir(self):
+        name=os.path.basename(self.args['dataset'][0]).split('.')[0]
+
+        dir_id=0
+        basedir=os.path.join(os.getcwd(),f"{name}_i{self.args['n_train_init']}_{self.args['n_train_final']}_s{self.args['n_steps']}")
+        basedir_id=basedir
+        while os.path.exists(basedir_id):
+            dir_id=dir_id+1
+            basedir_id=f"{basedir}_{dir_id}"
+
+        os.mkdir(basedir_id)
+        self.dir=basedir_id
+
+    def go_to_stage(self,s):
+        for i in range(len(self.stages)):
+            if self.stages[i]==s:
+                self.current_stage=i
+                break
+
+    running=False
+
+    def run(self):
+        self.running=True
+
+        print(
+            ui.white_back_str(" INIT ")
+            +ui.white_bold_str(" Error-flattened model \n \n")
+            +f"Total steps: {self.n_steps:<6} \n"
+            +f"Initial model: {self.args['n_train_init']:<6} training points \n"
+            +f"Final model: {self.args['n_train_final']:<6} training points \n"
+            +f"Step size: {self.step_size:<6} training points"
+        )
+
+        while self.running:
+            self.proceed()
+
+        print(ui.green_back_str(' DONE ') + ' Error-flattening successfully completed.')
+        print(f' Latest model: {self.step_model_lastest_path}')
+        print(f' Training points: {len(self.model["idxs_train"])}')
+
+    def proceed(self):
+
+        if self.current_stage
[...]
+        metavar=('<s1>', '<s2>'),
+        dest='sigs',
+        type=io.parse_list_or_range,
+        help='integer list and/or range <start>:[<step>:]<stop> for the kernel hyper-parameter sigma',
+        nargs='+',
+    )
+
+    parser.add_argument(
+        '--task_dir',
+        metavar='<dir>',
+        dest='task_dir',
+        help='user-defined task output dir name',
+    )
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '--gdml',
+        action='store_true',
+        help='don\'t include symmetries in the model (GDML)',
+    )
+    group.add_argument(
+        '--cprsn',
+        dest='use_cprsn',
+        action='store_true',
+        help='compress kernel matrix along symmetric degrees of freedom',
+    )
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '--no_E',
+        dest='use_E',
+        action='store_false',
+        help='only reconstruct force field w/o potential energy surface',
+    )
+    group.add_argument(
+        '--E_cstr',
+        dest='use_E_cstr',
+        action='store_true',
+        help='include the energy constraints in the kernel',
+    )
+
+    parser.add_argument(
+        'n_test',
+        metavar='<n_test>',
+        type=io.is_strict_pos_int,
+        help='test sample size',
+        nargs='?',
+        default=None,
+    )
+
+    parser.add_argument(
+        '--model_file',
+        metavar='<file>',
+        dest='model_file',
+        help='user-defined model output file name',
+    )
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '--cg',
+        dest='use_cg',
+        action='store_true',
+        help='use iterative solver (conjugate gradient) with Nystroem preconditioner',
+        #help=argparse.SUPPRESS
+    )
+    group.add_argument(
+        '--fk',
+        dest='use_fk',
+        action='store_true',
+        help='use iterative solver (conjugate gradient) with Nystroem approximation',
+        #help=argparse.SUPPRESS
+    )
+
+    args = parser.parse_args()
+
+    # post-processing for the optional sig argument
+    if 'sigs' in args and args.sigs is not None:
+        args.sigs = np.hstack(
+            args.sigs
+        ).tolist()  # flatten list, if (part of it) was generated using the range syntax
+        args.sigs = sorted(list(set(args.sigs)))  # remove potential duplicates
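+
+    # Example (hypothetical invocation, for illustration): if the sigma values
+    # are passed as '10 10 25', args.sigs parses to [10, 10, 25] and the two
+    # lines above reduce it to the sorted, de-duplicated [10, 25].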
+
+    # post-processing for the optional model output file argument
+    if 'model_file' in args and args.model_file is not None:
+        if not args.model_file.endswith('.npz'):
+            args.model_file += '.npz'
+
+    _print_splash()
+
+    # check PyTorch GPU support
+    if 'use_torch' in args and args.use_torch:
+        try:
+            import torch
+        except ImportError:
+            pass
+        else:
+            if not torch.cuda.is_available():
+                print(
+                    ui.warn_str('\n[WARN]')
+                    + ' Your PyTorch installation does not support GPU computation!'
+                    + '\n We recommend running CPU calculations without \'--torch\' for improved performance.'
+                )
+
+    # replace solver flags with a keyword
+    args = vars(args)
+    args['solver'] = 'analytic'
+    if 'use_cg' in args and args['use_cg']:
+        args['solver'] = 'cg'
+    elif 'use_fk' in args and args['use_fk']:
+        args['solver'] = 'fk'
+    args.pop('use_cg', None)
+    args.pop('use_fk', None)
+
+    return args
+
+if __name__=='__main__':
+    args=parse_arguments()
+    it=iterator(args,parameters)
+    it.run()
diff --git a/setup.py b/setup.py
index 58ca1a6..03d1750 100644
--- a/setup.py
+++ b/setup.py
@@ -50,7 +50,7 @@ def get_property(property, package):
     author_email='sgdml@chmiela.com',
     license='LICENSE.txt',
     packages=find_packages(),
-    install_requires=['numpy >= 1.13.0', 'scipy >= 1.1.0', 'ase >= 3.16.2'],
+    install_requires=['numpy >= 1.13.0', 'scipy >= 1.1.0', 'ase >= 3.16.2', 'scikit-learn'],
     entry_points={
         'console_scripts': ['sgdml=sgdml.cli:main', 'sgdml-get=sgdml.get:main']
     },