diff --git a/src/haddock/clis/re/clustfcc.py b/src/haddock/clis/re/clustfcc.py index 993479f735..36fb5186a7 100644 --- a/src/haddock/clis/re/clustfcc.py +++ b/src/haddock/clis/re/clustfcc.py @@ -1,9 +1,7 @@ """haddock3-re clustfcc subcommand.""" -from pathlib import Path import shutil -from fcc.scripts import cluster_fcc - +from pathlib import Path from haddock import log from haddock.core.defaults import INTERACTIVE_RE_SUFFIX @@ -17,8 +15,9 @@ rank_clusters, write_structure_list, ) -from haddock.libs.libontology import ModuleIO +from haddock.libs.libfcc import read_matrix from haddock.libs.libinteractive import look_for_capri, rewrite_capri_tables +from haddock.libs.libontology import ModuleIO from haddock.modules.analysis.clustfcc.clustfcc import ( get_cluster_centers, iterate_clustering, @@ -32,7 +31,7 @@ def add_clustfcc_arguments(clustfcc_subcommand): clustfcc_subcommand.add_argument( "clustfcc_dir", help="The clustfcc directory to recluster.", - ) + ) clustfcc_subcommand.add_argument( "-f", @@ -40,7 +39,7 @@ def add_clustfcc_arguments(clustfcc_subcommand): help="Minimum fraction of common contacts to be considered in a cluster.", # noqa: E501 required=False, type=float, - ) + ) clustfcc_subcommand.add_argument( "-s", @@ -48,56 +47,56 @@ def add_clustfcc_arguments(clustfcc_subcommand): help="Strictness factor.", required=False, type=float, - ) - + ) + clustfcc_subcommand.add_argument( "-t", "--min_population", help="Clustering population threshold.", required=False, type=int, - ) - + ) + clustfcc_subcommand.add_argument( "-p", "--plot_matrix", help="Generate the matrix plot with the clusters.", required=False, default=False, - action='store_true', - ) - + action="store_true", + ) + return clustfcc_subcommand def reclustfcc( - clustfcc_dir: str, - clust_cutoff: Union[bool, float] = None, - strictness: Union[bool, float] = None, - min_population: Union[bool, int] = None, - plot_matrix : bool = True, - ) -> Path: + clustfcc_dir: str, + clust_cutoff: Union[bool, float] = None, + strictness: Union[bool, float] = None, + min_population: Union[bool, int] = None, + plot_matrix: bool = True, +) -> Path: """ Recluster the models in the clustfcc directory. - + Parameters ---------- clustfcc_dir : str Path to the clustfcc directory. - + clust_cutoff : Union[bool, float] Fraction of common contacts to not be considered a singleton model. - + strictness : Union[bool, float] Fraction of common contacts to be considered to be part of the same cluster. - + min_population : Union[bool, int] Minimum cluster population. - + plot_matrix : bool Should the corresponding matrix plot be generated. - + Returns ------- outdir : Path @@ -121,8 +120,8 @@ def reclustfcc( # load the original clustering parameters via json clustfcc_params = read_config(Path(clustfcc_dir, "params.cfg")) - key = list(clustfcc_params['final_cfg'].keys())[0] - clustfcc_params = clustfcc_params['final_cfg'][key] + key = list(clustfcc_params["final_cfg"].keys())[0] + clustfcc_params = clustfcc_params["final_cfg"][key] log.info(f"Previous clustering parameters: {clustfcc_params}") # adjust the parameters @@ -135,18 +134,17 @@ def reclustfcc( clustfcc_params["plot_matrix"] = plot_matrix # load the fcc matrix - pool = cluster_fcc.read_matrix( + pool = read_matrix( Path(clustfcc_dir, "fcc.matrix"), - clustfcc_params['clust_cutoff'], - clustfcc_params['strictness'], - ) - + clustfcc_params["clust_cutoff"], + clustfcc_params["strictness"], + ) + # iterate clustering until at least one cluster is found clusters, min_population = iterate_clustering( - pool, - clustfcc_params['min_population'] - ) - clustfcc_params['min_population'] = min_population + pool, clustfcc_params["min_population"] + ) + clustfcc_params["min_population"] = min_population log.info(f"Updated clustering parameters: {clustfcc_params}") # Prepare output and read the elements @@ -160,20 +158,20 @@ def reclustfcc( _score_dic, sorted_score_dic = rank_clusters(clt_dic, min_population) output_models = add_cluster_info(sorted_score_dic, clt_dic) - + # Write unclustered structures - write_structure_list(models, - output_models, - out_fname=Path(outdir, "clustfcc.tsv")) - + write_structure_list( + models, output_models, out_fname=Path(outdir, "clustfcc.tsv") + ) + write_clustfcc_file( clusters, clt_centers, clt_dic, clustfcc_params, sorted_score_dic, - output_fname=Path(outdir, 'clustfcc.txt') - ) + output_fname=Path(outdir, "clustfcc.txt"), + ) save_config(clustfcc_params, Path(outdir, "params.cfg")) @@ -183,7 +181,7 @@ def reclustfcc( if caprieval_folder: log.info("Rewriting capri tables") rewrite_capri_tables(caprieval_folder, clt_dic, outdir) - + else: output_models = models @@ -194,25 +192,23 @@ def reclustfcc( final_order_idx, labels, cluster_ids = [], [], [] for pdb in output_models: final_order_idx.append(models.index(pdb)) - labels.append(pdb.file_name.replace('.pdb', '')) + labels.append(pdb.file_name.replace(".pdb", "")) cluster_ids.append(pdb.clt_id) # Get custom cluster data - matrix_cluster_dt, cluster_limits = get_cluster_matrix_plot_clt_dt( - cluster_ids - ) + matrix_cluster_dt, cluster_limits = get_cluster_matrix_plot_clt_dt(cluster_ids) # Define output filename - html_matrix_basepath = Path(outdir, 'fcc_matrix') + html_matrix_basepath = Path(outdir, "fcc_matrix") # Plot matrix html_matrixpath = plot_cluster_matrix( Path(clustfcc_dir, "fcc.matrix"), final_order_idx, labels, - dttype='FCC', + dttype="FCC", diag_fill=1, output_fname=html_matrix_basepath, matrix_cluster_dt=matrix_cluster_dt, cluster_limits=cluster_limits, - ) + ) log.info(f"Plotting matrix in {html_matrixpath}") - + return outdir diff --git a/src/haddock/libs/libfcc.py b/src/haddock/libs/libfcc.py new file mode 100644 index 0000000000..73972f52b9 --- /dev/null +++ b/src/haddock/libs/libfcc.py @@ -0,0 +1,233 @@ +"""FCC related functions + +NOTE: This functions were ported directly from `https://github.com/haddocking/fcc`! +""" + + +class Element: + """Defines a 'clusterable' Element""" + + __slots__ = ["name", "cluster", "neighbors"] + + def __init__(self, name): + self.name = name + self.cluster = 0 + self.neighbors = set() + + def add_neighbor(self, neighbor): + """Adds another element to the neighbor list""" + self.neighbors.add(neighbor) + + def assign_cluster(self, clust_id): + """Assigns the Element to Cluster. 0 if unclustered""" + self.cluster = clust_id + + +class Cluster: + """Defines a Cluster. A Cluster is created with a name and a center (Element class)""" + + __slots__ = ["name", "center", "members"] + + def __init__(self, name, center): + self.name = name + self.center = center + + self.members = [] + + self.populate() + + def __len__(self): + return len(self.members) + 1 # +1 Center + + def populate(self): + """ + Populates the Cluster member list through the + neighbor list of its center. + """ + + name = self.name + # Assign center + ctr = self.center + ctr.assign_cluster(name) + + mlist = self.members + # Assign members + ctr_nlist = (n for n in ctr.neighbors if not n.cluster) + for e in ctr_nlist: + mlist.append(e) + e.assign_cluster(name) + + def add_member(self, element): + """ + Adds one single element to the cluster. + """ + line = self.members + line.append(element) + element.assign_cluster(self.name) + + +def cluster_elements(e_pool, threshold): + """ + Groups Elements within a given threshold + together in the same cluster. + """ + + cluster_list = [] + threshold -= 1 # Account for center + ep = e_pool + cn = 1 # Cluster Number + while 1: + # Clusterable elements + ce = [e for e in ep if not ep[e].cluster] + if not ce: # No more elements to cluster + break + + # Select Cluster Center + # Element with largest neighbor list + ctr_nlist, ctr = sorted( + [(len([se for se in ep[e].neighbors if not se.cluster]), e) for e in ce] + )[-1] + + # Cluster until length of remaining elements lists are above threshold + if ctr_nlist < threshold: + break + + # Create Cluster + c = Cluster(cn, ep[ctr]) + cn += 1 + cluster_list.append(c) + + return ep, cluster_list + + +def output_clusters(handle, cluster): + """Outputs the cluster name, center, and members.""" + + write = handle.write + + for c in cluster: + write("Cluster %s -> %s " % (c.name, c.center.name)) + for m in sorted(c.members, key=lambda k: k.name): + write("%s " % m.name) + write("\n") + + +def read_matrix(path, cutoff_param, strictness): + """ + Reads in a four column matrix (1 2 0.123 0.456\n) + and creates an dictionary of Elements. + + The strictness factor is a that multiplies by the cutoff + to produce a new cutoff for the second half of the matrix. Used to + allow some variability while keeping very small interfaces from clustering + with anything remotely similar. + """ + + cutoff_param = float(cutoff_param) + partner_cutoff = float(cutoff_param) * float(strictness) + + elements = {} + + f = open(path, "r") + for line in f: + ref, mobi, d_rm, d_mr = line.split() + ref = int(ref) + mobi = int(mobi) + d_rm = float(d_rm) + d_mr = float(d_mr) + + # Create or Retrieve Elements + if ref not in elements: + r = Element(ref) + elements[ref] = r + else: + r = elements[ref] + + if mobi not in elements: + m = Element(mobi) + elements[mobi] = m + else: + m = elements[mobi] + + # Assign neighbors + if d_rm >= cutoff_param and d_mr >= partner_cutoff: + r.add_neighbor(m) + if d_mr >= cutoff_param and d_rm >= partner_cutoff: + m.add_neighbor(r) + + f.close() + + return elements + + +def parse_contact_file(f_list, ignore_chain): + """Parses a list of contact files.""" + + if ignore_chain: + contacts = [ + [int(line[0:5] + line[6:-1]) for line in open(con_f)] + for con_f in f_list + if con_f.strip() + ] + else: + contacts = [ + set([int(line) for line in open(con_f)]) + for con_f in f_list + if con_f.strip() + ] + + return contacts + + +def calculate_fcc(list_a, list_b): + """ + Calculates the fraction of common elements between two lists + taking into account chain IDs + """ + + cc = len(list_a.intersection(list_b)) + cc_v = len(list_b.intersection(list_a)) + + return cc, cc_v + + +def calculate_fcc_nc(list_a, list_b): + """ + Calculates the fraction of common elements between two lists + not taking into account chain IDs. Much Slower. + """ + + largest, smallest = sorted([list_a, list_b], key=len) + ncommon = len([ele for ele in largest if ele in smallest]) + return ncommon, ncommon + + +def calculate_pairwise_matrix(contacts, ignore_chain): + """Calculates a matrix of pairwise fraction of common contacts (FCC). + Outputs numeric indexes. + + contacts: list_of_unique_pairs_of_residues [set/list] + + Returns pairwise matrix as an iterator, each entry in the form: + FCC(cplx_1/cplx_2) FCC(cplx_2/cplx_1) + """ + + contact_lengths = [] + for con in contacts: + try: + ic = 1.0 / len(con) + except ZeroDivisionError: + ic = 0 + contact_lengths.append(ic) + + if ignore_chain: + calc_fcc = calculate_fcc_nc + else: + calc_fcc = calculate_fcc + + for i in range(len(contacts)): + + for k in range(i + 1, len(contacts)): + cc, cc_v = calc_fcc(contacts[i], contacts[k]) + fcc, fcc_v = cc * contact_lengths[i], cc * contact_lengths[k] + yield i + 1, k + 1, fcc, fcc_v diff --git a/src/haddock/modules/analysis/clustfcc/__init__.py b/src/haddock/modules/analysis/clustfcc/__init__.py index d6f179390b..a97819e5b5 100644 --- a/src/haddock/modules/analysis/clustfcc/__init__.py +++ b/src/haddock/modules/analysis/clustfcc/__init__.py @@ -10,8 +10,6 @@ import os from pathlib import Path -from fcc.scripts import calc_fcc_matrix, cluster_fcc - from haddock import FCC_path, log from haddock.core.defaults import MODULE_DEFAULT_YAML from haddock.core.typing import Union @@ -22,6 +20,11 @@ rank_clusters, write_structure_list, ) +from haddock.libs.libfcc import ( + calculate_pairwise_matrix, + parse_contact_file, + read_matrix, + ) from haddock.libs.libsubprocess import JobInputFirst from haddock.modules import BaseHaddockModule, get_engine, read_from_yaml_config from haddock.modules.analysis import get_analysis_exec_mode @@ -108,13 +111,13 @@ def _run(self) -> None: self.finish_with_error("Several files were not generated:" f" {not_found}") log.info("Calculating the FCC matrix") - parsed_contacts = calc_fcc_matrix.parse_contact_file( + parsed_contacts = parse_contact_file( contact_file_l, False, ) # Imporant: matrix is a generator object, be careful with it - matrix = calc_fcc_matrix.calculate_pairwise_matrix( + matrix = calculate_pairwise_matrix( parsed_contacts, False, ) @@ -130,7 +133,7 @@ def _run(self) -> None: # Cluster log.info("Clustering...") - pool = cluster_fcc.read_matrix( + pool = read_matrix( fcc_matrix_f, self.params["clust_cutoff"], self.params["strictness"], diff --git a/src/haddock/modules/analysis/clustfcc/clustfcc.py b/src/haddock/modules/analysis/clustfcc/clustfcc.py index 1314c97e2b..4cee2743a5 100644 --- a/src/haddock/modules/analysis/clustfcc/clustfcc.py +++ b/src/haddock/modules/analysis/clustfcc/clustfcc.py @@ -4,9 +4,9 @@ from pathlib import Path import numpy as np -from fcc.scripts import cluster_fcc from haddock import log +from haddock.libs.libfcc import cluster_elements, output_clusters def iterate_clustering(pool, min_population_param): @@ -33,7 +33,7 @@ def iterate_clustering(pool, min_population_param): while not cluster_check: for min_population in range(min_population_param, 0, -1): log.info(f"Clustering with min_population={min_population}") - _, clusters = cluster_fcc.cluster_elements( + _, clusters = cluster_elements( pool, threshold=min_population, ) @@ -70,7 +70,7 @@ def write_clusters(clusters, out_filename="cluster.out"): log.info(f"Saving output to {out_filename}") cluster_out = Path(out_filename) with open(cluster_out, "w") as fh: - cluster_fcc.output_clusters(fh, clusters) + output_clusters(fh, clusters) fh.close()