From 60079694b7dccc1a26c8bf020bcf141daf119c2e Mon Sep 17 00:00:00 2001
From: Javier Pineda
Date: Tue, 7 Jan 2025 16:50:34 -0600
Subject: [PATCH 1/2] set batch size to 100 for embedding fingerprints

---
 src/indexing/mod.rs | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs
index 036945c..ee3b243 100644
--- a/src/indexing/mod.rs
+++ b/src/indexing/mod.rs
@@ -138,18 +138,36 @@ pub fn batch_doc_creation(
         })
         .collect::>();
 
-    let mut morgan_fingerprints: Vec = Vec::with_capacity(mol_attributes.len());
-    let mut morgan_bitvecs: Vec> = Vec::with_capacity(mol_attributes.len());
+    let batch_size = 100;
+    let num_compounds = mol_attributes.len();
+    let num_batches = (num_compounds as f32 / batch_size as f32).ceil() as usize;
+    let mut similarity_clusters: Vec>> = Vec::with_capacity(num_batches);
+    let mut morgan_bitvecs: Vec> = Vec::with_capacity(batch_size);
+
     for attributes in &mol_attributes {
         let morgan_fp = attributes.morgan_fingerprint.clone();
-        morgan_fingerprints.push(morgan_fp.clone());
         morgan_bitvecs.push(morgan_fp.0);
+
+        if morgan_bitvecs.len() == batch_size {
+            let similarity_cluster_batch = encode_fingerprints(&morgan_bitvecs, true)
+                .map_err(|e| eyre::eyre!("Failed batched similarity cluster assignment: {e}"))?;
+
+            similarity_clusters.push(similarity_cluster_batch);
+            morgan_bitvecs.clear();
+        }
     }
 
-    let similarity_clusters = encode_fingerprints(&morgan_bitvecs, true)
-        .map_err(|e| eyre::eyre!("Failed batched similarity cluster assignment: {e}"))?;
+    if !morgan_bitvecs.is_empty() {
+        let similarity_cluster_batch = encode_fingerprints(&morgan_bitvecs, true)
+            .map_err(|e| eyre::eyre!("Failed batched similarity cluster assignment: {e}"))?;
 
-    let num_compounds = mol_attributes.len();
+        similarity_clusters.push(similarity_cluster_batch);
+    }
+
+    let similarity_clusters = similarity_clusters
+        .into_iter()
+        .flatten()
+        .collect::>>();
 
     let docs = (0..num_compounds)
         .into_par_iter()

From 89a9fc83a1d31c297eaff6b76f3986ebe9f3e3fb Mon Sep 17 00:00:00 2001
From: Javier Pineda
Date: Tue, 7 Jan 2025 17:15:41 -0600
Subject: [PATCH 2/2] actually, compromise and set batch size to 200 as this makes things a bit faster

---
 src/indexing/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs
index ee3b243..51626ec 100644
--- a/src/indexing/mod.rs
+++ b/src/indexing/mod.rs
@@ -138,7 +138,7 @@ pub fn batch_doc_creation(
         })
         .collect::>();
 
-    let batch_size = 100;
+    let batch_size = 200;
     let num_compounds = mol_attributes.len();
     let num_batches = (num_compounds as f32 / batch_size as f32).ceil() as usize;
     let mut similarity_clusters: Vec>> = Vec::with_capacity(num_batches);