From 03ba14eb9cbbf2738337c6c84e5d9f68aa2df973 Mon Sep 17 00:00:00 2001 From: Javier Pineda Date: Fri, 10 Jan 2025 15:52:18 -0600 Subject: [PATCH 1/5] index-sdf: make resetting the index optional (previously it was reset on every run) --- src/command_line/indexing/index_sdf.rs | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/command_line/indexing/index_sdf.rs b/src/command_line/indexing/index_sdf.rs index 123adcf..fb3600a 100644 --- a/src/command_line/indexing/index_sdf.rs +++ b/src/command_line/indexing/index_sdf.rs @@ -2,6 +2,7 @@ use crate::command_line::prelude::*; use rayon::prelude::*; use rdkit::{MolBlockIter, RWMol}; use std::sync::{Arc, Mutex}; +use tantivy::directory::MmapDirectory; pub const NAME: &str = "index-sdf"; @@ -35,6 +36,12 @@ pub fn command() -> Command { .short('c') .num_args(1), ) + .arg( + Arg::new("create-or-reset-index") + .required(false) + .long("create-or-reset-index") + .num_args(0), + ) .arg( Arg::new("commit") .required(false) @@ -52,6 +59,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { .ok_or(eyre::eyre!("Failed to extract index path"))?; let limit = matches.get_one::("limit"); let chunksize = matches.get_one::("chunk-size"); + let reset_index: bool = matches.get_flag("create-or-reset-index"); let commit: bool = matches.get_flag("commit"); let chunksize = if let Some(chunksize) = chunksize { @@ -67,14 +75,6 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { limit ); - let index_dir_metadata = std::fs::metadata(index_dir); - if let Ok(metadata) = index_dir_metadata { - if metadata.is_dir() { - std::fs::remove_dir_all(index_dir)?; - } - } - std::fs::create_dir(index_dir)?; - let mol_iter = MolBlockIter::from_gz_file(sdf_path, true, true, false) .map_err(|e| eyre::eyre!("could not read gz file: {:?}", e))?; @@ -87,7 +87,14 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { let schema = crate::schema::LIBRARY .get("descriptor_v1") 
.ok_or(eyre::eyre!("Failed to extract schema"))?; - let index = create_or_reset_index(index_dir, schema)?; + + let index = if reset_index { + create_or_reset_index(index_dir, schema)? + } else { + let mmap_directory = MmapDirectory::open(index_dir)?; + tantivy::Index::open(mmap_directory)? + }; + let mut index_writer = index.writer_with_num_threads(1, 50 * 1024 * 1024)?; let mut counter = 0; From c8bad48a2afe5abf2cca141cb5c5045f68740748 Mon Sep 17 00:00:00 2001 From: Javier Pineda Date: Fri, 10 Jan 2025 15:58:16 -0600 Subject: [PATCH 2/5] update index writer memory budget to match tantivy examples --- src/command_line/indexing/bulk_delete.rs | 2 +- src/command_line/indexing/bulk_index.rs | 2 +- src/command_line/indexing/index_sdf.rs | 2 +- src/rest_api/api/indexing/bulk_delete.rs | 2 +- src/rest_api/api/indexing/bulk_index.rs | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/command_line/indexing/bulk_delete.rs b/src/command_line/indexing/bulk_delete.rs index 1564807..3a948ad 100644 --- a/src/command_line/indexing/bulk_delete.rs +++ b/src/command_line/indexing/bulk_delete.rs @@ -38,7 +38,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { let (storage_dir, index_name) = split_path(index_path)?; let index_manager = IndexManager::new(storage_dir.deref(), false)?; let index = index_manager.open(index_name.deref())?; - let mut deleter = index.writer::(16 * 1024 * 1024)?; + let mut deleter = index.writer::(50 * 1024 * 1024)?; let query_parser = QueryParser::for_index(&index, vec![]); for smiles in smiles_list { diff --git a/src/command_line/indexing/bulk_index.rs b/src/command_line/indexing/bulk_index.rs index 2e204ce..deba11f 100644 --- a/src/command_line/indexing/bulk_index.rs +++ b/src/command_line/indexing/bulk_index.rs @@ -36,7 +36,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { let index_manager = IndexManager::new(storage_dir.deref(), false)?; let index = index_manager.open(index_name.deref())?; - let mut writer = 
index.writer(16 * 1024 * 1024)?; + let mut writer = index.writer(50 * 1024 * 1024)?; let schema = index.schema(); let file = File::open(json_path)?; diff --git a/src/command_line/indexing/index_sdf.rs b/src/command_line/indexing/index_sdf.rs index fb3600a..2b2a27d 100644 --- a/src/command_line/indexing/index_sdf.rs +++ b/src/command_line/indexing/index_sdf.rs @@ -95,7 +95,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { tantivy::Index::open(mmap_directory)? }; - let mut index_writer = index.writer_with_num_threads(1, 50 * 1024 * 1024)?; + let mut index_writer = index.writer(50 * 1024 * 1024)?; let mut counter = 0; let failed_counter: Arc> = Arc::new(Mutex::new(0)); diff --git a/src/rest_api/api/indexing/bulk_delete.rs b/src/rest_api/api/indexing/bulk_delete.rs index cdfdf8f..2bd3bef 100644 --- a/src/rest_api/api/indexing/bulk_delete.rs +++ b/src/rest_api/api/indexing/bulk_delete.rs @@ -22,7 +22,7 @@ pub async fn v1_delete_index_bulk( } }; - let mut deleter = match index.writer::(16 * 1024 * 1024) { + let mut deleter = match index.writer::(50 * 1024 * 1024) { Ok(deleter) => deleter, Err(e) => { return DeleteIndexesBulkDeleteResponse::Err(Json(DeleteIndexBulkResponseError { diff --git a/src/rest_api/api/indexing/bulk_index.rs b/src/rest_api/api/indexing/bulk_index.rs index 460f2f4..be281f8 100644 --- a/src/rest_api/api/indexing/bulk_index.rs +++ b/src/rest_api/api/indexing/bulk_index.rs @@ -20,7 +20,7 @@ pub async fn v1_post_index_bulk( } }; - let mut writer = match index.writer(16 * 1024 * 1024) { + let mut writer = match index.writer(50 * 1024 * 1024) { Ok(writer) => writer, Err(e) => { return PostIndexesBulkIndexResponse::Err(Json(PostIndexBulkResponseError { From 4d599648ac54fd380401988001894075e340f2d2 Mon Sep 17 00:00:00 2001 From: Javier Pineda Date: Fri, 10 Jan 2025 16:03:30 -0600 Subject: [PATCH 3/5] fix bulk indexing api unit test --- tests/api_tests.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/api_tests.rs 
b/tests/api_tests.rs index ed7dbe1..e65c219 100644 --- a/tests/api_tests.rs +++ b/tests/api_tests.rs @@ -250,7 +250,7 @@ async fn test_bulk_indexing() -> eyre::Result<()> { let results = searcher.search(&query, &TopDocs::with_limit(100))?; assert_eq!(results.len(), 3); - let docs = results + let mut docs = results .into_iter() .map(|(_, doc_id)| searcher.doc::(doc_id).unwrap()) .map(|td| { @@ -261,7 +261,9 @@ async fn test_bulk_indexing() -> eyre::Result<()> { .to_owned() }) .collect::>(); - assert_eq!(&docs, &["CC", "c1ccccc1", "c1ccc(CCc2ccccc2)cc1",]); + + docs.sort(); + assert_eq!(&docs, &["CC", "c1ccc(CCc2ccccc2)cc1", "c1ccccc1"]); Ok(()) } From eb115d345c52c5013069eab6e06f88bf20e88077 Mon Sep 17 00:00:00 2001 From: Javier Pineda Date: Fri, 10 Jan 2025 16:15:47 -0600 Subject: [PATCH 4/5] make sure to also update counter whenever an rwmol fails --- src/command_line/indexing/index_sdf.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/command_line/indexing/index_sdf.rs b/src/command_line/indexing/index_sdf.rs index 2b2a27d..18b00ac 100644 --- a/src/command_line/indexing/index_sdf.rs +++ b/src/command_line/indexing/index_sdf.rs @@ -104,6 +104,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { for mol in mol_iter { if mol.is_err() { + counter += 1; let mut num = failed_counter.lock().unwrap(); *num += 1; continue; From ff55547d1f5bcb4390c1801ff2750c4339ac651a Mon Sep 17 00:00:00 2001 From: Javier Pineda Date: Fri, 10 Jan 2025 16:25:36 -0600 Subject: [PATCH 5/5] directory must already exist for create_or_reset_index() to work --- src/command_line/indexing/index_sdf.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/command_line/indexing/index_sdf.rs b/src/command_line/indexing/index_sdf.rs index 18b00ac..43be4c0 100644 --- a/src/command_line/indexing/index_sdf.rs +++ b/src/command_line/indexing/index_sdf.rs @@ -89,6 +89,14 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { .ok_or(eyre::eyre!("Failed to extract schema"))?; 
let index = if reset_index { + let index_dir_metadata = std::fs::metadata(index_dir); + if let Ok(metadata) = index_dir_metadata { + if metadata.is_dir() { + std::fs::remove_dir_all(index_dir)?; + } + } + + std::fs::create_dir(index_dir)?; create_or_reset_index(index_dir, schema)? } else { let mmap_directory = MmapDirectory::open(index_dir)?;