diff --git a/src/command_line/indexing/bulk_delete.rs b/src/command_line/indexing/bulk_delete.rs index 1564807..3a948ad 100644 --- a/src/command_line/indexing/bulk_delete.rs +++ b/src/command_line/indexing/bulk_delete.rs @@ -38,7 +38,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { let (storage_dir, index_name) = split_path(index_path)?; let index_manager = IndexManager::new(storage_dir.deref(), false)?; let index = index_manager.open(index_name.deref())?; - let mut deleter = index.writer::<TantivyDocument>(16 * 1024 * 1024)?; + let mut deleter = index.writer::<TantivyDocument>(50 * 1024 * 1024)?; let query_parser = QueryParser::for_index(&index, vec![]); for smiles in smiles_list { diff --git a/src/command_line/indexing/bulk_index.rs b/src/command_line/indexing/bulk_index.rs index 2e204ce..deba11f 100644 --- a/src/command_line/indexing/bulk_index.rs +++ b/src/command_line/indexing/bulk_index.rs @@ -36,7 +36,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { let index_manager = IndexManager::new(storage_dir.deref(), false)?; let index = index_manager.open(index_name.deref())?; - let mut writer = index.writer(16 * 1024 * 1024)?; + let mut writer = index.writer(50 * 1024 * 1024)?; let schema = index.schema(); let file = File::open(json_path)?; diff --git a/src/command_line/indexing/index_sdf.rs b/src/command_line/indexing/index_sdf.rs index 123adcf..43be4c0 100644 --- a/src/command_line/indexing/index_sdf.rs +++ b/src/command_line/indexing/index_sdf.rs @@ -2,6 +2,7 @@ use crate::command_line::prelude::*; use rayon::prelude::*; use rdkit::{MolBlockIter, RWMol}; use std::sync::{Arc, Mutex}; +use tantivy::directory::MmapDirectory; pub const NAME: &str = "index-sdf"; @@ -35,6 +36,12 @@ pub fn command() -> Command { .short('c') .num_args(1), ) + .arg( + Arg::new("create-or-reset-index") + .required(false) + .long("create-or-reset-index") + .num_args(0), + ) .arg( Arg::new("commit") .required(false) @@ -52,6 +59,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { 
.ok_or(eyre::eyre!("Failed to extract index path"))?; let limit = matches.get_one::<usize>("limit"); let chunksize = matches.get_one::<usize>("chunk-size"); + let reset_index: bool = matches.get_flag("create-or-reset-index"); let commit: bool = matches.get_flag("commit"); let chunksize = if let Some(chunksize) = chunksize { @@ -67,14 +75,6 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { limit ); - let index_dir_metadata = std::fs::metadata(index_dir); - if let Ok(metadata) = index_dir_metadata { - if metadata.is_dir() { - std::fs::remove_dir_all(index_dir)?; - } - } - std::fs::create_dir(index_dir)?; - let mol_iter = MolBlockIter::from_gz_file(sdf_path, true, true, false) .map_err(|e| eyre::eyre!("could not read gz file: {:?}", e))?; @@ -87,8 +87,23 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { let schema = crate::schema::LIBRARY .get("descriptor_v1") .ok_or(eyre::eyre!("Failed to extract schema"))?; - let index = create_or_reset_index(index_dir, schema)?; - let mut index_writer = index.writer_with_num_threads(1, 50 * 1024 * 1024)?; + + let index = if reset_index { + let index_dir_metadata = std::fs::metadata(index_dir); + if let Ok(metadata) = index_dir_metadata { + if metadata.is_dir() { + std::fs::remove_dir_all(index_dir)?; + } + } + + std::fs::create_dir(index_dir)?; + create_or_reset_index(index_dir, schema)? + } else { + let mmap_directory = MmapDirectory::open(index_dir)?; + tantivy::Index::open(mmap_directory)? 
+ }; + + let mut index_writer = index.writer(50 * 1024 * 1024)?; let mut counter = 0; let failed_counter: Arc<Mutex<usize>> = Arc::new(Mutex::new(0)); @@ -97,6 +112,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { for mol in mol_iter { if mol.is_err() { + counter += 1; let mut num = failed_counter.lock().unwrap(); *num += 1; continue; diff --git a/src/rest_api/api/indexing/bulk_delete.rs b/src/rest_api/api/indexing/bulk_delete.rs index cdfdf8f..2bd3bef 100644 --- a/src/rest_api/api/indexing/bulk_delete.rs +++ b/src/rest_api/api/indexing/bulk_delete.rs @@ -22,7 +22,7 @@ pub async fn v1_delete_index_bulk( } }; - let mut deleter = match index.writer::<TantivyDocument>(16 * 1024 * 1024) { + let mut deleter = match index.writer::<TantivyDocument>(50 * 1024 * 1024) { Ok(deleter) => deleter, Err(e) => { return DeleteIndexesBulkDeleteResponse::Err(Json(DeleteIndexBulkResponseError { diff --git a/src/rest_api/api/indexing/bulk_index.rs b/src/rest_api/api/indexing/bulk_index.rs index 460f2f4..be281f8 100644 --- a/src/rest_api/api/indexing/bulk_index.rs +++ b/src/rest_api/api/indexing/bulk_index.rs @@ -20,7 +20,7 @@ pub async fn v1_post_index_bulk( } }; - let mut writer = match index.writer(16 * 1024 * 1024) { + let mut writer = match index.writer(50 * 1024 * 1024) { Ok(writer) => writer, Err(e) => { return PostIndexesBulkIndexResponse::Err(Json(PostIndexBulkResponseError { diff --git a/tests/api_tests.rs b/tests/api_tests.rs index ed7dbe1..e65c219 100644 --- a/tests/api_tests.rs +++ b/tests/api_tests.rs @@ -250,7 +250,7 @@ async fn test_bulk_indexing() -> eyre::Result<()> { let results = searcher.search(&query, &TopDocs::with_limit(100))?; assert_eq!(results.len(), 3); - let docs = results + let mut docs = results .into_iter() .map(|(_, doc_id)| searcher.doc::<TantivyDocument>(doc_id).unwrap()) .map(|td| { @@ -261,7 +261,9 @@ async fn test_bulk_indexing() -> eyre::Result<()> { .to_owned() }) .collect::<Vec<_>>(); - assert_eq!(&docs, &["CC", "c1ccccc1", "c1ccc(CCc2ccccc2)cc1",]); + + docs.sort(); + assert_eq!(&docs, &["CC", 
"c1ccc(CCc2ccccc2)cc1", "c1ccccc1"]); Ok(()) }