From 5b94cba032cf210817cf2452897c15c8dc233adc Mon Sep 17 00:00:00 2001 From: Javier Pineda Date: Wed, 16 Oct 2024 12:04:08 -0500 Subject: [PATCH] while we're changing up the schema... move scaffolds to separate json field for cheminee use --- src/command_line/indexing/bulk_index.rs | 10 ++++-- src/command_line/indexing/index_sdf.rs | 11 +++--- src/indexing/mod.rs | 47 +++++++++++++------------ src/rest_api/api/indexing/bulk_index.rs | 12 ++++--- src/schema/mod.rs | 8 +++-- src/search/identity_search.rs | 4 +-- src/search/structure_search.rs | 8 ++--- tests/api_tests.rs | 19 +++++----- tests/cli_tests.rs | 12 ++++--- tests/cpd_processing_tests.rs | 4 --- tests/search_tests.rs | 24 +++++++++---- 11 files changed, 90 insertions(+), 69 deletions(-) diff --git a/src/command_line/indexing/bulk_index.rs b/src/command_line/indexing/bulk_index.rs index d541a0c..ab9dce8 100644 --- a/src/command_line/indexing/bulk_index.rs +++ b/src/command_line/indexing/bulk_index.rs @@ -46,6 +46,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { let pattern_fingerprint_field = schema.get_field("pattern_fingerprint")?; let morgan_fingerprint_field = schema.get_field("morgan_fingerprint")?; let extra_data_field = schema.get_field("extra_data")?; + let other_descriptors_field = schema.get_field("other_descriptors")?; let descriptor_fields = KNOWN_DESCRIPTORS .iter() .map(|kd| (*kd, schema.get_field(kd).unwrap())) @@ -73,6 +74,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { morgan_fingerprint_field, &descriptor_fields, extra_data_field, + other_descriptors_field, ); match doc { @@ -109,6 +111,7 @@ fn create_tantivy_doc( morgan_fingerprint_field: Field, descriptor_fields: &HashMap<&str, Field>, extra_data_field: Field, + other_descriptors_field: Field, ) -> eyre::Result { let smiles = record .get("smiles") @@ -133,9 +136,10 @@ fn create_tantivy_doc( false => serde_json::json!({"scaffolds": scaffold_matches}), }; - let extra_data_json = combine_json_objects(Some(scaffold_json), extra_data); - if let Some(extra_data_json) = extra_data_json { - doc.add_field_value(extra_data_field, extra_data_json); + doc.add_field_value(other_descriptors_field, scaffold_json); + + if let Some(extra_data) = extra_data { + doc.add_field_value(extra_data_field, extra_data); } for field in KNOWN_DESCRIPTORS { diff --git a/src/command_line/indexing/index_sdf.rs b/src/command_line/indexing/index_sdf.rs index e708b5f..e77ff40 100644 --- a/src/command_line/indexing/index_sdf.rs +++ b/src/command_line/indexing/index_sdf.rs @@ -93,7 +93,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { let smiles_field = schema.get_field("smiles")?; let pattern_fingerprint_field = schema.get_field("pattern_fingerprint")?; let morgan_fingerprint_field = schema.get_field("morgan_fingerprint")?; - let extra_data_field = schema.get_field("extra_data")?; + let other_descriptors_field = schema.get_field("other_descriptors")?; let descriptor_fields = KNOWN_DESCRIPTORS .iter() .map(|kd| (*kd, schema.get_field(kd).unwrap())) @@ -124,7 +124,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { pattern_fingerprint_field, morgan_fingerprint_field, &descriptor_fields, - extra_data_field, + other_descriptors_field, ); match doc { @@ -173,7 +173,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> { pattern_fingerprint_field, morgan_fingerprint_field, &descriptor_fields, - extra_data_field, + other_descriptors_field, ); match doc { @@ -218,7 +218,7 @@ fn create_tantivy_doc( pattern_fingerprint_field: Field, morgan_fingerprint_field: Field, descriptor_fields: &HashMap<&str, Field>, - extra_data_field: Field, + other_descriptors_field: Field, ) -> eyre::Result { // By default, do not attempt to fix problematic molecules let (canon_taut, fp, descriptors) = process_cpd(mol.as_smiles().as_str(), false)?; @@ -244,13 +244,12 @@ fn create_tantivy_doc( } let scaffold_matches = scaffold_search(&fp.0, &canon_taut, &PARSED_SCAFFOLDS)?; - let scaffold_json = match scaffold_matches.is_empty() { true => serde_json::json!({"scaffolds": vec![-1]}), false => serde_json::json!({"scaffolds": scaffold_matches}), }; - doc.add_field_value(extra_data_field, scaffold_json); + doc.add_field_value(other_descriptors_field, scaffold_json); Ok(doc) } diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs index 448fb48..3c1f82c 100644 --- a/src/indexing/mod.rs +++ b/src/indexing/mod.rs @@ -77,26 +77,27 @@ pub fn open_index(p: impl AsRef) -> eyre::Result { Ok(index) } -pub fn combine_json_objects( - obj1: Option, - obj2: Option, -) -> Option { - match (obj1, obj2) { - (Some(obj1), Some(obj2)) => { - if let (serde_json::Value::Object(mut obj1_map), serde_json::Value::Object(obj2_map)) = - (obj1.clone(), obj2.clone()) - { - for (key, value) in obj2_map { - obj1_map.insert(key, value); - } - return Some(serde_json::Value::Object(obj1_map)); - } else if let serde_json::Value::Object(obj1_map) = obj1 { - return Some(serde_json::Value::Object(obj1_map)); - } - Some(obj2) - } - (Some(obj1), None) => Some(obj1), - (None, Some(obj2)) => Some(obj2), - (None, None) => None, - } -} +// // Don't need right now, but will be useful if we add more to the new "other_descriptors" field +// pub fn combine_json_objects( +// obj1: Option, +// obj2: Option, +// ) -> Option { +// match (obj1, obj2) { +// (Some(obj1), Some(obj2)) => { +// if let (serde_json::Value::Object(mut obj1_map), serde_json::Value::Object(obj2_map)) = +// (obj1.clone(), obj2.clone()) +// { +// for (key, value) in obj2_map { +// obj1_map.insert(key, value); +// } +// return Some(serde_json::Value::Object(obj1_map)); +// } else if let serde_json::Value::Object(obj1_map) = obj1 { +// return Some(serde_json::Value::Object(obj1_map)); +// } +// Some(obj2) +// } +// (Some(obj1), None) => Some(obj1), +// (None, Some(obj2)) => Some(obj2), +// (None, None) => None, +// } +// } diff --git a/src/rest_api/api/indexing/bulk_index.rs b/src/rest_api/api/indexing/bulk_index.rs index 0e3303d..ae356b5 100644 --- a/src/rest_api/api/indexing/bulk_index.rs +++ b/src/rest_api/api/indexing/bulk_index.rs @@ -1,4 +1,4 @@ -use crate::indexing::{combine_json_objects, index_manager::IndexManager, KNOWN_DESCRIPTORS}; +use crate::indexing::{index_manager::IndexManager, KNOWN_DESCRIPTORS}; use crate::rest_api::api::{ BulkRequest, BulkRequestDoc, PostIndexBulkResponseError, PostIndexBulkResponseOk, PostIndexBulkResponseOkStatus, PostIndexesBulkIndexResponse, @@ -41,6 +41,7 @@ pub async fn v1_post_index_bulk( let pattern_fingerprint_field = schema.get_field("pattern_fingerprint").unwrap(); let morgan_fingerprint_field = schema.get_field("morgan_fingerprint").unwrap(); let extra_data_field = schema.get_field("extra_data").unwrap(); + let other_descriptors_field = schema.get_field("other_descriptors").unwrap(); let descriptors_fields = KNOWN_DESCRIPTORS .iter() @@ -59,6 +60,7 @@ pub async fn v1_post_index_bulk( morgan_fingerprint_field, &descriptors_fields, extra_data_field, + other_descriptors_field, ) }) .collect::>() @@ -124,6 +126,7 @@ fn bulk_request_doc_to_tantivy_doc( morgan_fingerprint_field: Field, descriptors_fields: &HashMap<&str, Field>, extra_data_field: Field, + other_descriptors_field: Field, ) -> eyre::Result { // By default, do not attempt to fix problematic molecules let (canon_taut, pattern_fingerprint, descriptors) = @@ -150,9 +153,10 @@ fn bulk_request_doc_to_tantivy_doc( false => serde_json::json!({"scaffolds": scaffold_matches}), }; - let extra_data_json = combine_json_objects(Some(scaffold_json), bulk_request_doc.extra_data); - if let Some(extra_data_json) = extra_data_json { - doc.add_field_value(extra_data_field, extra_data_json); + doc.add_field_value(other_descriptors_field, scaffold_json); + + if let Some(extra_data) = bulk_request_doc.extra_data { + doc.add_field_value(extra_data_field, extra_data); } for field in KNOWN_DESCRIPTORS { diff --git a/src/schema/mod.rs b/src/schema/mod.rs index fe907bf..9d2ecee 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -25,9 +25,13 @@ fn descriptor_v1_schema() -> Schema { builder.add_bytes_field("pattern_fingerprint", STORED); builder.add_bytes_field("morgan_fingerprint", STORED); - let json_options: JsonObjectOptions = + let extra_data_options: JsonObjectOptions = JsonObjectOptions::from(TEXT | STORED).set_expand_dots_enabled(); - builder.add_json_field("extra_data", json_options); + builder.add_json_field("extra_data", extra_data_options); + + let other_descriptors_options: JsonObjectOptions = + JsonObjectOptions::from(TEXT).set_expand_dots_enabled(); + builder.add_json_field("other_descriptors", other_descriptors_options.clone()); builder.build() } diff --git a/src/search/identity_search.rs b/src/search/identity_search.rs index f5ab15d..3139474 100644 --- a/src/search/identity_search.rs +++ b/src/search/identity_search.rs @@ -128,10 +128,10 @@ pub fn build_identity_query( if let Some(scaffolds) = matching_scaffolds { if scaffolds.is_empty() { - query_parts.push("extra_data.scaffolds:-1".to_string()); + query_parts.push("other_descriptors.scaffolds:-1".to_string()); } else { for s in scaffolds { - query_parts.push(format!("extra_data.scaffolds:{s}")) + query_parts.push(format!("other_descriptors.scaffolds:{s}")) } } } diff --git a/src/search/structure_search.rs b/src/search/structure_search.rs index 372ff9f..2005030 100644 --- a/src/search/structure_search.rs +++ b/src/search/structure_search.rs @@ -185,7 +185,7 @@ pub fn build_substructure_query( if let Some(scaffolds) = matching_scaffolds { if !scaffolds.is_empty() { for s in scaffolds { - query_parts.push(format!("extra_data.scaffolds:{s}")) + query_parts.push(format!("other_descriptors.scaffolds:{s}")) } } } @@ -219,14 +219,14 @@ pub fn build_superstructure_query( if let Some(scaffolds) = matching_scaffolds { if scaffolds.is_empty() { - query = format!("{query} AND extra_data.scaffolds:-1"); + query = format!("{query} AND other_descriptors.scaffolds:-1"); } else { let mut scaffold_parts = scaffolds .iter() - .map(|s| format!("extra_data.scaffolds:{s}")) + .map(|s| format!("other_descriptors.scaffolds:{s}")) .collect::>(); - scaffold_parts.push("extra_data.scaffolds:-1".to_string()); + scaffold_parts.push("other_descriptors.scaffolds:-1".to_string()); let scaffolds_query = scaffold_parts.join(" OR "); diff --git a/tests/api_tests.rs b/tests/api_tests.rs index 2e65685..35781cb 100644 --- a/tests/api_tests.rs +++ b/tests/api_tests.rs @@ -2,7 +2,7 @@ use cheminee::indexing::index_manager::IndexManager; use cheminee::rest_api::openapi_server::{api_service, API_PREFIX}; use std::collections::HashMap; -use cheminee::indexing::{combine_json_objects, KNOWN_DESCRIPTORS}; +use cheminee::indexing::KNOWN_DESCRIPTORS; use cheminee::search::compound_processing::process_cpd; use cheminee::search::scaffold_search::{scaffold_search, PARSED_SCAFFOLDS}; use poem::test::TestResponse; @@ -102,6 +102,7 @@ fn fill_test_index(tantivy_index: Index) -> eyre::Result<()> { let smiles_field = schema.get_field("smiles")?; let extra_data_field = schema.get_field("extra_data")?; + let other_descriptors_field = schema.get_field("other_descriptors")?; let pattern_fingerprint_field = schema.get_field("pattern_fingerprint")?; let morgan_fingerprint_field = schema.get_field("morgan_fingerprint")?; let descriptor_fields = KNOWN_DESCRIPTORS @@ -135,10 +136,8 @@ fn fill_test_index(tantivy_index: Index) -> eyre::Result<()> { false => serde_json::json!({"scaffolds": scaffold_matches}), }; - let extra_data_json = combine_json_objects(Some(scaffold_json), Some(extra_data)); - if let Some(extra_data_json) = extra_data_json { - doc.add_field_value(extra_data_field, extra_data_json); - } + doc.add_field_value(other_descriptors_field, scaffold_json); + doc.add_field_value(extra_data_field, extra_data); for field in KNOWN_DESCRIPTORS { if let Some(val) = descriptors.get(field) { @@ -279,7 +278,7 @@ async fn test_basic_search() -> eyre::Result<()> { response.assert_status_is_ok(); response .assert_json(&serde_json::json!([{ - "extra_data": {"extra": "data", "scaffolds": [0, 126]}, + "extra_data": {"extra": "data"}, "query": "NumAtoms:[13 TO 100]", "smiles": "c1ccc(CCc2ccccc2)cc1" }])) @@ -310,7 +309,7 @@ async fn test_identity_search() -> eyre::Result<()> { response.assert_status_is_ok(); response .assert_json(&serde_json::json!([{ - "extra_data": {"extra": "data", "scaffolds": [0, 126]}, + "extra_data": {"extra": "data"}, "query": "C1=CC=CC=C1CCC2=CC=CC=C2", "score": 1.0, "smiles": "c1ccc(CCc2ccccc2)cc1", @@ -343,7 +342,7 @@ async fn test_substructure_search() -> eyre::Result<()> { response.assert_status_is_ok(); response .assert_json(&serde_json::json!([{ - "extra_data": {"extra": "data", "scaffolds": [0, 126]}, + "extra_data": {"extra": "data"}, "query": "C1=CC=CC=C1", "score": 1.0, "smiles": "c1ccc(CCc2ccccc2)cc1", @@ -378,14 +377,14 @@ async fn test_superstructure_search() -> eyre::Result<()> { response .assert_json(&serde_json::json!([ { - "extra_data": {"extra": "data", "scaffolds": [-1]}, + "extra_data": {"extra": "data"}, "query": "C1=CC=CC=C1CCC2=CC=CC=C2", "score": 1.0, "smiles": "CC", "used_tautomers": false }, { - "extra_data": {"extra": "data", "scaffolds": [0]}, + "extra_data": {"extra": "data"}, "query": "C1=CC=CC=C1CCC2=CC=CC=C2", "score": 1.0, "smiles": "c1ccccc1", diff --git a/tests/cli_tests.rs b/tests/cli_tests.rs index 620096d..d6baa8e 100644 --- a/tests/cli_tests.rs +++ b/tests/cli_tests.rs @@ -55,13 +55,17 @@ fn test_create_delete_query() { } } - let json_options: JsonObjectOptions = + let extra_data_options: JsonObjectOptions = JsonObjectOptions::from(TEXT | STORED).set_expand_dots_enabled(); - - let extra_data_field = builder.add_json_field("extra_data", json_options); - + let extra_data_field = builder.add_json_field("extra_data", extra_data_options); doc.add_field_value(extra_data_field, json!({"extra_data": ""})); + let other_descriptors_options: JsonObjectOptions = + JsonObjectOptions::from(TEXT).set_expand_dots_enabled(); + let other_descriptors_field = + builder.add_json_field("other_descriptors", other_descriptors_options); + doc.add_field_value(other_descriptors_field, json!({"other_descriptors": ""})); + let schema = builder.build(); let builder = IndexBuilder::new().schema(schema); let index = builder.create_in_ram().unwrap(); diff --git a/tests/cpd_processing_tests.rs b/tests/cpd_processing_tests.rs index 6b8af9f..53be9d3 100644 --- a/tests/cpd_processing_tests.rs +++ b/tests/cpd_processing_tests.rs @@ -48,10 +48,6 @@ fn test_add_formal_charge() { #[test] fn test_fix_chemistry_problems() { - let smiles1 = "F[Si-2](F)(F)(F)(F)F.CC"; - let romol1 = fix_chemistry_problems(smiles1).unwrap(); - assert_eq!(romol1.as_smiles(), "CC"); - let smiles2 = "C[N](C)(C)C"; let romol2 = fix_chemistry_problems(smiles2).unwrap(); assert_eq!(romol2.as_smiles(), "C[N+](C)(C)C"); diff --git a/tests/search_tests.rs b/tests/search_tests.rs index 8473e90..52fddd6 100644 --- a/tests/search_tests.rs +++ b/tests/search_tests.rs @@ -35,7 +35,7 @@ fn test_build_superstructure_query() { let query = build_superstructure_query(&descriptors, "", &Some(vec![0, 1])); assert_eq!( query, - "NumAtoms:[0 TO 10] AND (extra_data.scaffolds:0 OR extra_data.scaffolds:1 OR extra_data.scaffolds:-1)" + "NumAtoms:[0 TO 10] AND (other_descriptors.scaffolds:0 OR other_descriptors.scaffolds:1 OR other_descriptors.scaffolds:-1)" ); } @@ -166,14 +166,19 @@ fn test_substructure_search() { let smiles_field = builder.add_text_field("smiles", STRING | STORED); let pattern_fingerprint_field = builder.add_bytes_field("pattern_fingerprint", FAST | STORED); - let json_options: JsonObjectOptions = + let extra_data_options: JsonObjectOptions = JsonObjectOptions::from(TEXT | STORED).set_expand_dots_enabled(); - let extra_data_field = builder.add_json_field("extra_data", json_options); + let _extra_data_field = builder.add_json_field("extra_data", extra_data_options); + + let other_descriptors_options: JsonObjectOptions = + JsonObjectOptions::from(TEXT).set_expand_dots_enabled(); + let other_descriptors_field = + builder.add_json_field("other_descriptors", other_descriptors_options); let mut doc = doc!( smiles_field => index_mol.as_smiles(), pattern_fingerprint_field => index_pattern_fingerprint.0.as_raw_slice(), - extra_data_field => json![{ "scaffolds": index_scaffolds }], + other_descriptors_field => json![{ "scaffolds": index_scaffolds }], ); for (descriptor, val) in &index_descriptors { @@ -232,14 +237,19 @@ fn test_superstructure_search() { let smiles_field = builder.add_text_field("smiles", STRING | STORED); let pattern_fingerprint_field = builder.add_bytes_field("pattern_fingerprint", FAST | STORED); - let json_options: JsonObjectOptions = + let extra_data_options: JsonObjectOptions = JsonObjectOptions::from(TEXT | STORED).set_expand_dots_enabled(); - let extra_data_field = builder.add_json_field("extra_data", json_options); + let _extra_data_field = builder.add_json_field("extra_data", extra_data_options); + + let other_descriptors_options: JsonObjectOptions = + JsonObjectOptions::from(TEXT).set_expand_dots_enabled(); + let other_descriptors_field = + builder.add_json_field("other_descriptors", other_descriptors_options); let mut doc = doc!( smiles_field => index_mol.as_smiles(), pattern_fingerprint_field => index_pattern_fingerprint.0.as_raw_slice(), - extra_data_field => json![{ "scaffolds": index_scaffolds }], + other_descriptors_field => json![{ "scaffolds": index_scaffolds }], ); for (descriptor, val) in &index_descriptors {