while we're changing up the schema... move scaffolds to separate json field for cheminee use
JJ-Pineda committed Oct 16, 2024
1 parent e20227c commit 5b94cba
Showing 11 changed files with 90 additions and 69 deletions.
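Net effect of the diff below: scaffold match IDs, previously merged into the stored extra_data JSON via combine_json_objects, now go into a new other_descriptors JSON field that is indexed but not stored, so callers get their extra_data back untouched while scaffold filtering moves to other_descriptors.scaffolds queries. A minimal sketch of the new write path, assuming the tantivy and serde_json APIs this repo already uses (the helper name and the i64 scaffold type are illustrative, not part of the commit):

use tantivy::schema::Schema;
use tantivy::TantivyDocument;

// Hypothetical helper condensing the change shown in the hunks below.
fn add_scaffolds_and_extra_data(
    schema: &Schema,
    doc: &mut TantivyDocument,
    scaffold_matches: &[i64],
    extra_data: Option<serde_json::Value>,
) -> eyre::Result<()> {
    let other_descriptors_field = schema.get_field("other_descriptors")?;
    let extra_data_field = schema.get_field("extra_data")?;

    // Scaffold hits always land in "other_descriptors"; [-1] is the sentinel for "no scaffold matched".
    let scaffold_json = match scaffold_matches.is_empty() {
        true => serde_json::json!({"scaffolds": [-1]}),
        false => serde_json::json!({"scaffolds": scaffold_matches}),
    };
    doc.add_field_value(other_descriptors_field, scaffold_json);

    // Caller-supplied extra_data is stored as-is; the old combine_json_objects merge is gone.
    if let Some(extra_data) = extra_data {
        doc.add_field_value(extra_data_field, extra_data);
    }

    Ok(())
}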
10 changes: 7 additions & 3 deletions src/command_line/indexing/bulk_index.rs
@@ -46,6 +46,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> {
let pattern_fingerprint_field = schema.get_field("pattern_fingerprint")?;
let morgan_fingerprint_field = schema.get_field("morgan_fingerprint")?;
let extra_data_field = schema.get_field("extra_data")?;
+let other_descriptors_field = schema.get_field("other_descriptors")?;
let descriptor_fields = KNOWN_DESCRIPTORS
.iter()
.map(|kd| (*kd, schema.get_field(kd).unwrap()))
@@ -73,6 +74,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> {
morgan_fingerprint_field,
&descriptor_fields,
extra_data_field,
+other_descriptors_field,
);

match doc {
@@ -109,6 +111,7 @@ fn create_tantivy_doc(
morgan_fingerprint_field: Field,
descriptor_fields: &HashMap<&str, Field>,
extra_data_field: Field,
+other_descriptors_field: Field,
) -> eyre::Result<impl tantivy::Document> {
let smiles = record
.get("smiles")
@@ -133,9 +136,10 @@
false => serde_json::json!({"scaffolds": scaffold_matches}),
};

-let extra_data_json = combine_json_objects(Some(scaffold_json), extra_data);
-if let Some(extra_data_json) = extra_data_json {
-doc.add_field_value(extra_data_field, extra_data_json);
+doc.add_field_value(other_descriptors_field, scaffold_json);
+
+if let Some(extra_data) = extra_data {
+doc.add_field_value(extra_data_field, extra_data);
}

for field in KNOWN_DESCRIPTORS {
11 changes: 5 additions & 6 deletions src/command_line/indexing/index_sdf.rs
@@ -93,7 +93,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> {
let smiles_field = schema.get_field("smiles")?;
let pattern_fingerprint_field = schema.get_field("pattern_fingerprint")?;
let morgan_fingerprint_field = schema.get_field("morgan_fingerprint")?;
-let extra_data_field = schema.get_field("extra_data")?;
+let other_descriptors_field = schema.get_field("other_descriptors")?;
let descriptor_fields = KNOWN_DESCRIPTORS
.iter()
.map(|kd| (*kd, schema.get_field(kd).unwrap()))
@@ -124,7 +124,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> {
pattern_fingerprint_field,
morgan_fingerprint_field,
&descriptor_fields,
-extra_data_field,
+other_descriptors_field,
);

match doc {
@@ -173,7 +173,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> {
pattern_fingerprint_field,
morgan_fingerprint_field,
&descriptor_fields,
-extra_data_field,
+other_descriptors_field,
);

match doc {
@@ -218,7 +218,7 @@ fn create_tantivy_doc(
pattern_fingerprint_field: Field,
morgan_fingerprint_field: Field,
descriptor_fields: &HashMap<&str, Field>,
-extra_data_field: Field,
+other_descriptors_field: Field,
) -> eyre::Result<impl tantivy::Document> {
// By default, do not attempt to fix problematic molecules
let (canon_taut, fp, descriptors) = process_cpd(mol.as_smiles().as_str(), false)?;
@@ -244,13 +244,12 @@
}

let scaffold_matches = scaffold_search(&fp.0, &canon_taut, &PARSED_SCAFFOLDS)?;

let scaffold_json = match scaffold_matches.is_empty() {
true => serde_json::json!({"scaffolds": vec![-1]}),
false => serde_json::json!({"scaffolds": scaffold_matches}),
};

-doc.add_field_value(extra_data_field, scaffold_json);
+doc.add_field_value(other_descriptors_field, scaffold_json);

Ok(doc)
}
47 changes: 24 additions & 23 deletions src/indexing/mod.rs
@@ -77,26 +77,27 @@ pub fn open_index(p: impl AsRef<Path>) -> eyre::Result<Index> {
Ok(index)
}

-pub fn combine_json_objects(
-obj1: Option<serde_json::Value>,
-obj2: Option<serde_json::Value>,
-) -> Option<serde_json::Value> {
-match (obj1, obj2) {
-(Some(obj1), Some(obj2)) => {
-if let (serde_json::Value::Object(mut obj1_map), serde_json::Value::Object(obj2_map)) =
-(obj1.clone(), obj2.clone())
-{
-for (key, value) in obj2_map {
-obj1_map.insert(key, value);
-}
-return Some(serde_json::Value::Object(obj1_map));
-} else if let serde_json::Value::Object(obj1_map) = obj1 {
-return Some(serde_json::Value::Object(obj1_map));
-}
-Some(obj2)
-}
-(Some(obj1), None) => Some(obj1),
-(None, Some(obj2)) => Some(obj2),
-(None, None) => None,
-}
-}
+// // Don't need right now, but will be useful if we add more to the new "other_descriptors" field
+// pub fn combine_json_objects(
+// obj1: Option<serde_json::Value>,
+// obj2: Option<serde_json::Value>,
+// ) -> Option<serde_json::Value> {
+// match (obj1, obj2) {
+// (Some(obj1), Some(obj2)) => {
+// if let (serde_json::Value::Object(mut obj1_map), serde_json::Value::Object(obj2_map)) =
+// (obj1.clone(), obj2.clone())
+// {
+// for (key, value) in obj2_map {
+// obj1_map.insert(key, value);
+// }
+// return Some(serde_json::Value::Object(obj1_map));
+// } else if let serde_json::Value::Object(obj1_map) = obj1 {
+// return Some(serde_json::Value::Object(obj1_map));
+// }
+// Some(obj2)
+// }
+// (Some(obj1), None) => Some(obj1),
+// (None, Some(obj2)) => Some(obj2),
+// (None, None) => None,
+// }
+// }
12 changes: 8 additions & 4 deletions src/rest_api/api/indexing/bulk_index.rs
@@ -1,4 +1,4 @@
-use crate::indexing::{combine_json_objects, index_manager::IndexManager, KNOWN_DESCRIPTORS};
+use crate::indexing::{index_manager::IndexManager, KNOWN_DESCRIPTORS};
use crate::rest_api::api::{
BulkRequest, BulkRequestDoc, PostIndexBulkResponseError, PostIndexBulkResponseOk,
PostIndexBulkResponseOkStatus, PostIndexesBulkIndexResponse,
@@ -41,6 +41,7 @@ pub async fn v1_post_index_bulk(
let pattern_fingerprint_field = schema.get_field("pattern_fingerprint").unwrap();
let morgan_fingerprint_field = schema.get_field("morgan_fingerprint").unwrap();
let extra_data_field = schema.get_field("extra_data").unwrap();
+let other_descriptors_field = schema.get_field("other_descriptors").unwrap();

let descriptors_fields = KNOWN_DESCRIPTORS
.iter()
@@ -59,6 +60,7 @@
morgan_fingerprint_field,
&descriptors_fields,
extra_data_field,
+other_descriptors_field,
)
})
.collect::<Vec<_>>()
@@ -124,6 +126,7 @@ fn bulk_request_doc_to_tantivy_doc(
morgan_fingerprint_field: Field,
descriptors_fields: &HashMap<&str, Field>,
extra_data_field: Field,
+other_descriptors_field: Field,
) -> eyre::Result<impl tantivy::Document> {
// By default, do not attempt to fix problematic molecules
let (canon_taut, pattern_fingerprint, descriptors) =
@@ -150,9 +153,10 @@
false => serde_json::json!({"scaffolds": scaffold_matches}),
};

-let extra_data_json = combine_json_objects(Some(scaffold_json), bulk_request_doc.extra_data);
-if let Some(extra_data_json) = extra_data_json {
-doc.add_field_value(extra_data_field, extra_data_json);
+doc.add_field_value(other_descriptors_field, scaffold_json);
+
+if let Some(extra_data) = bulk_request_doc.extra_data {
+doc.add_field_value(extra_data_field, extra_data);
}

for field in KNOWN_DESCRIPTORS {
8 changes: 6 additions & 2 deletions src/schema/mod.rs
@@ -25,9 +25,13 @@ fn descriptor_v1_schema() -> Schema {
builder.add_bytes_field("pattern_fingerprint", STORED);
builder.add_bytes_field("morgan_fingerprint", STORED);

-let json_options: JsonObjectOptions =
+let extra_data_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT | STORED).set_expand_dots_enabled();
-builder.add_json_field("extra_data", json_options);
+builder.add_json_field("extra_data", extra_data_options);
+
+let other_descriptors_options: JsonObjectOptions =
+JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
+builder.add_json_field("other_descriptors", other_descriptors_options.clone());

builder.build()
}
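Note the asymmetry in this hunk: extra_data keeps TEXT | STORED, while other_descriptors is TEXT only, i.e. indexed for search but never returned with stored documents; that is why the expected extra_data blobs in tests/api_tests.rs below lose their "scaffolds" entries. A small sketch of the two option sets (same tantivy calls as in the hunk; the function name is illustrative):

use tantivy::schema::{JsonObjectOptions, Schema, SchemaBuilder, STORED, TEXT};

fn sketch_json_fields() -> Schema {
    let mut builder = SchemaBuilder::default();
    // extra_data: indexed and stored, so it is searchable and echoed back with hits.
    builder.add_json_field(
        "extra_data",
        JsonObjectOptions::from(TEXT | STORED).set_expand_dots_enabled(),
    );
    // other_descriptors: indexed only, so queries like other_descriptors.scaffolds:42
    // work, but the values are not serialized into stored documents.
    builder.add_json_field(
        "other_descriptors",
        JsonObjectOptions::from(TEXT).set_expand_dots_enabled(),
    );
    builder.build()
}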
4 changes: 2 additions & 2 deletions src/search/identity_search.rs
@@ -128,10 +128,10 @@ pub fn build_identity_query(

if let Some(scaffolds) = matching_scaffolds {
if scaffolds.is_empty() {
query_parts.push("extra_data.scaffolds:-1".to_string());
query_parts.push("other_descriptors.scaffolds:-1".to_string());
} else {
for s in scaffolds {
-query_parts.push(format!("extra_data.scaffolds:{s}"))
+query_parts.push(format!("other_descriptors.scaffolds:{s}"))
}
}
}
8 changes: 4 additions & 4 deletions src/search/structure_search.rs
@@ -185,7 +185,7 @@ pub fn build_substructure_query(
if let Some(scaffolds) = matching_scaffolds {
if !scaffolds.is_empty() {
for s in scaffolds {
-query_parts.push(format!("extra_data.scaffolds:{s}"))
+query_parts.push(format!("other_descriptors.scaffolds:{s}"))
}
}
}
@@ -219,14 +219,14 @@ pub fn build_superstructure_query(

if let Some(scaffolds) = matching_scaffolds {
if scaffolds.is_empty() {
-query = format!("{query} AND extra_data.scaffolds:-1");
+query = format!("{query} AND other_descriptors.scaffolds:-1");
} else {
let mut scaffold_parts = scaffolds
.iter()
-.map(|s| format!("extra_data.scaffolds:{s}"))
+.map(|s| format!("other_descriptors.scaffolds:{s}"))
.collect::<Vec<String>>();

scaffold_parts.push("extra_data.scaffolds:-1".to_string());
scaffold_parts.push("other_descriptors.scaffolds:-1".to_string());

let scaffolds_query = scaffold_parts.join(" OR ");

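To make the rewritten clauses concrete (scaffold IDs 0 and 126 are illustrative values only): a superstructure probe matching two scaffolds now produces a query fragment like the one asserted below, with the -1 sentinel kept as an OR-alternative for documents that matched no scaffold.

fn main() {
    // Example values; real IDs come from scaffold_search at query time.
    let scaffolds: Vec<u64> = vec![0, 126];
    let mut scaffold_parts = scaffolds
        .iter()
        .map(|s| format!("other_descriptors.scaffolds:{s}"))
        .collect::<Vec<String>>();
    // Documents with no scaffold at all remain eligible, mirroring build_superstructure_query.
    scaffold_parts.push("other_descriptors.scaffolds:-1".to_string());
    let scaffolds_query = scaffold_parts.join(" OR ");
    assert_eq!(
        scaffolds_query,
        "other_descriptors.scaffolds:0 OR other_descriptors.scaffolds:126 OR other_descriptors.scaffolds:-1"
    );
}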
19 changes: 9 additions & 10 deletions tests/api_tests.rs
@@ -2,7 +2,7 @@ use cheminee::indexing::index_manager::IndexManager;
use cheminee::rest_api::openapi_server::{api_service, API_PREFIX};
use std::collections::HashMap;

-use cheminee::indexing::{combine_json_objects, KNOWN_DESCRIPTORS};
+use cheminee::indexing::KNOWN_DESCRIPTORS;
use cheminee::search::compound_processing::process_cpd;
use cheminee::search::scaffold_search::{scaffold_search, PARSED_SCAFFOLDS};
use poem::test::TestResponse;
@@ -102,6 +102,7 @@ fn fill_test_index(tantivy_index: Index) -> eyre::Result<()> {

let smiles_field = schema.get_field("smiles")?;
let extra_data_field = schema.get_field("extra_data")?;
+let other_descriptors_field = schema.get_field("other_descriptors")?;
let pattern_fingerprint_field = schema.get_field("pattern_fingerprint")?;
let morgan_fingerprint_field = schema.get_field("morgan_fingerprint")?;
let descriptor_fields = KNOWN_DESCRIPTORS
@@ -135,10 +136,8 @@ fn fill_test_index(tantivy_index: Index) -> eyre::Result<()> {
false => serde_json::json!({"scaffolds": scaffold_matches}),
};

-let extra_data_json = combine_json_objects(Some(scaffold_json), Some(extra_data));
-if let Some(extra_data_json) = extra_data_json {
-doc.add_field_value(extra_data_field, extra_data_json);
-}
+doc.add_field_value(other_descriptors_field, scaffold_json);
+doc.add_field_value(extra_data_field, extra_data);

for field in KNOWN_DESCRIPTORS {
if let Some(val) = descriptors.get(field) {
@@ -279,7 +278,7 @@ async fn test_basic_search() -> eyre::Result<()> {
response.assert_status_is_ok();
response
.assert_json(&serde_json::json!([{
"extra_data": {"extra": "data", "scaffolds": [0, 126]},
"extra_data": {"extra": "data"},
"query": "NumAtoms:[13 TO 100]",
"smiles": "c1ccc(CCc2ccccc2)cc1"
}]))
@@ -310,7 +309,7 @@ async fn test_identity_search() -> eyre::Result<()> {
response.assert_status_is_ok();
response
.assert_json(&serde_json::json!([{
"extra_data": {"extra": "data", "scaffolds": [0, 126]},
"extra_data": {"extra": "data"},
"query": "C1=CC=CC=C1CCC2=CC=CC=C2",
"score": 1.0,
"smiles": "c1ccc(CCc2ccccc2)cc1",
@@ -343,7 +342,7 @@ async fn test_substructure_search() -> eyre::Result<()> {
response.assert_status_is_ok();
response
.assert_json(&serde_json::json!([{
"extra_data": {"extra": "data", "scaffolds": [0, 126]},
"extra_data": {"extra": "data"},
"query": "C1=CC=CC=C1",
"score": 1.0,
"smiles": "c1ccc(CCc2ccccc2)cc1",
@@ -378,14 +377,14 @@ async fn test_superstructure_search() -> eyre::Result<()> {
response
.assert_json(&serde_json::json!([
{
"extra_data": {"extra": "data", "scaffolds": [-1]},
"extra_data": {"extra": "data"},
"query": "C1=CC=CC=C1CCC2=CC=CC=C2",
"score": 1.0,
"smiles": "CC",
"used_tautomers": false
},
{
"extra_data": {"extra": "data", "scaffolds": [0]},
"extra_data": {"extra": "data"},
"query": "C1=CC=CC=C1CCC2=CC=CC=C2",
"score": 1.0,
"smiles": "c1ccccc1",
12 changes: 8 additions & 4 deletions tests/cli_tests.rs
@@ -55,13 +55,17 @@ fn test_create_delete_query() {
}
}

-let json_options: JsonObjectOptions =
+let extra_data_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT | STORED).set_expand_dots_enabled();

-let extra_data_field = builder.add_json_field("extra_data", json_options);
-
+let extra_data_field = builder.add_json_field("extra_data", extra_data_options);
doc.add_field_value(extra_data_field, json!({"extra_data": ""}));

+let other_descriptors_options: JsonObjectOptions =
+JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
+let other_descriptors_field =
+builder.add_json_field("other_descriptors", other_descriptors_options);
+doc.add_field_value(other_descriptors_field, json!({"other_descriptors": ""}));

let schema = builder.build();
let builder = IndexBuilder::new().schema(schema);
let index = builder.create_in_ram().unwrap();
4 changes: 0 additions & 4 deletions tests/cpd_processing_tests.rs
@@ -48,10 +48,6 @@ fn test_add_formal_charge() {

#[test]
fn test_fix_chemistry_problems() {
-let smiles1 = "F[Si-2](F)(F)(F)(F)F.CC";
-let romol1 = fix_chemistry_problems(smiles1).unwrap();
-assert_eq!(romol1.as_smiles(), "CC");

let smiles2 = "C[N](C)(C)C";
let romol2 = fix_chemistry_problems(smiles2).unwrap();
assert_eq!(romol2.as_smiles(), "C[N+](C)(C)C");