Skip to content

Commit 5b94cba

Browse files
committed
While we're changing the schema anyway, move scaffolds out of "extra_data" into a separate JSON field ("other_descriptors") for Cheminee's own use
1 parent e20227c commit 5b94cba

File tree

11 files changed

+90
-69
lines changed

11 files changed

+90
-69
lines changed

src/command_line/indexing/bulk_index.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> {
4646
let pattern_fingerprint_field = schema.get_field("pattern_fingerprint")?;
4747
let morgan_fingerprint_field = schema.get_field("morgan_fingerprint")?;
4848
let extra_data_field = schema.get_field("extra_data")?;
49+
let other_descriptors_field = schema.get_field("other_descriptors")?;
4950
let descriptor_fields = KNOWN_DESCRIPTORS
5051
.iter()
5152
.map(|kd| (*kd, schema.get_field(kd).unwrap()))
@@ -73,6 +74,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> {
7374
morgan_fingerprint_field,
7475
&descriptor_fields,
7576
extra_data_field,
77+
other_descriptors_field,
7678
);
7779

7880
match doc {
@@ -109,6 +111,7 @@ fn create_tantivy_doc(
109111
morgan_fingerprint_field: Field,
110112
descriptor_fields: &HashMap<&str, Field>,
111113
extra_data_field: Field,
114+
other_descriptors_field: Field,
112115
) -> eyre::Result<impl tantivy::Document> {
113116
let smiles = record
114117
.get("smiles")
@@ -133,9 +136,10 @@ fn create_tantivy_doc(
133136
false => serde_json::json!({"scaffolds": scaffold_matches}),
134137
};
135138

136-
let extra_data_json = combine_json_objects(Some(scaffold_json), extra_data);
137-
if let Some(extra_data_json) = extra_data_json {
138-
doc.add_field_value(extra_data_field, extra_data_json);
139+
doc.add_field_value(other_descriptors_field, scaffold_json);
140+
141+
if let Some(extra_data) = extra_data {
142+
doc.add_field_value(extra_data_field, extra_data);
139143
}
140144

141145
for field in KNOWN_DESCRIPTORS {

src/command_line/indexing/index_sdf.rs

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> {
9393
let smiles_field = schema.get_field("smiles")?;
9494
let pattern_fingerprint_field = schema.get_field("pattern_fingerprint")?;
9595
let morgan_fingerprint_field = schema.get_field("morgan_fingerprint")?;
96-
let extra_data_field = schema.get_field("extra_data")?;
96+
let other_descriptors_field = schema.get_field("other_descriptors")?;
9797
let descriptor_fields = KNOWN_DESCRIPTORS
9898
.iter()
9999
.map(|kd| (*kd, schema.get_field(kd).unwrap()))
@@ -124,7 +124,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> {
124124
pattern_fingerprint_field,
125125
morgan_fingerprint_field,
126126
&descriptor_fields,
127-
extra_data_field,
127+
other_descriptors_field,
128128
);
129129

130130
match doc {
@@ -173,7 +173,7 @@ pub fn action(matches: &ArgMatches) -> eyre::Result<()> {
173173
pattern_fingerprint_field,
174174
morgan_fingerprint_field,
175175
&descriptor_fields,
176-
extra_data_field,
176+
other_descriptors_field,
177177
);
178178

179179
match doc {
@@ -218,7 +218,7 @@ fn create_tantivy_doc(
218218
pattern_fingerprint_field: Field,
219219
morgan_fingerprint_field: Field,
220220
descriptor_fields: &HashMap<&str, Field>,
221-
extra_data_field: Field,
221+
other_descriptors_field: Field,
222222
) -> eyre::Result<impl tantivy::Document> {
223223
// By default, do not attempt to fix problematic molecules
224224
let (canon_taut, fp, descriptors) = process_cpd(mol.as_smiles().as_str(), false)?;
@@ -244,13 +244,12 @@ fn create_tantivy_doc(
244244
}
245245

246246
let scaffold_matches = scaffold_search(&fp.0, &canon_taut, &PARSED_SCAFFOLDS)?;
247-
248247
let scaffold_json = match scaffold_matches.is_empty() {
249248
true => serde_json::json!({"scaffolds": vec![-1]}),
250249
false => serde_json::json!({"scaffolds": scaffold_matches}),
251250
};
252251

253-
doc.add_field_value(extra_data_field, scaffold_json);
252+
doc.add_field_value(other_descriptors_field, scaffold_json);
254253

255254
Ok(doc)
256255
}

src/indexing/mod.rs

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -77,26 +77,27 @@ pub fn open_index(p: impl AsRef<Path>) -> eyre::Result<Index> {
7777
Ok(index)
7878
}
7979

80-
pub fn combine_json_objects(
81-
obj1: Option<serde_json::Value>,
82-
obj2: Option<serde_json::Value>,
83-
) -> Option<serde_json::Value> {
84-
match (obj1, obj2) {
85-
(Some(obj1), Some(obj2)) => {
86-
if let (serde_json::Value::Object(mut obj1_map), serde_json::Value::Object(obj2_map)) =
87-
(obj1.clone(), obj2.clone())
88-
{
89-
for (key, value) in obj2_map {
90-
obj1_map.insert(key, value);
91-
}
92-
return Some(serde_json::Value::Object(obj1_map));
93-
} else if let serde_json::Value::Object(obj1_map) = obj1 {
94-
return Some(serde_json::Value::Object(obj1_map));
95-
}
96-
Some(obj2)
97-
}
98-
(Some(obj1), None) => Some(obj1),
99-
(None, Some(obj2)) => Some(obj2),
100-
(None, None) => None,
101-
}
102-
}
80+
// Not needed right now, but will be useful if we add more to the new "other_descriptors" field
81+
// pub fn combine_json_objects(
82+
// obj1: Option<serde_json::Value>,
83+
// obj2: Option<serde_json::Value>,
84+
// ) -> Option<serde_json::Value> {
85+
// match (obj1, obj2) {
86+
// (Some(obj1), Some(obj2)) => {
87+
// if let (serde_json::Value::Object(mut obj1_map), serde_json::Value::Object(obj2_map)) =
88+
// (obj1.clone(), obj2.clone())
89+
// {
90+
// for (key, value) in obj2_map {
91+
// obj1_map.insert(key, value);
92+
// }
93+
// return Some(serde_json::Value::Object(obj1_map));
94+
// } else if let serde_json::Value::Object(obj1_map) = obj1 {
95+
// return Some(serde_json::Value::Object(obj1_map));
96+
// }
97+
// Some(obj2)
98+
// }
99+
// (Some(obj1), None) => Some(obj1),
100+
// (None, Some(obj2)) => Some(obj2),
101+
// (None, None) => None,
102+
// }
103+
// }

src/rest_api/api/indexing/bulk_index.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::indexing::{combine_json_objects, index_manager::IndexManager, KNOWN_DESCRIPTORS};
1+
use crate::indexing::{index_manager::IndexManager, KNOWN_DESCRIPTORS};
22
use crate::rest_api::api::{
33
BulkRequest, BulkRequestDoc, PostIndexBulkResponseError, PostIndexBulkResponseOk,
44
PostIndexBulkResponseOkStatus, PostIndexesBulkIndexResponse,
@@ -41,6 +41,7 @@ pub async fn v1_post_index_bulk(
4141
let pattern_fingerprint_field = schema.get_field("pattern_fingerprint").unwrap();
4242
let morgan_fingerprint_field = schema.get_field("morgan_fingerprint").unwrap();
4343
let extra_data_field = schema.get_field("extra_data").unwrap();
44+
let other_descriptors_field = schema.get_field("other_descriptors").unwrap();
4445

4546
let descriptors_fields = KNOWN_DESCRIPTORS
4647
.iter()
@@ -59,6 +60,7 @@ pub async fn v1_post_index_bulk(
5960
morgan_fingerprint_field,
6061
&descriptors_fields,
6162
extra_data_field,
63+
other_descriptors_field,
6264
)
6365
})
6466
.collect::<Vec<_>>()
@@ -124,6 +126,7 @@ fn bulk_request_doc_to_tantivy_doc(
124126
morgan_fingerprint_field: Field,
125127
descriptors_fields: &HashMap<&str, Field>,
126128
extra_data_field: Field,
129+
other_descriptors_field: Field,
127130
) -> eyre::Result<impl tantivy::Document> {
128131
// By default, do not attempt to fix problematic molecules
129132
let (canon_taut, pattern_fingerprint, descriptors) =
@@ -150,9 +153,10 @@ fn bulk_request_doc_to_tantivy_doc(
150153
false => serde_json::json!({"scaffolds": scaffold_matches}),
151154
};
152155

153-
let extra_data_json = combine_json_objects(Some(scaffold_json), bulk_request_doc.extra_data);
154-
if let Some(extra_data_json) = extra_data_json {
155-
doc.add_field_value(extra_data_field, extra_data_json);
156+
doc.add_field_value(other_descriptors_field, scaffold_json);
157+
158+
if let Some(extra_data) = bulk_request_doc.extra_data {
159+
doc.add_field_value(extra_data_field, extra_data);
156160
}
157161

158162
for field in KNOWN_DESCRIPTORS {

src/schema/mod.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,13 @@ fn descriptor_v1_schema() -> Schema {
2525
builder.add_bytes_field("pattern_fingerprint", STORED);
2626
builder.add_bytes_field("morgan_fingerprint", STORED);
2727

28-
let json_options: JsonObjectOptions =
28+
let extra_data_options: JsonObjectOptions =
2929
JsonObjectOptions::from(TEXT | STORED).set_expand_dots_enabled();
30-
builder.add_json_field("extra_data", json_options);
30+
builder.add_json_field("extra_data", extra_data_options);
31+
32+
let other_descriptors_options: JsonObjectOptions =
33+
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
34+
builder.add_json_field("other_descriptors", other_descriptors_options.clone());
3135

3236
builder.build()
3337
}

src/search/identity_search.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,10 @@ pub fn build_identity_query(
128128

129129
if let Some(scaffolds) = matching_scaffolds {
130130
if scaffolds.is_empty() {
131-
query_parts.push("extra_data.scaffolds:-1".to_string());
131+
query_parts.push("other_descriptors.scaffolds:-1".to_string());
132132
} else {
133133
for s in scaffolds {
134-
query_parts.push(format!("extra_data.scaffolds:{s}"))
134+
query_parts.push(format!("other_descriptors.scaffolds:{s}"))
135135
}
136136
}
137137
}

src/search/structure_search.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ pub fn build_substructure_query(
185185
if let Some(scaffolds) = matching_scaffolds {
186186
if !scaffolds.is_empty() {
187187
for s in scaffolds {
188-
query_parts.push(format!("extra_data.scaffolds:{s}"))
188+
query_parts.push(format!("other_descriptors.scaffolds:{s}"))
189189
}
190190
}
191191
}
@@ -219,14 +219,14 @@ pub fn build_superstructure_query(
219219

220220
if let Some(scaffolds) = matching_scaffolds {
221221
if scaffolds.is_empty() {
222-
query = format!("{query} AND extra_data.scaffolds:-1");
222+
query = format!("{query} AND other_descriptors.scaffolds:-1");
223223
} else {
224224
let mut scaffold_parts = scaffolds
225225
.iter()
226-
.map(|s| format!("extra_data.scaffolds:{s}"))
226+
.map(|s| format!("other_descriptors.scaffolds:{s}"))
227227
.collect::<Vec<String>>();
228228

229-
scaffold_parts.push("extra_data.scaffolds:-1".to_string());
229+
scaffold_parts.push("other_descriptors.scaffolds:-1".to_string());
230230

231231
let scaffolds_query = scaffold_parts.join(" OR ");
232232

tests/api_tests.rs

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use cheminee::indexing::index_manager::IndexManager;
22
use cheminee::rest_api::openapi_server::{api_service, API_PREFIX};
33
use std::collections::HashMap;
44

5-
use cheminee::indexing::{combine_json_objects, KNOWN_DESCRIPTORS};
5+
use cheminee::indexing::KNOWN_DESCRIPTORS;
66
use cheminee::search::compound_processing::process_cpd;
77
use cheminee::search::scaffold_search::{scaffold_search, PARSED_SCAFFOLDS};
88
use poem::test::TestResponse;
@@ -102,6 +102,7 @@ fn fill_test_index(tantivy_index: Index) -> eyre::Result<()> {
102102

103103
let smiles_field = schema.get_field("smiles")?;
104104
let extra_data_field = schema.get_field("extra_data")?;
105+
let other_descriptors_field = schema.get_field("other_descriptors")?;
105106
let pattern_fingerprint_field = schema.get_field("pattern_fingerprint")?;
106107
let morgan_fingerprint_field = schema.get_field("morgan_fingerprint")?;
107108
let descriptor_fields = KNOWN_DESCRIPTORS
@@ -135,10 +136,8 @@ fn fill_test_index(tantivy_index: Index) -> eyre::Result<()> {
135136
false => serde_json::json!({"scaffolds": scaffold_matches}),
136137
};
137138

138-
let extra_data_json = combine_json_objects(Some(scaffold_json), Some(extra_data));
139-
if let Some(extra_data_json) = extra_data_json {
140-
doc.add_field_value(extra_data_field, extra_data_json);
141-
}
139+
doc.add_field_value(other_descriptors_field, scaffold_json);
140+
doc.add_field_value(extra_data_field, extra_data);
142141

143142
for field in KNOWN_DESCRIPTORS {
144143
if let Some(val) = descriptors.get(field) {
@@ -279,7 +278,7 @@ async fn test_basic_search() -> eyre::Result<()> {
279278
response.assert_status_is_ok();
280279
response
281280
.assert_json(&serde_json::json!([{
282-
"extra_data": {"extra": "data", "scaffolds": [0, 126]},
281+
"extra_data": {"extra": "data"},
283282
"query": "NumAtoms:[13 TO 100]",
284283
"smiles": "c1ccc(CCc2ccccc2)cc1"
285284
}]))
@@ -310,7 +309,7 @@ async fn test_identity_search() -> eyre::Result<()> {
310309
response.assert_status_is_ok();
311310
response
312311
.assert_json(&serde_json::json!([{
313-
"extra_data": {"extra": "data", "scaffolds": [0, 126]},
312+
"extra_data": {"extra": "data"},
314313
"query": "C1=CC=CC=C1CCC2=CC=CC=C2",
315314
"score": 1.0,
316315
"smiles": "c1ccc(CCc2ccccc2)cc1",
@@ -343,7 +342,7 @@ async fn test_substructure_search() -> eyre::Result<()> {
343342
response.assert_status_is_ok();
344343
response
345344
.assert_json(&serde_json::json!([{
346-
"extra_data": {"extra": "data", "scaffolds": [0, 126]},
345+
"extra_data": {"extra": "data"},
347346
"query": "C1=CC=CC=C1",
348347
"score": 1.0,
349348
"smiles": "c1ccc(CCc2ccccc2)cc1",
@@ -378,14 +377,14 @@ async fn test_superstructure_search() -> eyre::Result<()> {
378377
response
379378
.assert_json(&serde_json::json!([
380379
{
381-
"extra_data": {"extra": "data", "scaffolds": [-1]},
380+
"extra_data": {"extra": "data"},
382381
"query": "C1=CC=CC=C1CCC2=CC=CC=C2",
383382
"score": 1.0,
384383
"smiles": "CC",
385384
"used_tautomers": false
386385
},
387386
{
388-
"extra_data": {"extra": "data", "scaffolds": [0]},
387+
"extra_data": {"extra": "data"},
389388
"query": "C1=CC=CC=C1CCC2=CC=CC=C2",
390389
"score": 1.0,
391390
"smiles": "c1ccccc1",

tests/cli_tests.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,17 @@ fn test_create_delete_query() {
5555
}
5656
}
5757

58-
let json_options: JsonObjectOptions =
58+
let extra_data_options: JsonObjectOptions =
5959
JsonObjectOptions::from(TEXT | STORED).set_expand_dots_enabled();
60-
61-
let extra_data_field = builder.add_json_field("extra_data", json_options);
62-
60+
let extra_data_field = builder.add_json_field("extra_data", extra_data_options);
6361
doc.add_field_value(extra_data_field, json!({"extra_data": ""}));
6462

63+
let other_descriptors_options: JsonObjectOptions =
64+
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
65+
let other_descriptors_field =
66+
builder.add_json_field("other_descriptors", other_descriptors_options);
67+
doc.add_field_value(other_descriptors_field, json!({"other_descriptors": ""}));
68+
6569
let schema = builder.build();
6670
let builder = IndexBuilder::new().schema(schema);
6771
let index = builder.create_in_ram().unwrap();

tests/cpd_processing_tests.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,6 @@ fn test_add_formal_charge() {
4848

4949
#[test]
5050
fn test_fix_chemistry_problems() {
51-
let smiles1 = "F[Si-2](F)(F)(F)(F)F.CC";
52-
let romol1 = fix_chemistry_problems(smiles1).unwrap();
53-
assert_eq!(romol1.as_smiles(), "CC");
54-
5551
let smiles2 = "C[N](C)(C)C";
5652
let romol2 = fix_chemistry_problems(smiles2).unwrap();
5753
assert_eq!(romol2.as_smiles(), "C[N+](C)(C)C");

0 commit comments

Comments
 (0)