Commit

Merge pull request #5 from EBISPOT/hett

Hett

jamesamcl authored Jul 24, 2024
2 parents 944cfdd + 6dbbdde commit d0414e7
Showing 26 changed files with 521 additions and 178 deletions.
39 changes: 39 additions & 0 deletions 00_fetch_data/chembl/export.py
@@ -0,0 +1,39 @@
import sqlite3
import json
import sys

def export_tables_to_jsonl(sqlite_file):
    conn = sqlite3.connect(sqlite_file)
    cursor = conn.cursor()

    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    for table in tables:
        table_name = table[0]

        cursor.execute(f"PRAGMA table_info({table_name});")
        columns_info = cursor.fetchall()
        primary_key = None
        for column_info in columns_info:
            if column_info[5] == 1: # PK column
                primary_key = column_info[1]
                break

        cursor.execute(f"SELECT * FROM {table_name};")
        rows = cursor.fetchall()
        column_names = [description[0] for description in cursor.description]

        for row in rows:
            row_dict = dict(zip(column_names, row))
            row_dict = {f"chembl:{key}": value for key, value in row_dict.items()}
            if primary_key:
                row_dict["id"] = row_dict["chembl:"+primary_key]
            row_dict["grebi:type"] = f"chembl:{table_name}"
            print(json.dumps(row_dict))

    conn.close()

export_tables_to_jsonl('chembl_34/chembl_34_sqlite/chembl_34.db')
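Each output line is one table row as a JSON object: column names are namespaced under chembl:, the row's primary key (if any) is copied to id, and the table name becomes the grebi:type. An illustrative output line, assuming ChEMBL's molecule_dictionary table with primary key molregno (row values hypothetical):

{"chembl:molregno": 1, "chembl:pref_name": "ASPIRIN", "id": 1, "grebi:type": "chembl:molecule_dictionary"}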


3 changes: 3 additions & 0 deletions 00_fetch_data/hett_pesticides/download_epa.sh
@@ -0,0 +1,3 @@
#!/bin/bash

wget https://www3.epa.gov/pesticides/appril/apprildatadump_public.xlsx
3 changes: 3 additions & 0 deletions 00_fetch_data/hett_pesticides/download_hse.sh
@@ -0,0 +1,3 @@
#!/bin/bash

wget https://www.hse.gov.uk/pesticides/assets/docs/active-substance-register.xlsx
2 changes: 1 addition & 1 deletion 00_fetch_data/hra_kg/fetch.sh
@@ -1,6 +1,6 @@
 #!/bin/bash

-rm -f https://cdn.humanatlas.io/digital-objects/blazegraph.jnl
+rm -f blazegraph.jnl
 wget https://cdn.humanatlas.io/digital-objects/blazegraph.jnl

 docker run --entrypoint /data/entrypoint.dockersh -v $(pwd):/data ghcr.io/ebispot/blazegraph-docker:2.1.5
8 changes: 7 additions & 1 deletion 01_ingest/grebi_ingest_mondo_efo_mappings/ingest.py
@@ -2,7 +2,13 @@

 import sys
 import json
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--datasource-name', required=True)
+parser.add_argument('--filename', required=False)
+args = parser.parse_args()

 for line in sys.stdin:
     tokens = line.strip().split('\t')
-    print(json.dumps({ 'id': tokens[0], 'grebi:equivalentTo': tokens[1].split(',') }))
+    print(json.dumps({ 'id': tokens[0], 'grebi:datasource': args.datasource_name, 'grebi:equivalentTo': tokens[1].split(',') }))
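A hypothetical invocation, with a single MONDO-to-EFO mapping line on stdin (the datasource name and mapping are illustrative):

printf 'MONDO:0005015\tEFO:0000400\n' | python3 ingest.py --datasource-name MONDO_EFO
{"id": "MONDO:0005015", "grebi:datasource": "MONDO_EFO", "grebi:equivalentTo": ["EFO:0000400"]}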
44 changes: 44 additions & 0 deletions 01_ingest/hett_pesticides_appril.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python3

import sys
import json
import argparse
import pandas as pd
import io
import re

parser = argparse.ArgumentParser()
parser.add_argument('--datasource-name', required=True)
parser.add_argument('--filename', required=False)
args = parser.parse_args()

df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), dtype=str)
df.rename(columns={col: 'grebi:name' for col in df.columns if col == 'PRODUCT_NAME'}, inplace=True)
df['id'] = 'appril:'+df['REG_NUM']
df['grebi:type'] = 'hett:PesticideProduct'
df['grebi:datasource'] = args.datasource_name

df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

for obj in df.to_dict(orient='records'):
    obj = {k: v for k, v in obj.items() if pd.notna(v)}

    if 'PESTS' in obj:
        obj['PESTS'] = list(map(lambda p: p.strip(), obj['PESTS'].split(',')))
    if 'SITES' in obj:
        obj['SITES'] = list(map(lambda p: p.strip(), obj['SITES'].split(',')))

    if 'AIS' in obj:
        cas = list(map(lambda cas: 'cas:'+cas, re.findall(r'\d{1,7}-\d{2}-\d', obj['AIS'])))
        for c in cas:
            print(json.dumps({'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': args.datasource_name}))
        obj['hett:hasActiveIngredient'] = cas

    if 'INERTS' in obj:
        cas = list(map(lambda cas: 'cas:'+cas, re.findall(r'\d{1,7}-\d{2}-\d', obj['INERTS'])))
        for c in cas:
            print(json.dumps({'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': args.datasource_name}))
        obj['hett:hasInertIngredient'] = cas

    print(json.dumps(obj))
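Active and inert ingredients are linked by extracting CAS registry numbers (up to 7 digits, 2 digits, then a check digit, hyphen-separated) from the free-text AIS/INERTS fields. A minimal sketch of that extraction (input string hypothetical):

import re

# CAS numbers look like 1071-83-6: 1-7 digits, 2 digits, 1 check digit
text = "Glyphosate (1071-83-6); Imidacloprid 138261-41-3"
print(re.findall(r'\d{1,7}-\d{2}-\d', text))  # ['1071-83-6', '138261-41-3']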

46 changes: 46 additions & 0 deletions 01_ingest/hett_pesticides_eu.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python3

import sys
import json
import argparse
import pandas as pd
import io
import re

parser = argparse.ArgumentParser()
parser.add_argument('--datasource-name', required=True)
parser.add_argument('--filename', required=False)
args = parser.parse_args()

df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), skiprows=3, dtype=str)
df.rename(columns={col: 'Category' for col in df.columns if col.startswith('Category')}, inplace=True)

df['id'] = df['Substance Name']
df.rename(columns={col: 'grebi:name' for col in df.columns if col == 'Substance Name'}, inplace=True)

df['grebi:type'] = 'hett:AgroSubstance'
df['grebi:datasource'] = args.datasource_name

df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

for obj in df.to_dict(orient='records'):
    obj = {k: v for k, v in obj.items() if pd.notna(v)}

    if 'Category' in obj:
        obj['Category'] = obj['Category'].split(',')

    if 'CAS Number' in obj:
        # match cas numbers by regex
        cas = list(map(lambda cas: 'cas:'+cas, re.findall(r'\d{1,7}-\d{2}-\d', obj['CAS Number'])))
        for c in cas:
            print(json.dumps({'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': args.datasource_name}))
        obj['CAS Number'] = cas

    if 'IUPAC Name' in obj:
        iupac = list(map(lambda iupac: iupac.strip(), re.split(r', | or |;', obj['IUPAC Name'])))
        iupac = list(map(lambda i: i.strip(), iupac))
        iupac = list(filter(lambda i: not i.lower().startswith('not '), iupac))
        obj['grebi:equivalentTo'] = iupac

    print(json.dumps(obj))
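The IUPAC Name cell can hold several alternatives separated by ", ", " or ", or ";", and placeholder entries like "not applicable" are filtered out. A small sketch of that normalisation (cell value hypothetical):

import re

cell = "abamectin or avermectin B1a; not stated"
names = [n.strip() for n in re.split(r', | or |;', cell)]
names = [n for n in names if not n.lower().startswith('not ')]
print(names)  # ['abamectin', 'avermectin B1a']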

40 changes: 40 additions & 0 deletions 01_ingest/hett_pesticides_gb.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python3

import sys
import json
import argparse
import pandas as pd
import io
import re

parser = argparse.ArgumentParser()
parser.add_argument('--datasource-name', required=True)
parser.add_argument('--filename', required=False)
args = parser.parse_args()

df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), skiprows=2, dtype=str)
# df.rename(columns={col: 'Status' for col in df.columns if col.startswith('Category')}, inplace=True)

df['id'] = df['Substance']
df.rename(columns={col: 'grebi:name' for col in df.columns if col == 'Substance'}, inplace=True)

df['grebi:type'] = 'hett:AgroSubstance'
df['grebi:datasource'] = args.datasource_name

df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

for obj in df.to_dict(orient='records'):
    obj = {k: v for k, v in obj.items() if pd.notna(v)}

    if 'Authorised' in obj:
        obj['Authorised'] = list(map(lambda p: p.strip(), obj['Authorised'].split(',')))

    if 'CAS Number' in obj:
        # match cas numbers by regex
        cas = list(map(lambda cas: 'cas:'+cas, re.findall(r'\d{1,7}-\d{2}-\d', obj['CAS Number'])))
        for c in cas:
            print(json.dumps({'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': args.datasource_name}))
        obj['CAS Number'] = cas

    print(json.dumps(obj))
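A hypothetical end-to-end run, pairing the HSE download script above with this ingest (the datasource name is illustrative); the spreadsheet is read from stdin:

./download_hse.sh
python3 hett_pesticides_gb.py --datasource-name HETT_PESTICIDES_GB < active-substance-register.xlsx > nodes.jsonl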

15 changes: 12 additions & 3 deletions 02_equivalences/grebi_assign_ids/src/main.rs
@@ -18,8 +18,11 @@ use grebi_shared::find_strings;
 struct Args {

     #[arg(long)]
-    groups_txt: String,
+    add_prefix: String, // used to prepend the subgraph name like hra_kg:
+
+    #[arg(long)]
+    groups_txt: String,

     #[arg(long)]
     preserve_field: Vec<String>

@@ -33,6 +36,8 @@ fn main() {
     let args = Args::parse();
     let preserve_fields:HashSet<Vec<u8>> = args.preserve_field.iter().map(|x| x.as_bytes().to_vec()).collect();

+    let add_prefix = args.add_prefix;
+
     let id_to_group:HashMap<Vec<u8>, Vec<u8>> = {

         let start_time = std::time::Instant::now();
@@ -102,11 +107,13 @@

         // the subject mapped to an equivalence group
         writer.write_all("{\"grebi:nodeId\":\"".as_bytes()).unwrap();
+        writer.write_all(add_prefix.as_bytes()).unwrap();
         writer.write_all(group.unwrap().as_slice()).unwrap();
         writer.write_all("\"".as_bytes()).unwrap();
     } else {
         // the subject did not map to an equivalence group
         writer.write_all("{\"grebi:nodeId\":\"".as_bytes()).unwrap();
+        writer.write_all(add_prefix.as_bytes()).unwrap();
         writer.write_all(id.unwrap()).unwrap();
         writer.write_all("\"".as_bytes()).unwrap();
     }
@@ -122,6 +129,7 @@
     } else {
         let name_group = id_to_group.get(name);
         if name_group.is_some() {
+            writer.write_all(add_prefix.as_bytes()).unwrap();
             writer.write_all(name_group.unwrap()).unwrap();
         } else {
             writer.write_all(name).unwrap();
@@ -132,7 +140,7 @@
         if name.eq(b"id") || preserve_fields.contains(name) {
             writer.write_all(json.value()).unwrap();
         } else {
-            write_value(&mut writer, json.value(), &id_to_group);
+            write_value(&mut writer, json.value(), &id_to_group, &add_prefix);
         }
     }

@@ -143,7 +151,7 @@

 }

-fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&HashMap<Vec<u8>, Vec<u8>>) {
+fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&HashMap<Vec<u8>, Vec<u8>>, add_prefix:&str) {

     let string_locations = find_strings(&value);

@@ -166,6 +174,7 @@ fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&

     let pv_group = id_to_group.get(str);
     if pv_group.is_some() {
+        writer.write_all(add_prefix.as_bytes()).unwrap();
         writer.write_all(pv_group.unwrap()).unwrap();
     } else {
         writer.write_all(str).unwrap();
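The new --add-prefix argument namespaces every emitted node ID with its subgraph (e.g. hra_kg:), whether or not the ID resolved to an equivalence group, so identical CURIEs from different subgraphs become distinct nodes. A rough Python sketch of the mapping rule (function and sample data are illustrative, not the Rust API):

def assign_node_id(raw_id, id_to_group, add_prefix):
    # The group ID wins if the raw ID belongs to an equivalence group;
    # the subgraph prefix is prepended either way.
    return add_prefix + id_to_group.get(raw_id, raw_id)

id_to_group = {"EFO:0000400": "MONDO:0005015"}
print(assign_node_id("EFO:0000400", id_to_group, "impc:"))   # impc:MONDO:0005015
print(assign_node_id("CHEBI:15377", id_to_group, "impc:"))   # impc:CHEBI:15377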
12 changes: 7 additions & 5 deletions 05_materialise/grebi_materialise/src/main.rs
@@ -91,7 +91,7 @@ fn main() -> std::io::Result<()> {

         sliced.props.iter().for_each(|prop| {
             for val in &prop.values {
-                maybe_write_edge(sliced.id, prop, &val, &mut edges_writer, &exclude, &node_metadata, &val.datasources);
+                maybe_write_edge(sliced.id, prop, &val, &mut edges_writer, &exclude, &node_metadata, &val.datasources, sliced.subgraph);
             }
         });

@@ -123,7 +123,7 @@
     Ok(())
 }

-fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyValue, edges_writer: &mut BufWriter<File>, exclude:&BTreeSet<Vec<u8>>, node_metadata:&BTreeMap<Vec<u8>, Metadata>, datasources:&Vec<&[u8]>) {
+fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyValue, edges_writer: &mut BufWriter<File>, exclude:&BTreeSet<Vec<u8>>, node_metadata:&BTreeMap<Vec<u8>, Metadata>, datasources:&Vec<&[u8]>, subgraph:&[u8]) {

     if prop.key.eq(b"id") || prop.key.starts_with(b"grebi:") || exclude.contains(prop.key) {
         return;
@@ -140,7 +140,7 @@
             let str = JsonParser::parse(&buf).string();
             let exists = node_metadata.contains_key(str);
             if exists {
-                write_edge(from_id, str, prop.key, Some(&reified_u.props), edges_writer, node_metadata, &datasources);
+                write_edge(from_id, str, prop.key, Some(&reified_u.props), edges_writer, node_metadata, &datasources, &subgraph);
             }
         } else {
             // panic!("unexpected kind: {:?}", reified_u.value_kind);
@@ -154,7 +154,7 @@
         let exists = node_metadata.contains_key(str);

         if exists {
-            write_edge(from_id, str, prop.key, None, edges_writer, node_metadata, &datasources);
+            write_edge(from_id, str, prop.key, None, edges_writer, node_metadata, &datasources, &subgraph);
         }

     } else if val.kind == JsonTokenType::StartArray {
@@ -169,12 +169,14 @@

 }

-fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<SlicedProperty>>, edges_writer: &mut BufWriter<File>, node_metadata:&BTreeMap<Vec<u8>,Metadata>, datasources:&Vec<&[u8]>) {
+fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<SlicedProperty>>, edges_writer: &mut BufWriter<File>, node_metadata:&BTreeMap<Vec<u8>,Metadata>, datasources:&Vec<&[u8]>, subgraph:&[u8]) {

     let mut buf = Vec::new();

     buf.extend(b"\"grebi:type\":\"");
     buf.extend(edge);
+    buf.extend(b"\",\"grebi:subgraph\":\"");
+    buf.extend(subgraph);
     buf.extend(b"\",\"grebi:from\":\"");
     buf.extend(from_id);
     buf.extend(b"\",\"grebi:to\":\"");
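With the subgraph threaded through maybe_write_edge into write_edge, every materialised edge record now carries a grebi:subgraph field alongside its type and endpoints. An illustrative fragment of an edge record (values hypothetical; the enclosing braces and remaining fields are written elsewhere in this function):

"grebi:type":"hett:hasActiveIngredient","grebi:subgraph":"hett","grebi:from":"hett:appril:100-1000","grebi:to":"hett:cas:1071-83-6"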