Commit

Merge pull request #5 from EBISPOT/hett

Hett

jamesamcl authored Jul 24, 2024
2 parents 944cfdd + 6dbbdde commit d0414e7
Showing 26 changed files with 521 additions and 178 deletions.
39 changes: 39 additions & 0 deletions 00_fetch_data/chembl/export.py
@@ -0,0 +1,39 @@
import sqlite3
import json
import sys

def export_tables_to_jsonl(sqlite_file):
    conn = sqlite3.connect(sqlite_file)
    cursor = conn.cursor()

    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    for table in tables:
        table_name = table[0]

        cursor.execute(f"PRAGMA table_info({table_name});")
        columns_info = cursor.fetchall()
        primary_key = None
        for column_info in columns_info:
            if column_info[5] == 1: # PK column
                primary_key = column_info[1]
                break

        cursor.execute(f"SELECT * FROM {table_name};")
        rows = cursor.fetchall()
        column_names = [description[0] for description in cursor.description]

        for row in rows:
            row_dict = dict(zip(column_names, row))
            row_dict = {f"chembl:{key}": value for key, value in row_dict.items()}
            if primary_key:
                row_dict["id"] = row_dict["chembl:"+primary_key]
            row_dict["grebi:type"] = f"chembl:{table_name}"
            print(json.dumps(row_dict))

    conn.close()

export_tables_to_jsonl('chembl_34/chembl_34_sqlite/chembl_34.db')
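Each output line is one table row as a JSON object: column names are namespaced under chembl:, the row's primary key (if any) is copied to id, and the table name becomes the grebi:type. An illustrative output line, assuming ChEMBL's molecule_dictionary table with primary key molregno (row values hypothetical):

{"chembl:molregno": 1, "chembl:pref_name": "ASPIRIN", "id": 1, "grebi:type": "chembl:molecule_dictionary"}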


3 changes: 3 additions & 0 deletions 00_fetch_data/hett_pesticides/download_epa.sh
@@ -0,0 +1,3 @@
#!/bin/bash

wget https://www3.epa.gov/pesticides/appril/apprildatadump_public.xlsx
3 changes: 3 additions & 0 deletions 00_fetch_data/hett_pesticides/download_hse.sh
@@ -0,0 +1,3 @@
#!/bin/bash

wget https://www.hse.gov.uk/pesticides/assets/docs/active-substance-register.xlsx
2 changes: 1 addition & 1 deletion 00_fetch_data/hra_kg/fetch.sh
@@ -1,6 +1,6 @@
 #!/bin/bash

-rm -f https://cdn.humanatlas.io/digital-objects/blazegraph.jnl
+rm -f blazegraph.jnl
 wget https://cdn.humanatlas.io/digital-objects/blazegraph.jnl

 docker run --entrypoint /data/entrypoint.dockersh -v $(pwd):/data ghcr.io/ebispot/blazegraph-docker:2.1.5
8 changes: 7 additions & 1 deletion 01_ingest/grebi_ingest_mondo_efo_mappings/ingest.py
@@ -2,7 +2,13 @@

 import sys
 import json
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--datasource-name', required=True)
+parser.add_argument('--filename', required=False)
+args = parser.parse_args()

 for line in sys.stdin:
     tokens = line.strip().split('\t')
-    print(json.dumps({ 'id': tokens[0], 'grebi:equivalentTo': tokens[1].split(',') }))
+    print(json.dumps({ 'id': tokens[0], 'grebi:datasource': args.datasource_name, 'grebi:equivalentTo': tokens[1].split(',') }))
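A hypothetical invocation, with a single MONDO-to-EFO mapping line on stdin (the datasource name and mapping are illustrative):

printf 'MONDO:0005015\tEFO:0000400\n' | python3 ingest.py --datasource-name MONDO_EFO
{"id": "MONDO:0005015", "grebi:datasource": "MONDO_EFO", "grebi:equivalentTo": ["EFO:0000400"]}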
44 changes: 44 additions & 0 deletions 01_ingest/hett_pesticides_appril.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python3

import sys
import json
import argparse
import pandas as pd
import io
import re

parser = argparse.ArgumentParser()
parser.add_argument('--datasource-name', required=True)
parser.add_argument('--filename', required=False)
args = parser.parse_args()

df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), dtype=str)
df.rename(columns={col: 'grebi:name' for col in df.columns if col == 'PRODUCT_NAME'}, inplace=True)
df['id'] = 'appril:'+df['REG_NUM']
df['grebi:type'] = 'hett:PesticideProduct'
df['grebi:datasource'] = args.datasource_name

df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

for obj in df.to_dict(orient='records'):
    obj = {k: v for k, v in obj.items() if pd.notna(v)}

    if 'PESTS' in obj:
        obj['PESTS'] = list(map(lambda p: p.strip(), obj['PESTS'].split(',')))
    if 'SITES' in obj:
        obj['SITES'] = list(map(lambda p: p.strip(), obj['SITES'].split(',')))

    if 'AIS' in obj:
        cas = list(map(lambda cas: 'cas:'+cas, re.findall(r'\d{1,7}-\d{2}-\d', obj['AIS'])))
        for c in cas:
            print(json.dumps({'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': args.datasource_name}))
        obj['hett:hasActiveIngredient'] = cas

    if 'INERTS' in obj:
        cas = list(map(lambda cas: 'cas:'+cas, re.findall(r'\d{1,7}-\d{2}-\d', obj['INERTS'])))
        for c in cas:
            print(json.dumps({'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': args.datasource_name}))
        obj['hett:hasInertIngredient'] = cas

    print(json.dumps(obj))
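Active and inert ingredients are linked by extracting CAS registry numbers (up to 7 digits, 2 digits, then a check digit, hyphen-separated) from the free-text AIS/INERTS fields. A minimal sketch of that extraction (input string hypothetical):

import re

# CAS numbers look like 1071-83-6: 1-7 digits, 2 digits, 1 check digit
text = "Glyphosate (1071-83-6); Imidacloprid 138261-41-3"
print(re.findall(r'\d{1,7}-\d{2}-\d', text))  # ['1071-83-6', '138261-41-3']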

46 changes: 46 additions & 0 deletions 01_ingest/hett_pesticides_eu.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python3

import sys
import json
import argparse
import pandas as pd
import io
import re

parser = argparse.ArgumentParser()
parser.add_argument('--datasource-name', required=True)
parser.add_argument('--filename', required=False)
args = parser.parse_args()

df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), skiprows=3, dtype=str)
df.rename(columns={col: 'Category' for col in df.columns if col.startswith('Category')}, inplace=True)

df['id'] = df['Substance Name']
df.rename(columns={col: 'grebi:name' for col in df.columns if col == 'Substance Name'}, inplace=True)

df['grebi:type'] = 'hett:AgroSubstance'
df['grebi:datasource'] = args.datasource_name

df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

for obj in df.to_dict(orient='records'):
    obj = {k: v for k, v in obj.items() if pd.notna(v)}

    if 'Category' in obj:
        obj['Category'] = obj['Category'].split(',')

    if 'CAS Number' in obj:
        # match cas numbers by regex
        cas = list(map(lambda cas: 'cas:'+cas, re.findall(r'\d{1,7}-\d{2}-\d', obj['CAS Number'])))
        for c in cas:
            print(json.dumps({'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': args.datasource_name}))
        obj['CAS Number'] = cas

    if 'IUPAC Name' in obj:
        iupac = list(map(lambda iupac: iupac.strip(), re.split(r', | or |;', obj['IUPAC Name'])))
        iupac = list(map(lambda i: i.strip(), iupac))
        iupac = list(filter(lambda i: not i.lower().startswith('not '), iupac))
        obj['grebi:equivalentTo'] = iupac

    print(json.dumps(obj))
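The IUPAC Name cell can hold several alternatives separated by ", ", " or ", or ";", and placeholder entries like "not applicable" are filtered out. A small sketch of that normalisation (cell value hypothetical):

import re

cell = "abamectin or avermectin B1a; not stated"
names = [n.strip() for n in re.split(r', | or |;', cell)]
names = [n for n in names if not n.lower().startswith('not ')]
print(names)  # ['abamectin', 'avermectin B1a']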

40 changes: 40 additions & 0 deletions 01_ingest/hett_pesticides_gb.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python3

import sys
import json
import argparse
import pandas as pd
import io
import re

parser = argparse.ArgumentParser()
parser.add_argument('--datasource-name', required=True)
parser.add_argument('--filename', required=False)
args = parser.parse_args()

df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), skiprows=2, dtype=str)
# df.rename(columns={col: 'Status' for col in df.columns if col.startswith('Category')}, inplace=True)

df['id'] = df['Substance']
df.rename(columns={col: 'grebi:name' for col in df.columns if col == 'Substance'}, inplace=True)

df['grebi:type'] = 'hett:AgroSubstance'
df['grebi:datasource'] = args.datasource_name

df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

for obj in df.to_dict(orient='records'):
    obj = {k: v for k, v in obj.items() if pd.notna(v)}

    if 'Authorised' in obj:
        obj['Authorised'] = list(map(lambda p: p.strip(), obj['Authorised'].split(',')))

    if 'CAS Number' in obj:
        # match cas numbers by regex
        cas = list(map(lambda cas: 'cas:'+cas, re.findall(r'\d{1,7}-\d{2}-\d', obj['CAS Number'])))
        for c in cas:
            print(json.dumps({'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': args.datasource_name}))
        obj['CAS Number'] = cas

    print(json.dumps(obj))
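A hypothetical end-to-end run, pairing the HSE download script above with this ingest (the datasource name is illustrative); the spreadsheet is read from stdin:

./download_hse.sh
python3 hett_pesticides_gb.py --datasource-name HETT_PESTICIDES_GB < active-substance-register.xlsx > nodes.jsonl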

15 changes: 12 additions & 3 deletions 02_equivalences/grebi_assign_ids/src/main.rs
@@ -18,8 +18,11 @@ use grebi_shared::find_strings;
 struct Args {

     #[arg(long)]
-    groups_txt: String,
+    add_prefix: String, // used to prepend the subgraph name like hra_kg:
+
+    #[arg(long)]
+    groups_txt: String,

     #[arg(long)]
     preserve_field: Vec<String>

@@ -33,6 +36,8 @@ fn main() {
     let args = Args::parse();
     let preserve_fields:HashSet<Vec<u8>> = args.preserve_field.iter().map(|x| x.as_bytes().to_vec()).collect();

+    let add_prefix = args.add_prefix;
+
     let id_to_group:HashMap<Vec<u8>, Vec<u8>> = {

         let start_time = std::time::Instant::now();
@@ -102,11 +107,13 @@

         // the subject mapped to an equivalence group
         writer.write_all("{\"grebi:nodeId\":\"".as_bytes()).unwrap();
+        writer.write_all(add_prefix.as_bytes()).unwrap();
         writer.write_all(group.unwrap().as_slice()).unwrap();
         writer.write_all("\"".as_bytes()).unwrap();
     } else {
         // the subject did not map to an equivalence group
         writer.write_all("{\"grebi:nodeId\":\"".as_bytes()).unwrap();
+        writer.write_all(add_prefix.as_bytes()).unwrap();
         writer.write_all(id.unwrap()).unwrap();
         writer.write_all("\"".as_bytes()).unwrap();
     }
@@ -122,6 +129,7 @@
     } else {
         let name_group = id_to_group.get(name);
         if name_group.is_some() {
+            writer.write_all(add_prefix.as_bytes()).unwrap();
             writer.write_all(name_group.unwrap()).unwrap();
         } else {
             writer.write_all(name).unwrap();
@@ -132,7 +140,7 @@
         if name.eq(b"id") || preserve_fields.contains(name) {
             writer.write_all(json.value()).unwrap();
         } else {
-            write_value(&mut writer, json.value(), &id_to_group);
+            write_value(&mut writer, json.value(), &id_to_group, &add_prefix);
         }
     }

@@ -143,7 +151,7 @@

 }

-fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&HashMap<Vec<u8>, Vec<u8>>) {
+fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&HashMap<Vec<u8>, Vec<u8>>, add_prefix:&str) {

     let string_locations = find_strings(&value);

@@ -166,6 +174,7 @@ fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&

     let pv_group = id_to_group.get(str);
     if pv_group.is_some() {
+        writer.write_all(add_prefix.as_bytes()).unwrap();
         writer.write_all(pv_group.unwrap()).unwrap();
     } else {
         writer.write_all(str).unwrap();
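The new --add-prefix argument namespaces every emitted node ID with its subgraph (e.g. hra_kg:), whether or not the ID resolved to an equivalence group, so identical CURIEs from different subgraphs become distinct nodes. A rough Python sketch of the mapping rule (function and sample data are illustrative, not the Rust API):

def assign_node_id(raw_id, id_to_group, add_prefix):
    # The group ID wins if the raw ID belongs to an equivalence group;
    # the subgraph prefix is prepended either way.
    return add_prefix + id_to_group.get(raw_id, raw_id)

id_to_group = {"EFO:0000400": "MONDO:0005015"}
print(assign_node_id("EFO:0000400", id_to_group, "impc:"))   # impc:MONDO:0005015
print(assign_node_id("CHEBI:15377", id_to_group, "impc:"))   # impc:CHEBI:15377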
12 changes: 7 additions & 5 deletions 05_materialise/grebi_materialise/src/main.rs
@@ -91,7 +91,7 @@ fn main() -> std::io::Result<()> {

         sliced.props.iter().for_each(|prop| {
             for val in &prop.values {
-                maybe_write_edge(sliced.id, prop, &val, &mut edges_writer, &exclude, &node_metadata, &val.datasources);
+                maybe_write_edge(sliced.id, prop, &val, &mut edges_writer, &exclude, &node_metadata, &val.datasources, sliced.subgraph);
             }
         });

@@ -123,7 +123,7 @@
     Ok(())
 }

-fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyValue, edges_writer: &mut BufWriter<File>, exclude:&BTreeSet<Vec<u8>>, node_metadata:&BTreeMap<Vec<u8>, Metadata>, datasources:&Vec<&[u8]>) {
+fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyValue, edges_writer: &mut BufWriter<File>, exclude:&BTreeSet<Vec<u8>>, node_metadata:&BTreeMap<Vec<u8>, Metadata>, datasources:&Vec<&[u8]>, subgraph:&[u8]) {

     if prop.key.eq(b"id") || prop.key.starts_with(b"grebi:") || exclude.contains(prop.key) {
         return;
@@ -140,7 +140,7 @@
             let str = JsonParser::parse(&buf).string();
             let exists = node_metadata.contains_key(str);
             if exists {
-                write_edge(from_id, str, prop.key, Some(&reified_u.props), edges_writer, node_metadata, &datasources);
+                write_edge(from_id, str, prop.key, Some(&reified_u.props), edges_writer, node_metadata, &datasources, &subgraph);
             }
         } else {
             // panic!("unexpected kind: {:?}", reified_u.value_kind);
@@ -154,7 +154,7 @@
         let exists = node_metadata.contains_key(str);

         if exists {
-            write_edge(from_id, str, prop.key, None, edges_writer, node_metadata, &datasources);
+            write_edge(from_id, str, prop.key, None, edges_writer, node_metadata, &datasources, &subgraph);
         }

     } else if val.kind == JsonTokenType::StartArray {
@@ -169,12 +169,14 @@

 }

-fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<SlicedProperty>>, edges_writer: &mut BufWriter<File>, node_metadata:&BTreeMap<Vec<u8>,Metadata>, datasources:&Vec<&[u8]>) {
+fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<SlicedProperty>>, edges_writer: &mut BufWriter<File>, node_metadata:&BTreeMap<Vec<u8>,Metadata>, datasources:&Vec<&[u8]>, subgraph:&[u8]) {

     let mut buf = Vec::new();

     buf.extend(b"\"grebi:type\":\"");
     buf.extend(edge);
+    buf.extend(b"\",\"grebi:subgraph\":\"");
+    buf.extend(subgraph);
     buf.extend(b"\",\"grebi:from\":\"");
     buf.extend(from_id);
     buf.extend(b"\",\"grebi:to\":\"");
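With the subgraph threaded through maybe_write_edge into write_edge, every materialised edge record now carries a grebi:subgraph field alongside its type and endpoints. An illustrative fragment of an edge record (values hypothetical; the enclosing braces and remaining fields are written elsewhere in this function):

"grebi:type":"hett:hasActiveIngredient","grebi:subgraph":"hett","grebi:from":"hett:appril:100-1000","grebi:to":"hett:cas:1071-83-6"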