Skip to content

Commit

Permalink
hett updates
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesamcl committed Jul 21, 2024
1 parent f166d9d commit 6dbbdde
Show file tree
Hide file tree
Showing 10 changed files with 212 additions and 21 deletions.
20 changes: 0 additions & 20 deletions 01_ingest/grebi_ingest_hett_pesticides_eu/ingest.py

This file was deleted.

44 changes: 44 additions & 0 deletions 01_ingest/hett_pesticides_appril.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python3

import sys
import json
import argparse
import pandas as pd
import io
import re

# Pattern used to pick CAS registry numbers out of free-text ingredient cells.
CAS_PATTERN = re.compile(r'\d{1,7}-\d{2}-\d')

# Columns holding comma-separated lists that are expanded into JSON arrays.
LIST_COLUMNS = ('PESTS', 'SITES')

# Ingredient column -> property that receives the extracted CAS ids.
INGREDIENT_COLUMNS = (
    ('AIS', 'hett:hasActiveIngredient'),
    ('INERTS', 'hett:hasInertIngredient'),
)


def _split_list(value):
    """Split a comma-separated cell into a list of whitespace-trimmed items."""
    return [item.strip() for item in value.split(',')]


def _cas_ids(text):
    """Return a 'cas:'-prefixed id for every CAS-number match in ``text``."""
    return ['cas:' + number for number in CAS_PATTERN.findall(text)]


def prepare_frame(df, datasource_name):
    """Return a copy of ``df`` with the fixed per-row columns added.

    ``PRODUCT_NAME`` (when present) is renamed to ``grebi:name``; ``id``,
    ``grebi:type`` and ``grebi:datasource`` are appended.

    The registration number is stripped *before* the ``appril:`` prefix is
    attached, so a padded cell such as ``' 123 '`` gives ``'appril:123'``
    rather than ``'appril: 123'``.
    """
    df = df.rename(columns={'PRODUCT_NAME': 'grebi:name'})
    df['id'] = 'appril:' + df['REG_NUM'].str.strip()
    df['grebi:type'] = 'hett:PesticideProduct'
    df['grebi:datasource'] = datasource_name
    return df


def transform_record(record, datasource_name):
    """Turn one spreadsheet row into the list of JSON objects to emit.

    Args:
        record: mapping of column name to cell value (a string or NaN), as
            produced by ``DataFrame.to_dict(orient='records')``.
        datasource_name: value written to ``grebi:datasource`` on the
            chemical nodes created for extracted CAS numbers.

    Returns:
        A list of dicts: one ``grebi:Chemical`` node per CAS number found in
        the ``AIS`` and then the ``INERTS`` cell, followed by the product
        object itself (always the last element).
    """
    # Drop empty cells and trim every string value. Doing this per record
    # replaces the DataFrame.applymap call, which pandas deprecated in 2.1.
    obj = {
        key: value.strip() if isinstance(value, str) else value
        for key, value in record.items()
        if pd.notna(value)
    }

    for column in LIST_COLUMNS:
        if column in obj:
            obj[column] = _split_list(obj[column])

    emitted = []
    for column, prop in INGREDIENT_COLUMNS:
        if column not in obj:
            continue
        cas = _cas_ids(obj[column])
        emitted.extend(
            {'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': datasource_name}
            for c in cas
        )
        obj[prop] = cas

    emitted.append(obj)
    return emitted


def main():
    """Read an xlsx workbook from stdin and print one JSON object per line."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasource-name', required=True)
    parser.add_argument('--filename', required=False)
    args = parser.parse_args()

    df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), dtype=str)
    df = prepare_frame(df, args.datasource_name)

    for record in df.to_dict(orient='records'):
        for out in transform_record(record, args.datasource_name):
            print(json.dumps(out))


if __name__ == '__main__':
    main()

46 changes: 46 additions & 0 deletions 01_ingest/hett_pesticides_eu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python3

import sys
import json
import argparse
import pandas as pd
import io
import re

# Pattern used to pick CAS registry numbers out of the 'CAS Number' cell.
CAS_PATTERN = re.compile(r'\d{1,7}-\d{2}-\d')

# Separators between alternative names inside the 'IUPAC Name' cell.
IUPAC_SEPARATORS = re.compile(r', | or |;')


def _cas_ids(text):
    """Return a 'cas:'-prefixed id for every CAS-number match in ``text``."""
    return ['cas:' + number for number in CAS_PATTERN.findall(text)]


def _iupac_names(text):
    """Split an 'IUPAC Name' cell into its usable names.

    Pieces are trimmed; pieces starting with 'not ' (case-insensitive) are
    dropped, as are empty pieces. Empty pieces arise from adjacent or trailing
    separators and must never reach ``grebi:equivalentTo``, where an empty
    string would be shared by unrelated records.
    """
    pieces = (piece.strip() for piece in IUPAC_SEPARATORS.split(text))
    return [p for p in pieces if p and not p.lower().startswith('not ')]


def prepare_frame(df, datasource_name):
    """Return a copy of ``df`` with the fixed per-row columns added.

    Any column whose header starts with 'Category' is renamed to 'Category',
    ``id`` is copied from 'Substance Name', 'Substance Name' is renamed to
    ``grebi:name``, and ``grebi:type`` / ``grebi:datasource`` are appended.
    """
    # Headers are not guaranteed to be strings, so guard the startswith call.
    df = df.rename(columns={
        col: 'Category'
        for col in df.columns
        if isinstance(col, str) and col.startswith('Category')
    })
    df['id'] = df['Substance Name']
    df = df.rename(columns={'Substance Name': 'grebi:name'})
    df['grebi:type'] = 'hett:AgroSubstance'
    df['grebi:datasource'] = datasource_name
    return df


def transform_record(record, datasource_name):
    """Turn one spreadsheet row into the list of JSON objects to emit.

    Args:
        record: mapping of column name to cell value (a string or NaN), as
            produced by ``DataFrame.to_dict(orient='records')``.
        datasource_name: value written to ``grebi:datasource`` on the
            chemical nodes created for extracted CAS numbers.

    Returns:
        A list of dicts: one ``grebi:Chemical`` node per CAS number found in
        the 'CAS Number' cell, followed by the substance object itself
        (always the last element).
    """
    # Drop empty cells and trim every string value. Doing this per record
    # replaces the DataFrame.applymap call, which pandas deprecated in 2.1.
    obj = {
        key: value.strip() if isinstance(value, str) else value
        for key, value in record.items()
        if pd.notna(value)
    }

    if 'Category' in obj:
        # Trim each entry so 'HB, PG' gives ['HB', 'PG'], not ['HB', ' PG'].
        obj['Category'] = [item.strip() for item in obj['Category'].split(',')]

    emitted = []
    if 'CAS Number' in obj:
        cas = _cas_ids(obj['CAS Number'])
        emitted.extend(
            {'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': datasource_name}
            for c in cas
        )
        obj['CAS Number'] = cas

    if 'IUPAC Name' in obj:
        obj['grebi:equivalentTo'] = _iupac_names(obj['IUPAC Name'])

    emitted.append(obj)
    return emitted


def main():
    """Read an xlsx workbook from stdin and print one JSON object per line."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasource-name', required=True)
    parser.add_argument('--filename', required=False)
    args = parser.parse_args()

    # The first three rows of the sheet are skipped before the header row.
    df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), skiprows=3, dtype=str)
    df = prepare_frame(df, args.datasource_name)

    for record in df.to_dict(orient='records'):
        for out in transform_record(record, args.datasource_name):
            print(json.dumps(out))


if __name__ == '__main__':
    main()

40 changes: 40 additions & 0 deletions 01_ingest/hett_pesticides_gb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python3

import sys
import json
import argparse
import pandas as pd
import io
import re

# Pattern used to pick CAS registry numbers out of the 'CAS Number' cell.
CAS_PATTERN = re.compile(r'\d{1,7}-\d{2}-\d')


def _cas_ids(text):
    """Return a 'cas:'-prefixed id for every CAS-number match in ``text``."""
    return ['cas:' + number for number in CAS_PATTERN.findall(text)]


def prepare_frame(df, datasource_name):
    """Return a copy of ``df`` with the fixed per-row columns added.

    ``id`` is copied from 'Substance', 'Substance' is renamed to
    ``grebi:name``, and ``grebi:type`` / ``grebi:datasource`` are appended.
    """
    df = df.copy()
    df['id'] = df['Substance']
    df = df.rename(columns={'Substance': 'grebi:name'})
    df['grebi:type'] = 'hett:AgroSubstance'
    df['grebi:datasource'] = datasource_name
    return df


def transform_record(record, datasource_name):
    """Turn one spreadsheet row into the list of JSON objects to emit.

    Args:
        record: mapping of column name to cell value (a string or NaN), as
            produced by ``DataFrame.to_dict(orient='records')``.
        datasource_name: value written to ``grebi:datasource`` on the
            chemical nodes created for extracted CAS numbers.

    Returns:
        A list of dicts: one ``grebi:Chemical`` node per CAS number found in
        the 'CAS Number' cell, followed by the substance object itself
        (always the last element).
    """
    # Drop empty cells and trim every string value. Doing this per record
    # replaces the DataFrame.applymap call, which pandas deprecated in 2.1.
    obj = {
        key: value.strip() if isinstance(value, str) else value
        for key, value in record.items()
        if pd.notna(value)
    }

    if 'Authorised' in obj:
        obj['Authorised'] = [item.strip() for item in obj['Authorised'].split(',')]

    emitted = []
    if 'CAS Number' in obj:
        cas = _cas_ids(obj['CAS Number'])
        emitted.extend(
            {'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': datasource_name}
            for c in cas
        )
        obj['CAS Number'] = cas

    emitted.append(obj)
    return emitted


def main():
    """Read an xlsx workbook from stdin and print one JSON object per line."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasource-name', required=True)
    parser.add_argument('--filename', required=False)
    args = parser.parse_args()

    # The first two rows of the sheet are skipped before the header row.
    df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), skiprows=2, dtype=str)
    df = prepare_frame(df, args.datasource_name)

    for record in df.to_dict(orient='records'):
        for out in transform_record(record, args.datasource_name):
            print(json.dumps(out))


if __name__ == '__main__':
    main()

11 changes: 11 additions & 0 deletions configs/datasource_configs/hett_pesticides_appril.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"name": "HETT_Pesticides.APPRIL",
"enabled": true,
"ingests": [
{
"ingest_files": ["./00_fetch_data/hett_pesticides/apprildatadump_public.xlsx"],
"ingest_script": "./01_ingest/hett_pesticides_appril.py",
"ingest_args": []
}
]
}
11 changes: 11 additions & 0 deletions configs/datasource_configs/hett_pesticides_eu.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"name": "HETT_Pesticides.EU",
"enabled": true,
"ingests": [
{
"ingest_files": ["./00_fetch_data/hett_pesticides/ActiveSubstanceExport*"],
"ingest_script": "./01_ingest/hett_pesticides_gb.py",
"ingest_args": []
}
]
}
11 changes: 11 additions & 0 deletions configs/datasource_configs/hett_pesticides_gb.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"name": "HETT_Pesticides.GB",
"enabled": true,
"ingests": [
{
"ingest_files": ["./00_fetch_data/hett_pesticides/active-substance-register.xlsx"],
"ingest_script": "./01_ingest/hett_pesticides_eu.py",
"ingest_args": []
}
]
}
3 changes: 2 additions & 1 deletion configs/pipeline_configs/ebi.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"subgraphs": [
"ebi_full_monarch",
"monarch",
"hra_kg"
"hra_kg",
"hett"
]
}
5 changes: 5 additions & 0 deletions configs/pipeline_configs/hett_only.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"subgraphs": [
"hett"
]
}
42 changes: 42 additions & 0 deletions configs/subgraph_configs/hett.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"id": "HETT",
"name": "EMBL Human Ecosystems",
"bytes_per_merged_file": 1073741824,
"equivalence_props": [
"owl:equivalentClass",
"owl:equivalentProperty",
"owl:sameAs",
"grebi:equivalentTo",
"ols:iri",
"ols:shortForm",
"hgnc:ensembl_gene_id",
"obo:chebi/inchi",
"obo:chebi/inchikey",
"obo:chebi/smiles",
"impc:pmId",
"impc:humanGeneAccId",
"monarch:iri"
],
"additional_equivalence_groups": [
["grebi:name", "ols:label", "rdfs:label", "monarch:name", "impc:name", "reactome:displayName"],
["grebi:description", "iao:definition", "monarch:description", "ols:definition"],
["grebi:synonym", "monarch:synonym", "iao:alternative_label", "ols:synonym", "oboinowl:hasExactSynonym"],
["mondo:0000001", "ogms:0000031"]
],
"exclude_props": [
"ols:curie",
"ols:shortForm",
"ols:ontologyPreferredPrefix",
"ols:iri",
"oboinowl:id",
"oboinowl:url",
"monarch:iri"
],
"exclude_edges": [
],
"datasource_configs": [
"./configs/datasource_configs/hett_pesticides_appril.json",
"./configs/datasource_configs/hett_pesticides_eu.json",
"./configs/datasource_configs/hett_pesticides_gb.json"
]
}

0 comments on commit 6dbbdde

Please sign in to comment.