Skip to content

Commit

Permalink
fix hett scripts and nextflow pipeline mixing up orders
Browse files (browse the repository at this point in the history)
  • Loading branch information
jamesamcl committed Jul 25, 2024
1 parent cd65ae4 commit c4e4976
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 29 deletions.
Empty file modified 01_ingest/hett_pesticides_appril.py
100644 → 100755
Empty file.
18 changes: 6 additions & 12 deletions 01_ingest/hett_pesticides_eu.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
parser.add_argument('--filename', required=False)
args = parser.parse_args()

df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), skiprows=3, dtype=str)
df.rename(columns={col: 'Category' for col in df.columns if col.startswith('Category')}, inplace=True)
df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), skiprows=2, dtype=str)
# df.rename(columns={col: 'Status' for col in df.columns if col.startswith('Category')}, inplace=True)

df['id'] = df['Substance Name']
df.rename(columns={col: 'grebi:name' for col in df.columns if col == 'Substance Name'}, inplace=True)
df['id'] = df['Substance']
df.rename(columns={col: 'grebi:name' for col in df.columns if col == 'Substance'}, inplace=True)

df['grebi:type'] = 'hett:AgroSubstance'
df['grebi:datasource'] = args.datasource_name
Expand All @@ -26,8 +26,8 @@
for obj in df.to_dict(orient='records'):
obj = {k: v for k, v in obj.items() if pd.notna(v)}

if 'Category' in obj:
obj['Category'] = obj['Category'].split(',')
if 'Authorised' in obj:
obj['Authorised'] = list(map(lambda p: p.strip(), obj['Authorised'].split(',')))

if 'CAS Number' in obj:
# match cas numbers by regex
Expand All @@ -36,11 +36,5 @@
print(json.dumps({'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': args.datasource_name}))
obj['CAS Number'] = cas

if 'IUPAC Name' in obj:
iupac = list(map(lambda iupac: iupac.strip(), re.split(r', | or |;', obj['IUPAC Name'])))
iupac = list(map(lambda i: i.strip(), iupac))
iupac = list(filter(lambda i: not i.lower().startswith('not '), iupac))
obj['grebi:equivalentTo'] = iupac

print(json.dumps(obj))

19 changes: 13 additions & 6 deletions 01_ingest/hett_pesticides_gb.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@
parser.add_argument('--filename', required=False)
args = parser.parse_args()

df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), skiprows=2, dtype=str)
# df.rename(columns={col: 'Status' for col in df.columns if col.startswith('Category')}, inplace=True)
df = pd.read_excel(io.BytesIO(sys.stdin.buffer.read()), skiprows=3, dtype=str)
df.columns = df.columns.astype(str)
df.rename(columns={col: 'Category' for col in df.columns if col.startswith('Category')}, inplace=True)

df['id'] = df['Substance']
df.rename(columns={col: 'grebi:name' for col in df.columns if col == 'Substance'}, inplace=True)
df['id'] = df['Substance Name']
df.rename(columns={col: 'grebi:name' for col in df.columns if col == 'Substance Name'}, inplace=True)

df['grebi:type'] = 'hett:AgroSubstance'
df['grebi:datasource'] = args.datasource_name
Expand All @@ -26,8 +27,8 @@
for obj in df.to_dict(orient='records'):
obj = {k: v for k, v in obj.items() if pd.notna(v)}

if 'Authorised' in obj:
obj['Authorised'] = list(map(lambda p: p.strip(), obj['Authorised'].split(',')))
if 'Category' in obj:
obj['Category'] = obj['Category'].split(',')

if 'CAS Number' in obj:
# match cas numbers by regex
Expand All @@ -36,5 +37,11 @@
print(json.dumps({'id': c, 'grebi:type': 'grebi:Chemical', 'grebi:datasource': args.datasource_name}))
obj['CAS Number'] = cas

if 'IUPAC Name' in obj:
iupac = list(map(lambda iupac: iupac.strip(), re.split(r', | or |;', obj['IUPAC Name'])))
iupac = list(map(lambda i: i.strip(), iupac))
iupac = list(filter(lambda i: not i.lower().startswith('not '), iupac))
obj['grebi:equivalentTo'] = iupac

print(json.dumps(obj))

4 changes: 2 additions & 2 deletions configs/datasource_configs/hett_pesticides_eu.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
"ingests": [
{
"ingest_files": ["./00_fetch_data/hett_pesticides/ActiveSubstanceExport*"],
"ingest_script": "./01_ingest/hett_pesticides_gb.py",
"ingest_script": "./01_ingest/hett_pesticides_eu.py",
"ingest_args": []
}
]
}
}
4 changes: 2 additions & 2 deletions configs/datasource_configs/hett_pesticides_gb.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
"ingests": [
{
"ingest_files": ["./00_fetch_data/hett_pesticides/active-substance-register.xlsx"],
"ingest_script": "./01_ingest/hett_pesticides_eu.py",
"ingest_script": "./01_ingest/hett_pesticides_gb.py",
"ingest_args": []
}
]
}
}
9 changes: 4 additions & 5 deletions nextflow/01_create_subgraph.nf
Original file line number Diff line number Diff line change
Expand Up @@ -606,10 +606,9 @@ def parseJson(json) {
}

def getDecompressionCommand(filename) {
def f = filename
if (filename.startsWith(".")) {
f = new File(params.home, filename).toString()
} else {
f = filename
}
if (f.endsWith(".gz")) {
return "zcat ${f}"
Expand All @@ -625,19 +624,19 @@ def getIngestCommand(script) {
}

def buildIngestArgs(ingestArgs) {
res = ""
def res = ""
ingestArgs.each { arg -> res += "${arg.name} ${arg.value} " }
return res
}

def buildAddEquivGroupArgs(equivGroups) {
res = ""
def res = ""
equivGroups.each { arg -> res += "--add-group ${arg.iterator().join(",")} " }
return res
}

def buildMergeArgs(assigned) {
res = ""
def res = ""
assigned.each { a ->
res += "${a[0]}:${a[1]} "
}
Expand Down
9 changes: 7 additions & 2 deletions scripts/dataload_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@
for subgraph in config['subgraphs']:
print(f"===== LOADING SUBGRAPH: {subgraph} =====")
os.environ['GREBI_SUBGRAPH'] = subgraph
os.system(f'nextflow {GREBI_HOME}/nextflow/01_create_subgraph.nf')
ret = os.system(f'nextflow {GREBI_HOME}/nextflow/01_create_subgraph.nf')
if ret != 0:
exit(ret)
print(f"===== FINISHED LOADING SUBGRAPH: {subgraph} =====")

os.system(f'nextflow {GREBI_HOME}/nextflow/02_create_dbs.nf')
ret = os.system(f'nextflow {GREBI_HOME}/nextflow/02_create_dbs.nf')
if ret != 0:
exit(ret)

0 comments on commit c4e4976

Please sign in to comment.