Skip to content

Commit

Permalink
pipeline and mapping updates for impc-gwas
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesamcl committed Dec 4, 2024
1 parent c5d073a commit 11afc55
Show file tree
Hide file tree
Showing 23 changed files with 50 additions and 26 deletions.
7 changes: 6 additions & 1 deletion dataload/00_fetch_data/sssom/fetch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@ wget https://data.monarchinitiative.org/mappings/latest/hp_mesh.sssom.tsv
wget https://data.monarchinitiative.org/mappings/latest/mesh_chebi_biomappings.sssom.tsv
wget https://data.monarchinitiative.org/mappings/latest/mondo.sssom.tsv
wget https://data.monarchinitiative.org/mappings/latest/umls_hp.sssom.tsv
wget https://data.monarchinitiative.org/mappings/latest/upheno_custom.sssom.tsv
wget https://data.monarchinitiative.org/mappings/latest/upheno-cross-species.sssom.tsv
wget https://data.monarchinitiative.org/mappings/latest/upheno-species-independent.sssom.tsv
wget https://data.monarchinitiative.org/mappings/latest/nbo-go.sssom.tsv
wget https://data.monarchinitiative.org/mappings/latest/uberon.sssom.tsv
wget https://raw.githubusercontent.com/obophenotype/upheno-dev/refs/heads/master/src/mappings/upheno-oba.sssom.tsv
wget https://raw.githubusercontent.com/mapping-commons/disease-mappings/refs/heads/main/mappings/mondo_hp_lexical.sssom.tsv

wget https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/master/mappings/mp_hp_mgi_all.sssom.tsv
wget https://raw.githubusercontent.com/obophenotype/bio-attribute-ontology/master/src/mappings/oba-efo.sssom.tsv
Expand Down
Binary file modified dataload/00_fetch_data/sssom/gene_mappings.sssom.tsv.gz
Binary file not shown.
Binary file modified dataload/00_fetch_data/sssom/hp_mesh.sssom.tsv.gz
Binary file not shown.
Binary file modified dataload/00_fetch_data/sssom/mesh_chebi_biomappings.sssom.tsv.gz
Binary file not shown.
Binary file modified dataload/00_fetch_data/sssom/mondo.sssom.tsv.gz
Binary file not shown.
Binary file not shown.
Binary file modified dataload/00_fetch_data/sssom/mp_hp_mgi_all.sssom.tsv.gz
Binary file not shown.
Binary file added dataload/00_fetch_data/sssom/nbo-go.sssom.tsv.gz
Binary file not shown.
Binary file modified dataload/00_fetch_data/sssom/oba-efo.sssom.tsv.gz
Binary file not shown.
Binary file modified dataload/00_fetch_data/sssom/oba-vt.sssom.tsv.gz
Binary file not shown.
Binary file added dataload/00_fetch_data/sssom/uberon.sssom.tsv.gz
Binary file not shown.
Binary file modified dataload/00_fetch_data/sssom/umls_hp.sssom.tsv.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
7 changes: 4 additions & 3 deletions dataload/07_create_db/solr/solr_import.dockerpy
Original file line number Diff line number Diff line change
Expand Up @@ -40,20 +40,21 @@ def main():

os.environ['SOLR_ENABLE_REMOTE_STREAMING'] = 'true'
os.environ['SOLR_SECURITY_MANAGER_ENABLED'] = 'false'
os.environ['JAVA_TOOL_OPTIONS'] = '-Djava.net.useSystemProxies=false'

cmd = ['solr', 'start', '-m', mem, '-p', port, '-noprompt', '-force']
print(' '.join(cmd))
subprocess.run(cmd)

time.sleep(30)

subprocess.run(['wait-for-solr.sh', '--solr-url', f"http://localhost:{port}"])
#subprocess.run(['wait-for-solr.sh', '--solr-url', f"http://127.0.0.1:{port}/solr/{core}/select?q=*:*"])

time.sleep(30)

if "_autocomplete" in core:
print("Uploading names.txt")
response = session.get(f"http://localhost:{port}/solr/{core}/update",
response = session.get(f"http://127.0.0.1:{port}/solr/{core}/update",
params={
'stream.file': '/names.txt',
'fieldnames': 'label',
Expand Down Expand Up @@ -85,7 +86,7 @@ def main():

def upload_file(core, port, filename):
print(f"Uploading {core.split('_')[1]} file: {filename}")
response = session.get(f"http://localhost:{port}/solr/{core}/update/json/docs",
response = session.get(f"http://127.0.0.1:{port}/solr/{core}/update/json/docs",
params={
'stream.file': filename,
'stream.contentType': 'application/json;charset=utf-8',
Expand Down
6 changes: 1 addition & 5 deletions dataload/configs/pipeline_configs/ebi.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
{
"subgraphs": [
"ebi_monarch_xspecies",
"impc_x_gwas",
"hra_kg"
]
}
"impc_x_gwas" ]}
2 changes: 1 addition & 1 deletion dataload/nextflow/codon_nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ process {
}
process {
withName: create_solr_edges_core {
memory = 150.GB
memory = 1500.GB
cpus = 32
}
}
Expand Down
38 changes: 28 additions & 10 deletions dataload/nextflow/load_subgraph.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ params.config = "$GREBI_CONFIG"
params.subgraph = "$GREBI_SUBGRAPH"
params.timestamp = "$GREBI_TIMESTAMP"
params.is_ebi = "$GREBI_IS_EBI"
params.solr_mem = "140g"

workflow {

Expand Down Expand Up @@ -56,17 +57,16 @@ workflow {
rocks_tgz = package_rocks(rocks_db)

if(params.is_ebi == "true") {
copy_summary_to_ftp(merge_summary_jsons.out)
copy_solr_to_ftp(solr_tgz)
copy_neo_to_ftp(neo_tgz)
copy_rocks_to_ftp(rocks_tgz)
copy_summary_to_ftp(merge_summary_jsons.out)
copy_solr_to_ftp(solr_tgz)
copy_neo_to_ftp(neo_tgz)
copy_rocks_to_ftp(rocks_tgz)

if(params.config == "ebi") {
copy_summary_to_staging(merge_summary_jsons.out)
copy_solr_config_to_staging()
copy_solr_cores_to_staging(solr_nodes_core.concat(solr_edges_core).concat(solr_autocomplete_core))
copy_rocksdb_to_staging(rocks_db)
}
copy_neo_to_staging(neo_db)
}
}

Expand Down Expand Up @@ -440,13 +440,13 @@ process create_solr_nodes_core {
--in-template-config-dir ${params.home}/06_prepare_db_import/solr_config_template \
--out-config-dir solr_config
python3 ${params.home}/07_create_db/solr/solr_import.slurm.py \
--solr-config solr_config --core grebi_nodes_${params.subgraph} --in-data . --out-path solr --port 8985 --mem ${task.memory.toGiga()-2}g
--solr-config solr_config --core grebi_nodes_${params.subgraph} --in-data . --out-path solr --port 8985 --mem ${params.solr_mem}
"""
}

process create_solr_edges_core {
cache "lenient"
memory "4 GB"
memory "1500 GB"
time "23h"
cpus "8"

Expand All @@ -469,7 +469,8 @@ process create_solr_edges_core {
--in-template-config-dir ${params.home}/06_prepare_db_import/solr_config_template \
--out-config-dir solr_config
python3 ${params.home}/07_create_db/solr/solr_import.slurm.py \
--solr-config solr_config --core grebi_edges_${params.subgraph} --in-data . --out-path solr --port 8986 --mem ${task.memory.toGiga()-2}g
--solr-config solr_config --core grebi_edges_${params.subgraph} --in-data . --out-path /dev/shm/solr --port 8986 --mem ${params.solr_mem}
mv /dev/shm/solr solr
"""
}

Expand All @@ -496,7 +497,7 @@ process create_solr_autocomplete_core {
--in-template-config-dir ${params.home}/06_prepare_db_import/solr_config_template \
--out-config-dir solr_config
python3 ${params.home}/07_create_db/solr/solr_import.slurm.py \
--solr-config solr_config --core grebi_autocomplete_${params.subgraph} --in-data . --in-names-txt ${names_txt} --out-path solr --port 8987 --mem ${task.memory.toGiga()-2}g
--solr-config solr_config --core grebi_autocomplete_${params.subgraph} --in-data . --in-names-txt ${names_txt} --out-path solr --port 8987 --mem ${params.solr_mem}
"""
}

Expand Down Expand Up @@ -716,6 +717,23 @@ process copy_rocksdb_to_staging {
"""
}

// Copies the Neo4j database handed to this process into the shared staging
// directory (/nfs/public/rw/ontoapps/grebi/staging/neo4j).
// NOTE(review): the destination is a hard-coded NFS path — presumably only
// reachable from the EBI cluster's data-mover nodes; confirm before reuse elsewhere.
process copy_neo_to_staging {
// Directives: lenient cache mode, 4 GB of memory, an 8 hour time limit,
// and submission to the "datamover" queue.
cache "lenient"
memory "4 GB"
time "8h"
queue "datamover"

// The Neo4j database path. The script below never references `neodb` by name;
// it relies on the `*` glob to pick up whatever is in the task directory.
input:
path(neodb)

// Runs bash in strict mode (-Eeuo pipefail), creates the staging directory if
// it does not exist, then copies every entry matched by `*` into it
// recursively (-R) while following symlinks (-L).
// NOTE(review): -L is presumably needed because Nextflow stages inputs as
// symlinks — confirm against the configured stageInMode.
script:
"""
#!/usr/bin/env bash
set -Eeuo pipefail
mkdir -p /nfs/public/rw/ontoapps/grebi/staging/neo4j
cp -LR * /nfs/public/rw/ontoapps/grebi/staging/neo4j/
"""
}

def parseJson(json) {
return new JsonSlurper().parseText(json)
Expand Down
2 changes: 2 additions & 0 deletions dataload/nextflow/local_nextflow.config
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
params.solr_mem = "1g"

process {
withName: assign_ids {
memory = 4.GB
Expand Down
5 changes: 4 additions & 1 deletion dataload/scripts/dataload.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
import os
import subprocess
from pathlib import Path

GREBI_DATALOAD_HOME = os.environ['GREBI_DATALOAD_HOME']
GREBI_CONFIG = os.environ['GREBI_CONFIG']
Expand All @@ -13,7 +14,9 @@
for subgraph in config['subgraphs']:
print(f"===== LOADING SUBGRAPH: {subgraph} =====")
os.environ['GREBI_SUBGRAPH'] = subgraph
res = os.system(f'NXF_WORK=work_{subgraph} nextflow {GREBI_DATALOAD_HOME}/nextflow/load_subgraph.nf -c {GREBI_NEXTFLOW_CONFIG} -resume')
nextflow_dir_path = "nextflow_" + subgraph
Path(nextflow_dir_path).mkdir(parents=True, exist_ok=True)
res = os.system(f'cd {nextflow_dir_path} && nextflow {GREBI_DATALOAD_HOME}/nextflow/load_subgraph.nf -c {GREBI_NEXTFLOW_CONFIG} -resume')
if res != 0:
exit(res)
print(f"===== FINISHED LOADING SUBGRAPH: {subgraph} =====")
Expand Down
9 changes: 4 additions & 5 deletions dataload/scripts/dataload_codon.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
export GREBI_DATALOAD_HOME=/nfs/production/parkinso/spot/grebi
export GREBI_DATALOAD_HOME=/nfs/production/parkinso/spot/grebi/dataload
export GREBI_TMP=/hps/nobackup/parkinso/spot/grebi/tmp
export GREBI_FAST_TMP=/dev/shm
export GREBI_CONFIG=ebi
Expand All @@ -9,12 +9,11 @@ export GREBI_MAX_ENTITIES=1000000000
export GREBI_NEXTFLOW_CONFIG=$GREBI_DATALOAD_HOME/nextflow/codon_nextflow.config
module load nextflow-22.10.1-gcc-11.2.0-ju5saqw
module load python
module load py-pyyaml
export PYTHONPATH="/homes/spotbot/.local/lib/python3.6/site-packages:$PYTHONPATH"
source /nfs/production/parkinso/spot/grebi/.venv/bin/activate
cd /hps/nobackup/parkinso/spot/grebi/
export PYTHONUNBUFFERED=true
srun -p datamover --time 1:0:0 --mem 8g bash -c "rm -rf /nfs/public/rw/ontoapps/grebi/staging && mkdir /nfs/public/rw/ontoapps/grebi/staging"
srun --time 3-0:0:0 --mem 8g bash -c "rm -rf work* tmp && python3 ${GREBI_DATALOAD_HOME}/scripts/dataload.py"
#srun --time 23:0:0 --mem 8g bash -c "python3 ${GREBI_DATALOAD_HOME}/scripts/dataload_codon.py"
srun --time 3-0:0:0 --mem 8g bash -c "rm -rf nextflow* work* tmp"
srun --time 3-0:0:0 --mem 8g bash -c "python3 ${GREBI_DATALOAD_HOME}/scripts/dataload.py"


0 comments on commit 11afc55

Please sign in to comment.