From 3caaf205e5b6eb6cba8c64e44ceb339882a1a30c Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Fri, 11 Oct 2019 12:24:48 +0100 Subject: [PATCH 1/8] More simple & reliable retry policy for JSON requests --- bin/clinvar_jsons/traits_to_zooma_format.py | 4 +-- eva_cttv_pipeline/trait_mapping/ols.py | 4 +-- eva_cttv_pipeline/trait_mapping/utils.py | 39 +++++---------------- eva_cttv_pipeline/trait_mapping/zooma.py | 4 +-- requirements.txt | 1 + 5 files changed, 15 insertions(+), 37 deletions(-) diff --git a/bin/clinvar_jsons/traits_to_zooma_format.py b/bin/clinvar_jsons/traits_to_zooma_format.py index 8e1859ad..38c69d22 100644 --- a/bin/clinvar_jsons/traits_to_zooma_format.py +++ b/bin/clinvar_jsons/traits_to_zooma_format.py @@ -11,7 +11,7 @@ import requests from clinvar_jsons_shared_lib import clinvar_jsons, get_traits_from_json, has_allowed_clinical_significance -from eva_cttv_pipeline.trait_mapping.utils import request_retry_helper +from eva_cttv_pipeline.trait_mapping.utils import json_request DATE = strftime("%d/%m/%y %H:%M", gmtime()) @@ -110,7 +110,7 @@ def get_clinvar_accession(clinvar_json): def get_zooma_uris(trait_name, zooma_host, filters): url = build_zooma_query(trait_name, filters, zooma_host) - json_response = request_retry_helper(url) + json_response = json_request(url) if json_response is None: return None diff --git a/eva_cttv_pipeline/trait_mapping/ols.py b/eva_cttv_pipeline/trait_mapping/ols.py index 15b55a89..a97fe61b 100644 --- a/eva_cttv_pipeline/trait_mapping/ols.py +++ b/eva_cttv_pipeline/trait_mapping/ols.py @@ -3,7 +3,7 @@ import requests import urllib -from eva_cttv_pipeline.trait_mapping.utils import request_retry_helper +from eva_cttv_pipeline.trait_mapping.utils import json_request OLS_EFO_SERVER = 'https://www.ebi.ac.uk/ols' @@ -30,7 +30,7 @@ def get_ontology_label_from_ols(ontology_uri: str) -> str: :return: Term label for the ontology URI provided in the parameters. """ url = build_ols_query(ontology_uri) - json_response = request_retry_helper(url) + json_response = json_request(url) if not json_response: return None diff --git a/eva_cttv_pipeline/trait_mapping/utils.py b/eva_cttv_pipeline/trait_mapping/utils.py index 485b47e4..8c10edb3 100644 --- a/eva_cttv_pipeline/trait_mapping/utils.py +++ b/eva_cttv_pipeline/trait_mapping/utils.py @@ -1,37 +1,14 @@ import logging import requests +from retry import retry logger = logging.getLogger(__package__) -def json_request(url: str, payload: dict) -> dict: - """Makes a GET request with the specified URL and payload, attempts to parse the result as a JSON string and - return it as a dictionary, on failure raises an exception.""" - result = requests.get(url, data=payload) - assert result.ok +@retry(exceptions=(ConnectionError, requests.RequestException), logger=logger, + tries=4, delay=2, backoff=1.2, jitter=(1, 3)) +def json_request(url: str, payload: dict = None, method=requests.get) -> dict: + """Makes a request of a specified type (by default GET) with the specified URL and payload, attempts to parse the + result as a JSON string and return it as a dictionary, on failure raises an exception.""" + result = method(url, data=payload) + result.raise_for_status() return result.json() - - -def retry_helper(function, kwargs: dict, retry_count: int): - """ - Given a function, make a `retry_count` number of attempts to call it until it returns a value without raising - an exception, and subsequently return this value. If all attempts to run the function are unsuccessful, return None. - - :param function: Function that could need multiple attempts to return a value - :param kwargs: Dictionary with function's keyword arguments - :param retry_count: Number of attempts to make - :return: Returned value of the function. - """ - for retry_num in range(retry_count): - try: - return function(**kwargs) - except Exception as e: - logger.warning("Attempt {}: failed running function {} with kwargs {}".format(retry_num, function, kwargs)) - logger.warning(e) - logger.warning("Error on last attempt, skipping") - return None - - -def request_retry_helper(url: str, payload: dict = None, retry_count: int = 4): - """Makes a GET request with the specified URL and payload via `json_request`, makes several attempts and handles - the exceptions via `retry_helper`.""" - return retry_helper(json_request, {'url': url, 'payload': payload}, retry_count) diff --git a/eva_cttv_pipeline/trait_mapping/zooma.py b/eva_cttv_pipeline/trait_mapping/zooma.py index 17a1a79a..ed643cad 100644 --- a/eva_cttv_pipeline/trait_mapping/zooma.py +++ b/eva_cttv_pipeline/trait_mapping/zooma.py @@ -3,7 +3,7 @@ import logging from eva_cttv_pipeline.trait_mapping.ols import get_ontology_label_from_ols, is_current_and_in_efo, is_in_efo -from eva_cttv_pipeline.trait_mapping.utils import request_retry_helper +from eva_cttv_pipeline.trait_mapping.utils import json_request logger = logging.getLogger(__package__) @@ -98,7 +98,7 @@ def get_zooma_results(trait_name: str, filters: dict, zooma_host: str) -> list: """ url = build_zooma_query(trait_name, filters, zooma_host) - zooma_response_list = request_retry_helper(url) + zooma_response_list = json_request(url) if zooma_response_list is None: return [] diff --git a/requirements.txt b/requirements.txt index 46217520..63578444 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ query-string==0.0.4 request==0.0.2 requests>=2.20.0 requests_mock +retry==0.9.2 setuptools-git==1.1 sh==1.11 six==1.12.0 From 9ad1a92fe64b5b53753df0f02aead5fbed604093 Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Fri, 11 Oct 2019 12:25:01 +0100 Subject: [PATCH 2/8] Logs: set default level to DEBUG --- eva_cttv_pipeline/evidence_string_generation/__init__.py | 2 +- eva_cttv_pipeline/trait_mapping/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/eva_cttv_pipeline/evidence_string_generation/__init__.py b/eva_cttv_pipeline/evidence_string_generation/__init__.py index 7174042f..4912c701 100644 --- a/eva_cttv_pipeline/evidence_string_generation/__init__.py +++ b/eva_cttv_pipeline/evidence_string_generation/__init__.py @@ -1,4 +1,4 @@ import logging logging.basicConfig() logger = logging.getLogger(__package__) -logger.setLevel(level=logging.INFO) +logger.setLevel(level=logging.DEBUG) diff --git a/eva_cttv_pipeline/trait_mapping/__init__.py b/eva_cttv_pipeline/trait_mapping/__init__.py index 7174042f..4912c701 100644 --- a/eva_cttv_pipeline/trait_mapping/__init__.py +++ b/eva_cttv_pipeline/trait_mapping/__init__.py @@ -1,4 +1,4 @@ import logging logging.basicConfig() logger = logging.getLogger(__package__) -logger.setLevel(level=logging.INFO) +logger.setLevel(level=logging.DEBUG) From c5d74a2c836eb2d8c62bb0d1432d5d643d8385e2 Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Fri, 11 Oct 2019 12:25:19 +0100 Subject: [PATCH 3/8] Logs: remove double warning for the same error --- eva_cttv_pipeline/trait_mapping/zooma.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/eva_cttv_pipeline/trait_mapping/zooma.py b/eva_cttv_pipeline/trait_mapping/zooma.py index ed643cad..9c800d86 100644 --- a/eva_cttv_pipeline/trait_mapping/zooma.py +++ b/eva_cttv_pipeline/trait_mapping/zooma.py @@ -112,8 +112,6 @@ def get_zooma_results(trait_name: str, filters: dict, zooma_host: str) -> list: zooma_mapping.ontology_label = label else: # If no label is returned (because OLS failed to provide it), keep the existing one from ZOOMA - logger.warning(("Couldn't retrieve ontology label from OLS for trait '{}', using label specified " - "by ZOOMA instead").format(trait_name)) zooma_mapping.ontology_label = zooma_result.zooma_label uri_is_current_and_in_efo = is_current_and_in_efo(zooma_mapping.uri) From f145fd848e19edcfb298f4590c95ca4f05893990 Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Fri, 11 Oct 2019 12:25:41 +0100 Subject: [PATCH 4/8] Logs: do not show warnings for expected errors (MedGen and OMIM are not in OLS) --- eva_cttv_pipeline/trait_mapping/ols.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/eva_cttv_pipeline/trait_mapping/ols.py b/eva_cttv_pipeline/trait_mapping/ols.py index a97fe61b..7b655419 100644 --- a/eva_cttv_pipeline/trait_mapping/ols.py +++ b/eva_cttv_pipeline/trait_mapping/ols.py @@ -37,7 +37,8 @@ def get_ontology_label_from_ols(ontology_uri: str) -> str: # If the '_embedded' section is missing from the response, it means that the term is not found in OLS if '_embedded' not in json_response: - logger.warning('OLS queried OK but did not return any results for URL {}'.format(url)) + if '/medgen/' not in url and '/omim/' not in url: + logger.warning('OLS queried OK but did not return any results for URL {}'.format(url)) return None # Go through all terms found by the requested identifier and try to find the one where the _identifier_ and the @@ -48,7 +49,8 @@ def get_ontology_label_from_ols(ontology_uri: str) -> str: if term["is_defining_ontology"]: return term["label"] - logger.warning('OLS queried OK, but there is no defining ontology in its results for URL {}'.format(url)) + if '/medgen/' not in url and '/omim/' not in url: + logger.warning('OLS queried OK, but there is no defining ontology in its results for URL {}'.format(url)) return None From da7d18895476d4140564af96b21fc982544728e6 Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Fri, 11 Oct 2019 12:25:53 +0100 Subject: [PATCH 5/8] Increase memory for trait mapping; remove --unattended flag --- bin/trait_mapping.py | 5 +---- docs/submit-opentargets-batch.md | 6 +++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/bin/trait_mapping.py b/bin/trait_mapping.py index f17f9ca7..68cf0634 100644 --- a/bin/trait_mapping.py +++ b/bin/trait_mapping.py @@ -9,7 +9,7 @@ def launch(): main.main(parser.input_filepath, parser.output_mappings_filepath, parser.output_curation_filepath, parser.filters, parser.zooma_host, - parser.oxo_target_list, parser.oxo_distance, parser.unattended) + parser.oxo_target_list, parser.oxo_distance) class ArgParser: @@ -39,8 +39,6 @@ def __init__(self, argv): help="target ontologies to use with OxO") parser.add_argument("-d", dest="oxo_distance", default=3, help="distance to use to query OxO.") - parser.add_argument('-u', dest="unattended", action='store_true', - help="unattended launch, hide ETA estimates") args = parser.parse_args(args=argv[1:]) @@ -55,7 +53,6 @@ def __init__(self, argv): self.zooma_host = args.zooma_host self.oxo_target_list = [target.strip() for target in args.oxo_target_list.split(",")] self.oxo_distance = args.oxo_distance - self.unattended = args.unattended if __name__ == '__main__': diff --git a/docs/submit-opentargets-batch.md b/docs/submit-opentargets-batch.md index bb98b06e..087cf0b0 100644 --- a/docs/submit-opentargets-batch.md +++ b/docs/submit-opentargets-batch.md @@ -87,7 +87,7 @@ A new Java package will be generated in the directory `clinvar-xml-parser/src/ma Here we transform ClinVar's XML file into a JSON file which can be parsed by the downstream tools, using an XML parser which we (if necessary) updated during the previous step. ```bash -cd ${CODE_ROOT} && ${BSUB_CMDLINE} -M 4G \ +cd ${CODE_ROOT} && ${BSUB_CMDLINE} -n 8 -M 4G \ -o ${BATCH_ROOT}/logs/convert_clinvar_files.out \ -e ${BATCH_ROOT}/logs/convert_clinvar_files.err \ java -jar ${CODE_ROOT}/clinvar-xml-parser/target/clinvar-parser-1.0-SNAPSHOT-jar-with-dependencies.jar \ @@ -168,10 +168,10 @@ The TSV file eventually returned by OpenTargets has these columns: See information about the trait mapping pipeline [here](trait-mapping-pipeline.md). It is run with the following command: ```bash -cd ${CODE_ROOT} && ${BSUB_CMDLINE} \ +cd ${CODE_ROOT} && ${BSUB_CMDLINE} -M 4G \ -o ${BATCH_ROOT}/logs/trait_mapping.out \ -e ${BATCH_ROOT}/logs/trait_mapping.err \ - python bin/trait_mapping.py -u \ + python bin/trait_mapping.py \ -i ${BATCH_ROOT}/clinvar/clinvar.filtered.json.gz \ -o ${BATCH_ROOT}/trait_mapping/automated_trait_mappings.tsv \ -c ${BATCH_ROOT}/trait_mapping/traits_requiring_curation.tsv From f803c2007aef887841602d1b60332e296c9bd055 Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Fri, 11 Oct 2019 12:26:08 +0100 Subject: [PATCH 6/8] Tolerate OxO HTTP errors (known bug in OxO) --- eva_cttv_pipeline/trait_mapping/oxo.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/eva_cttv_pipeline/trait_mapping/oxo.py b/eva_cttv_pipeline/trait_mapping/oxo.py index 30a52959..9862ba3b 100644 --- a/eva_cttv_pipeline/trait_mapping/oxo.py +++ b/eva_cttv_pipeline/trait_mapping/oxo.py @@ -1,10 +1,11 @@ from functools import total_ordering, lru_cache import logging import re +import requests from eva_cttv_pipeline.trait_mapping.ols import get_ontology_label_from_ols, is_in_efo from eva_cttv_pipeline.trait_mapping.ols import is_current_and_in_efo -from eva_cttv_pipeline.trait_mapping.utils import request_retry_helper +from eva_cttv_pipeline.trait_mapping.utils import json_request logger = logging.getLogger(__package__) @@ -206,7 +207,13 @@ def get_oxo_results(id_list: list, target_list: list, distance: int) -> list: """ url = "https://www.ebi.ac.uk/spot/oxo/api/search?size=5000" payload = build_oxo_payload(id_list, target_list, distance) - oxo_response = request_retry_helper(url, payload) + try: + oxo_response = json_request(url, payload, requests.post) + except requests.HTTPError: + # Sometimes, OxO fails to process a completely valid request even after several attempts. + # See https://github.com/EBISPOT/OXO/issues/26 for details + logger.error('OxO failed to process request for id_list {} (probably a known bug in OxO)'.format(id_list)) + return [] if oxo_response is None: return [] From 2f2cb853e2f58d21b2b306adbec68e04e80a93f8 Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Fri, 11 Oct 2019 12:26:32 +0100 Subject: [PATCH 7/8] OxO fix: do not try to query with an empty ID list --- eva_cttv_pipeline/trait_mapping/main.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/eva_cttv_pipeline/trait_mapping/main.py b/eva_cttv_pipeline/trait_mapping/main.py index 1fc020c4..65a9cdb4 100644 --- a/eva_cttv_pipeline/trait_mapping/main.py +++ b/eva_cttv_pipeline/trait_mapping/main.py @@ -45,8 +45,8 @@ def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: "distance" parameter. :return: The original trait after querying Zooma and possibly OxO, with any results found. """ - logger.debug('Processing trait {}'.format(trait.name)) + trait.zooma_result_list = get_zooma_results(trait.name, filters, zooma_host) trait.process_zooma_results() if (trait.is_finished @@ -55,15 +55,14 @@ def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: for mapping in trait.zooma_result_list for entry in mapping.mapping_list])): return trait + uris_for_oxo_set = get_uris_for_oxo(trait.zooma_result_list) - if len(uris_for_oxo_set) == 0: - return trait oxo_input_id_list = uris_to_oxo_format(uris_for_oxo_set) + if len(oxo_input_id_list) == 0: + return trait trait.oxo_result_list = get_oxo_results(oxo_input_id_list, oxo_target_list, oxo_distance) - if not trait.oxo_result_list: - logger.warning('No OxO mapping for trait {}'.format(trait.name)) - + logger.debug('No OxO mapping for trait {}'.format(trait.name)) trait.process_oxo_mappings() return trait From 27e0588b7933f64d333dc057c9c2e10d89107589 Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Fri, 11 Oct 2019 12:26:50 +0100 Subject: [PATCH 8/8] Process traits in parallel for faster turnaround times --- eva_cttv_pipeline/trait_mapping/main.py | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/eva_cttv_pipeline/trait_mapping/main.py b/eva_cttv_pipeline/trait_mapping/main.py index 65a9cdb4..454a4c07 100644 --- a/eva_cttv_pipeline/trait_mapping/main.py +++ b/eva_cttv_pipeline/trait_mapping/main.py @@ -1,7 +1,7 @@ from collections import Counter import csv import logging -import progressbar +import multiprocessing from eva_cttv_pipeline.trait_mapping.output import output_trait from eva_cttv_pipeline.trait_mapping.oxo import get_oxo_results @@ -30,8 +30,7 @@ def get_uris_for_oxo(zooma_result_list: list) -> set: return uri_set -def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: list, - oxo_distance: int) -> Trait: +def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: list, oxo_distance: int) -> Trait: """ Process a single trait. Find any mappings in Zooma. If there are no high confidence Zooma mappings that are in EFO then query OxO with any high confidence mappings not in EFO. @@ -68,11 +67,12 @@ def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: return trait -def main(input_filepath, output_mappings_filepath, output_curation_filepath, filters, zooma_host, - oxo_target_list, oxo_distance, unattended): +def main(input_filepath, output_mappings_filepath, output_curation_filepath, filters, zooma_host, oxo_target_list, + oxo_distance): logger.info('Started parsing trait names') trait_names_list = parse_trait_names(input_filepath) trait_names_counter = Counter(trait_names_list) + logger.info("Loaded {} trait names".format(len(trait_names_counter))) with open(output_mappings_filepath, "w", newline='') as mapping_file, \ open(output_curation_filepath, "wt") as curation_file: @@ -80,19 +80,19 @@ def main(input_filepath, output_mappings_filepath, output_curation_filepath, fil mapping_writer.writerow(["#clinvar_trait_name", "uri", "label"]) curation_writer = csv.writer(curation_file, delimiter="\t") - trait_names_iterator = trait_names_counter.items() - if not unattended: - progress = progressbar.ProgressBar(max_value=len(trait_names_counter), - widgets=[progressbar.AdaptiveETA(samples=1000)]) - trait_names_iterator = progress(trait_names_iterator) - - logger.info("Loaded {} trait names".format(len(trait_names_counter))) - for i, (trait_name, freq) in enumerate(trait_names_iterator): - trait = Trait(trait_name, freq) - trait = process_trait(trait, filters, zooma_host, oxo_target_list, - oxo_distance) + logger.info('Processing trait names in parallel') + trait_list = [Trait(trait_name, freq) for trait_name, freq in trait_names_counter.items()] + trait_process_pool = multiprocessing.Pool(processes=12) + + processed_trait_list = [ + trait_process_pool.apply( + process_trait, + args=(trait, filters, zooma_host, oxo_target_list, oxo_distance) + ) + for trait in trait_list + ] + + for trait in processed_trait_list: output_trait(trait, mapping_writer, curation_writer) - if unattended and i % 100 == 0: - logger.info("Processed {} records".format(i)) logger.info('Finished processing trait names')