From 17ec504d7f7663afe3653caf68719392857580c7 Mon Sep 17 00:00:00 2001 From: Ming Ying Date: Thu, 2 Apr 2020 11:12:36 -0400 Subject: [PATCH] Added CTDC SBG manifest generation script. --- ctdc_sbg_manifect.py | 150 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 ctdc_sbg_manifect.py diff --git a/ctdc_sbg_manifect.py b/ctdc_sbg_manifect.py new file mode 100644 index 00000000..9b7b3ac5 --- /dev/null +++ b/ctdc_sbg_manifect.py @@ -0,0 +1,150 @@ +import argparse +import csv +import os + +from neo4j import GraphDatabase + +from bento.common.utils import get_logger, get_time_stamp, LOG_PREFIX + +PSWD_ENV = 'NEO_PASSWORD' +if LOG_PREFIX not in os.environ: + os.environ[LOG_PREFIX] = 'CTDC_SBG_Manifest' + +SPECIMEN_ID = 'Specimen_ID' + +def generate(tx, log): + query = ''' + MATCH(t:clinical_trial)<--(a:arm)<--(c:case)<--(s:specimen)<--(n:nucleic_acid)<--(sa:sequencing_assay)<--(f:file), + (s)<-[*]-(ar:assignment_report), (sa)<--(v:variant_report) + WITH DISTINCT f, t, a, c, ar, s, n, sa, v + OPTIONAL MATCH (s)<--(i_pten:ihc_assay_report) + WHERE i_pten.ihc_test_gene = 'PTEN' + WITH DISTINCT f, t, a, c, ar, s, n, sa, v, i_pten + OPTIONAL MATCH (s)<--(i_msh2:ihc_assay_report) + WHERE i_msh2.ihc_test_gene = 'MSH2' + WITH DISTINCT f, t, a, c, ar, s, n, sa, v, i_pten, i_msh2 + OPTIONAL MATCH (s)<--(i_mlh1:ihc_assay_report) + WHERE i_mlh1.ihc_test_gene = 'MLH1' + WITH DISTINCT f, t, a, c, ar, s, n, sa, v, i_pten, i_msh2, i_mlh1 + OPTIONAL MATCH (s)<--(i_rb:ihc_assay_report) + WHERE i_rb.ihc_test_gene = 'RB' + WITH DISTINCT f, t, a, c, ar, s, n, sa, v, i_pten, i_msh2, i_mlh1, i_rb + RETURN t.clinical_trial_id AS Trial_ID, t.clinical_trial_designation AS Trial_Code, + a.arm_id AS Treatment_Arm, + c.case_id AS Case_ID, c.gender AS Gender, c.race AS Race, c.ethnicity AS Ethnicity, c.disease AS Diagnosis, + c.ctep_category AS CTEP_Category, c.ctep_subcategory AS CTEP_Sub_Category, c.meddra_code AS MedDRA_Code, + c.prior_drugs AS Prior_Drugs, + s.specimen_id AS Specimen_ID, + s.specimen_type AS Specimen_Type, + n.aliquot_id AS Aliquot_ID, + coalesce(i_pten.ihc_test_result, 'UNKNOWN') AS PTEN_IHC_Status, + coalesce(i_mlh1.ihc_test_result, 'UNKNOWN') AS MLH1_IHC_Status, + coalesce(i_msh2.ihc_test_result, 'UNKNOWN') AS MSH2_IHC_Status, + coalesce(i_rb.ihc_test_result, 'UNKNOWN') AS RB_IHC_Status, + ar.assignment_outcome AS Assignment_Outcome, + sa.experimental_method + ':' + CASE f.file_type + WHEN 'Aligned DNA reads file' THEN ' DNA' + WHEN 'Aligned RNA reads file' THEN ' RNA' + WHEN 'Variants file' THEN ' DNA/RNA' + WHEN 'Index file' THEN ' ' + END AS `Experimental_strategy`, + sa.platform AS Platform, + v.reference_genome AS Reference_genome, + f.uuid AS File_UUID, f.file_name AS File_Name, f.file_type AS File_Type, f.file_size AS File_Size, + f.md5sum AS md5sum, f.file_location AS File_Location, 'dg.4DFC/' + f.uuid AS GUID + ORDER BY Treatment_Arm, Case_ID, File_Name + ''' + + fieldnames = [ + "Trial_ID", + "Trial_Code", + "Treatment_Arm", + "Case_ID", + "Gender", + "Race", + "Ethnicity", + "Diagnosis", + "CTEP_Category", + "CTEP_Sub_Category", + "MedDRA_Code", + "Prior_Drugs", + "Specimen_ID", + "Specimen_Type", + "Aliquot_ID", + "PTEN_IHC_Status", + "MLH1_IHC_Status", + "MSH2_IHC_Status", + "RB_IHC_Status", + "Assignment_Outcome", + "Experimental_strategy", + "Platform", + "Reference_genome", + "File_UUID", + "File_Name", + "File_Type", + "File_Size", + "md5sum", + "File_Location", + "GUID" + ] + + result = tx.run(query) + manifest_file = f'tmp/CTDC_SBG_Manifest_{get_time_stamp()}.csv' + + with open(manifest_file, 'w') as of: + writer = csv.DictWriter(of, fieldnames=fieldnames) + writer.writeheader() + specimen_list = {} + file_list = [] + line_num = 1 + for obj in result: + line_num += 1 + file_name = obj['File_Name'] + specimen_id = obj[SPECIMEN_ID] + log.info(f'Processing {obj["Case_ID"]}: {specimen_id}: {file_name}') + specimen = specimen_list.get(specimen_id, {}) + if file_name in specimen: + raise Exception(f'Line: {line_num} - Duplicated file name: "{file_name}"') + + data = obj.data() + specimen[file_name] = data + file_list.append(data) + specimen_list[specimen_id] = specimen + + for file in file_list: + specimen = specimen_list[file[SPECIMEN_ID]] + if file['File_Type'] == 'Index file': + update_experimental_strategy(file, specimen) + + log.info(f'Saving {file["Case_ID"]}: {file["File_Name"]}') + writer.writerow(file) + + log.info(f'Manifest saved to "{manifest_file}"') + + +def update_experimental_strategy(file, specimen): + file_name = file['File_Name'] + for name, obj in specimen.items(): + if name == file_name: + continue + elif file_name.startswith(name): + file['Experimental_strategy'] = obj['Experimental_strategy'] + return + +def main(): + parser = argparse.ArgumentParser(description='Generate CTDC SBG manifest') + parser.add_argument('-i', '--uri', help='Neo4j uri like bolt://12.34.56.78:7687') + parser.add_argument('-u', '--user', help='Neo4j user', default='neo4j') + parser.add_argument('-p', '--password', help='Neo4j password', default=os.environ[PSWD_ENV]) + args = parser.parse_args() + + log = get_logger('CTDC_SBG_Manifest') + + with GraphDatabase.driver(args.uri, auth=(args.user, args.password)) as driver: + with driver.session() as session: + tx = session.begin_transaction() + generate(tx, log) + + +if __name__ == '__main__': + main() \ No newline at end of file