From f8d0606743866b4a8244b1cd9b8ae2d55a2f4754 Mon Sep 17 00:00:00 2001 From: Sigve Nakken Date: Sun, 4 Feb 2024 00:04:43 +0100 Subject: [PATCH] minor updates --- pcgr/cpsr.py | 6 ++- pcgr/pcgr_vars.py | 6 +-- pcgrr/R/input_data.R | 2 +- pcgrr/R/reference_data.R | 92 +++++++++++++++++++++++++++++++++------ pcgrr/data-raw/data-raw.R | 12 +++-- 5 files changed, 94 insertions(+), 24 deletions(-) diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py index b14ca629..ab33ef4d 100755 --- a/pcgr/cpsr.py +++ b/pcgr/cpsr.py @@ -175,7 +175,7 @@ def run_cpsr(conf_options, cpsr_paths): else: logger.info(f"Diagnostic-grade genes in virtual panels (GE PanelApp): " + \ f"{'ON' if conf_options['gene_panel']['diagnostic_grade_only'] else 'OFF'}") - logger.info(f"Include incidental findings (ACMG recommended list v3.1): " + \ + logger.info(f"Include incidental findings (ACMG recommended list v3.2): " + \ f"{'ON' if conf_options['variant_classification']['secondary_findings'] else 'OFF'}") logger.info(f"Include low to moderate cancer risk variants from genome-wide association studies: " + \ f"{'ON' if conf_options['variant_classification']['gwas_findings'] else 'OFF'}") @@ -204,6 +204,7 @@ def run_cpsr(conf_options, cpsr_paths): output_vcf = vep_vcf) logger = getlogger('cpsr-vep') + #print(str(vep_command["main"])) logger.info(( f"CPSR - STEP 1: Basic variant annotation with Variant Effect Predictor (version {pcgr_vars.VEP_VERSION}, " @@ -223,6 +224,7 @@ def run_cpsr(conf_options, cpsr_paths): check_subprocess(logger, vep_command["tabix"], debug) logger.info("Finished cpsr-vep") print('----') + #exit(0) ## CPSR|vcfanno - run vcfanno on query VCF with a number of relevant annotated VCFs logger = getlogger('cpsr-vcfanno') @@ -289,7 +291,7 @@ def run_cpsr(conf_options, cpsr_paths): outfile.write(yaml.dump(yaml_data)) outfile.close() - variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False) + variant_set.fillna('.').to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False) if not debug: remove_file(output_pass_vcf2tsv_gz) diff --git a/pcgr/pcgr_vars.py b/pcgr/pcgr_vars.py index 43ec132a..aff86bc3 100644 --- a/pcgr/pcgr_vars.py +++ b/pcgr/pcgr_vars.py @@ -3,7 +3,7 @@ from pcgr._version import __version__ PCGR_VERSION = __version__ -DB_VERSION = '20231212' +DB_VERSION = '20240203' ## MISCELLANEOUS NCBI_BUILD_MAF = 'GRCh38' @@ -12,13 +12,13 @@ RECOMMENDED_N_MUT_SIGNATURE = 200 ## GENCODE -GENCODE_VERSION = {'grch38': 44,'grch37': 19} +GENCODE_VERSION = {'grch38': 45,'grch37': 19} ## vcfanno VCFANNO_MAX_PROC = 15 ## VEP settings/versions -VEP_VERSION = '110' +VEP_VERSION = '111' VEP_ASSEMBLY = {'grch38': 'GRCh38','grch37': 'GRCh37'} VEP_MIN_FORKS = 1 VEP_MAX_FORKS = 8 diff --git a/pcgrr/R/input_data.R b/pcgrr/R/input_data.R index 94a2f63e..399c9ae3 100644 --- a/pcgrr/R/input_data.R +++ b/pcgrr/R/input_data.R @@ -219,7 +219,7 @@ load_dna_variants <- function( .data$EVIDENCE_ID, sep=";" ) |> dplyr::group_by( - .data$EVIDENCE_ID + EVIDENCE_ID ) |> dplyr::summarise( CITATION = paste( diff --git a/pcgrr/R/reference_data.R b/pcgrr/R/reference_data.R index a6c5f9e6..32013882 100644 --- a/pcgrr/R/reference_data.R +++ b/pcgrr/R/reference_data.R @@ -115,6 +115,7 @@ load_reference_data <- function( pcgr_ref_data[["gene"]][["panel"]] <- data.frame() pcgr_ref_data[["gene"]][["cpg"]] <- data.frame() pcgr_ref_data[['gene']][['gene_xref']] <- data.frame() + pcgr_ref_data[['gene']][['transcript_xref']] <- data.frame() cpg_tsv_fname <- file.path( pcgr_db_assembly_dir, "gene", "tsv", @@ -156,6 +157,23 @@ load_reference_data <- function( "gene_transcript_xref.tsv.gz" ) check_file_exists(gene_xref_tsv_fname) + + pcgr_ref_data[['gene']][['transcript_xref']] <- as.data.frame( + readr::read_tsv(gene_xref_tsv_fname, show_col_types = F)) |> + dplyr::select( + c("chrom", + "ensembl_gene_id", + "ensembl_transcript_id", + "gencode_transcript_biotype", + "gene_biotype" + ) + ) |> + dplyr::distinct() + + colnames(pcgr_ref_data[['gene']][['transcript_xref']]) <- + toupper(colnames(pcgr_ref_data[['gene']][['transcript_xref']])) + + pcgr_ref_data[['gene']][['gene_xref']] <- as.data.frame( readr::read_tsv(gene_xref_tsv_fname, show_col_types = F)) |> dplyr::select( @@ -175,7 +193,7 @@ load_reference_data <- function( "cancergene_evidence") ) |> dplyr::rename( - genename = .data$name + genename = name ) |> dplyr::mutate( entrezgene = as.character(.data$entrezgene) @@ -251,6 +269,27 @@ load_reference_data <- function( toupper(colnames(pcgr_ref_data[['variant']][['gwas']])) + pcgr_ref_data[['variant']][['varstats']] <- list() + ## Get variant statistics + for(vardb in c('clinvar','gwas','tcga', + 'gnomad_non_cancer','dbmts', + 'dbnsfp')){ + varstats_fname <- + file.path( + pcgr_db_assembly_dir, "variant", "vcf", vardb, + paste0(vardb,".vcf_varstats.tsv") + ) + + if(file.exists(varstats_fname)){ + pcgr_ref_data[['variant']][['varstats']][[vardb]] <- + as.data.frame( + readr::read_tsv( + varstats_fname, show_col_types = F)) + } + + } + + ## 3. Phenotype ontologies @@ -294,54 +333,79 @@ load_reference_data <- function( file.path( pcgr_db_assembly_dir, "misc", "other", "msi_classification", - "msi_classification.rds" + "tcga_msi_classifier.rds" ) check_file_exists(msi_model_rds) pcgr_ref_data[['msi']] <- readRDS(msi_model_rds) + pcgr_ref_data[['misc']] <- list() ## 5. Miscellaneous for(elem in c('tmb', 'mutational_signature', - 'pathway')){ + 'pathway', + 'hotspot', + 'protein_domain')){ fname_misc <- file.path( pcgr_db_assembly_dir, "misc", "tsv", elem, paste0(elem,".tsv.gz") ) + + # if(elem == 'hotspot'){ + # fname_misc <- file.path( + # pcgr_db_assembly_dir, "misc", "tsv", elem, + # paste0(elem,".tsv.gz") + # ) + # } + check_file_exists(fname_misc) - pcgr_ref_data[[elem]] <- as.data.frame( + pcgr_ref_data[['misc']][[elem]] <- as.data.frame( readr::read_tsv( fname_misc, show_col_types = F, na = ".") ) - colnames(pcgr_ref_data[[elem]]) <- - toupper(colnames(pcgr_ref_data[[elem]])) + colnames(pcgr_ref_data[['misc']][[elem]]) <- + toupper(colnames(pcgr_ref_data[['misc']][[elem]])) } - tmp = pcgr_ref_data[['pathway']] - pcgr_ref_data[['pathway']] <- list() - pcgr_ref_data[['pathway']][['long']] <- tmp - pcgr_ref_data[['pathway']][['wide']] <- as.data.frame( + tmp = pcgr_ref_data[['misc']][['pathway']] + pcgr_ref_data[['misc']][['pathway']] <- list() + pcgr_ref_data[['misc']][['pathway']][['long']] <- tmp + pcgr_ref_data[['misc']][['pathway']][['wide']] <- as.data.frame( tmp |> dplyr::group_by(.data$GENE_ID) |> dplyr::summarise(LINK = paste(.data$URL_HTML, collapse = ", "))) ## 6. Drugs + + pcgr_ref_data[['drug']] <- list() drug_tsv_fname <- file.path( pcgr_db_assembly_dir, "drug", - "tsv", "drug.tsv.gz" + "tsv", "drug_targeted.tsv.gz" ) check_file_exists(drug_tsv_fname) - pcgr_ref_data[['drug']] <- as.data.frame( + pcgr_ref_data[['drug']][['targeted']] <- as.data.frame( readr::read_tsv(drug_tsv_fname, show_col_types = F, na = ".") ) - colnames(pcgr_ref_data[['drug']]) <- - toupper(colnames(pcgr_ref_data[['drug']])) + colnames(pcgr_ref_data[['drug']][['targeted']]) <- + toupper(colnames(pcgr_ref_data[['drug']][['targeted']])) + + drug_all_tsv_fname <- + file.path( + pcgr_db_assembly_dir, "drug", + "tsv", "drug_all.tsv.gz" + ) + check_file_exists(drug_all_tsv_fname) + pcgr_ref_data[['drug']][['all']] <- as.data.frame( + readr::read_tsv(drug_all_tsv_fname, show_col_types = F, na = ".") + ) + colnames(pcgr_ref_data[['drug']][['all']]) <- + toupper(colnames(pcgr_ref_data[['drug']][['all']])) ## 7. Biomarkers pcgr_ref_data[['biomarker']] <- list() diff --git a/pcgrr/data-raw/data-raw.R b/pcgrr/data-raw/data-raw.R index 50860ff0..0dfc1a56 100755 --- a/pcgrr/data-raw/data-raw.R +++ b/pcgrr/data-raw/data-raw.R @@ -15,10 +15,14 @@ for (c in c("pathogenicity", "clinical_evidence", "tier", } if (c == "clinical_evidence") { color_palette[[c]][["levels"]] <- - c("A: Validated", "A: FDA/NCCN/ELN guidelines", - "B: Clinical evidence", "B1: Clinical evidence: late trials", - "B2: Clinical evidence: early trials", "C: Case study", - "D: Preclinical evidence", "E: Indirect evidence") + c("A: Validated", + "A: FDA/NCCN/ELN guidelines", + "B: Clinical evidence", + "B1: Clinical evidence: late trials", + "B2: Clinical evidence: early trials", + "C: Case study", + "D: Preclinical evidence", + "E: Indirect evidence") color_palette[[c]][["values"]] <- c("#009E73", "#009E73", "#56B4E9", "#56B4E9", "#56B4E9", "#0072B2", "#E69F00", "#F0E442")