diff --git a/README.md b/README.md index 0322c47f..c4bba79e 100755 --- a/README.md +++ b/README.md @@ -28,6 +28,10 @@ PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](http://ca ### Top News +- *July 2024*: **2.0.1 release** + - patch with bug fix for mitochondrial input variants ([pr245](https://github.com/sigven/pcgr/pull/245)) + - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) + - *June 2024*: **2.0.0 release** - Details in [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) - Massive reference data bundle upgrade, new report layout, oncogenicity classification++ diff --git a/pcgr/annoutils.py b/pcgr/annoutils.py index bd27e66f..acf8126b 100755 --- a/pcgr/annoutils.py +++ b/pcgr/annoutils.py @@ -199,7 +199,7 @@ def threeToOneAA(aa_change): return aa_change -def assign_cds_exon_intron_annotations(csq_record): +def assign_cds_exon_intron_annotations(csq_record, logger): csq_record['CODING_STATUS'] = 'noncoding' @@ -274,13 +274,13 @@ def assign_cds_exon_intron_annotations(csq_record): cds_pos = cds_pos_full.split('-')[0] if cds_pos.isdigit(): cds_pos = int(cds_pos) - else: - print('BALLE1 - ' + str(cds_pos_full) + ' - ' + str(cds_pos)) + #else: + # logger.warning(f'Could not determine variant CDS position from VEP annotation - ({csq_record["CDS_position"]})') else: if cds_pos_full.isdigit(): cds_pos = int(cds_pos_full) - else: - print('BALLE2 - ' + str(cds_pos_full) + ' - ' + str(cds_pos)) + #else: + # logger.warning(f'Could not determine variant CDS position from VEP annotation - ({csq_record["CDS_position"]})') if int(cds_pos) > -1 and int(cds_pos) <= int(cds_length): csq_record['CDS_RELATIVE_POSITION'] = float(cds_pos/cds_length) diff --git a/pcgr/vep.py b/pcgr/vep.py index a1079bb9..0d93f281 100644 --- a/pcgr/vep.py +++ b/pcgr/vep.py @@ -145,7 +145,7 @@ def get_csq_record_annotations(csq_fields, varkey, logger, vep_csq_fields_map, t csq_record['SYMBOL'] = transcript_xref_map[ensembl_transcript_id]['SYMBOL'] 
# Assign coding status, protein change, coding sequence change, last exon/intron status etc as VCF info tags - csq_record = assign_cds_exon_intron_annotations(csq_record) + csq_record = assign_cds_exon_intron_annotations(csq_record, logger) return(csq_record) diff --git a/pcgrr/NAMESPACE b/pcgrr/NAMESPACE index 4bcaac0a..93c7fe14 100644 --- a/pcgrr/NAMESPACE +++ b/pcgrr/NAMESPACE @@ -24,6 +24,7 @@ export(dbsnp_germline_status) export(deduplicate_eitems) export(detect_vcf_sample_name) export(df_string_replace) +export(exclude_non_chrom_variants) export(expand_biomarker_items) export(export_quarto_evars) export(filter_eitems_by_site) @@ -42,7 +43,6 @@ export(get_clin_assocs_cna) export(get_dt_tables) export(get_excel_sheets) export(get_genome_obj) -export(get_ordinary_chromosomes) export(get_prevalent_site_signatures) export(get_valid_chromosomes) export(het_af_germline_status) diff --git a/pcgrr/R/input_data.R b/pcgrr/R/input_data.R index ba67f154..c436245d 100644 --- a/pcgrr/R/input_data.R +++ b/pcgrr/R/input_data.R @@ -55,7 +55,8 @@ load_somatic_cna <- function( .data$ACTIONABILITY_TIER, dplyr::desc(.data$TISSUE_ASSOC_RANK), dplyr::desc(.data$GLOBAL_ASSOC_RANK)) |> - pcgrr::order_variants(pos_var = 'SEGMENT_START') + pcgrr::order_variants(pos_var = 'SEGMENT_START') |> + pcgrr::exclude_non_chrom_variants() pcgrr::log4r_info( "Generating data frame with hyperlinked variant/gene annotations") @@ -193,7 +194,8 @@ load_somatic_snv_indel <- function( ) )) |> dplyr::select(-c("tmp_HGVSc","ENST")) |> - pcgrr::order_variants(pos_var = 'POS') + pcgrr::order_variants(pos_var = 'POS') |> + pcgrr::exclude_non_chrom_variants() ## Tumor-only input diff --git a/pcgrr/R/mutational_signatures.R b/pcgrr/R/mutational_signatures.R index 6e622685..1e5f982b 100644 --- a/pcgrr/R/mutational_signatures.R +++ b/pcgrr/R/mutational_signatures.R @@ -675,9 +675,7 @@ generate_report_data_rainfall <- function(variant_set, build = NULL) { pcg_report_rainfall <- pcgrr::init_rainfall_content() - if 
(NROW(variant_set) == 0) { - return(pcg_report_rainfall) - } + invisible(assertthat::assert_that (assertthat::is.flag(autosomes), @@ -695,6 +693,10 @@ generate_report_data_rainfall <- function(variant_set, "') not allowed, available reference build values are:", "'grch37' or 'grch38'"))) + if (NROW(variant_set) == 0) { + return(pcg_report_rainfall) + } + pcgrr::log4r_info("------") pcgrr::log4r_info(paste0("Calculating data for rainfall plot")) @@ -745,12 +747,6 @@ generate_report_data_rainfall <- function(variant_set, stringr::str_replace(.data$MUTATION_TYPE, ":[A-Z]>[A-Z]$", ""), as.character(.data$MUTATION_TYPE))) |> - # dplyr::mutate( - # MUTATION_TYPE = - # dplyr::if_else(stringr::str_detect(.data$MUTATION_TYPE, "^A>"), - # stringr::str_replace(.data$MUTATION_TYPE, - # "^[A-Z]>[A-Z]:", ""), - # as.character(.data$MUTATION_TYPE))) |> pcgrr::sort_chromosomal_segments() bsg <- get_genome_obj(build) diff --git a/pcgrr/R/utils.R b/pcgrr/R/utils.R index c3ea3c71..6331c218 100644 --- a/pcgrr/R/utils.R +++ b/pcgrr/R/utils.R @@ -258,7 +258,7 @@ get_valid_chromosomes <- function(vcf_data_df, #' @return vcf_df data frame with mutations from nuclear chromosomes only #' #' @export -get_ordinary_chromosomes <- function(vcf_df, chrom_var = "CHROM") { +exclude_non_chrom_variants <- function(vcf_df, chrom_var = "CHROM") { invisible(assertthat::assert_that( is.data.frame(vcf_df), msg = "Argument 'vcf_df' must be of type data.frame")) @@ -268,15 +268,19 @@ get_ordinary_chromosomes <- function(vcf_df, chrom_var = "CHROM") { dplyr::mutate( !!rlang::sym(chrom_var) := as.character(!!rlang::sym(chrom_var))) n_before_exclusion <- nrow(vcf_df) - nuc_chromosomes_df <- data.frame(c(as.character(seq(1:22)), "X", "Y"), - stringsAsFactors = F) + nuc_chromosomes_df <- data.frame( + c(as.character(seq(1:22)), "X", "Y"), + stringsAsFactors = F) colnames(nuc_chromosomes_df) <- c(chrom_var) - vcf_df <- dplyr::semi_join(vcf_df, nuc_chromosomes_df, by = chrom_var) + vcf_df <- dplyr::semi_join( + 
vcf_df, nuc_chromosomes_df, by = chrom_var) n_after_exclusion <- nrow(vcf_df) - pcgrr::log4r_info( - paste0("Excluding ", - n_before_exclusion - n_after_exclusion, - " variants from non-nuclear chromosomes/scaffolds")) + if(n_before_exclusion - n_after_exclusion > 0){ + pcgrr::log4r_info( + paste0("Excluding n = ", + n_before_exclusion - n_after_exclusion, + " variant(s) from non-nuclear chromosomes/scaffolds")) + } return(vcf_df) } @@ -292,18 +296,27 @@ get_ordinary_chromosomes <- function(vcf_df, chrom_var = "CHROM") { #' @export order_variants <- function( vcf_df, chrom_var = "CHROM", pos_var = "POS") { + stopifnot(is.data.frame(vcf_df) & chrom_var %in% colnames(vcf_df) & pos_var %in% colnames(vcf_df)) - if (nrow(vcf_df) == 0)return(vcf_df) - vcf_df |> - dplyr::mutate(!!rlang::sym(chrom_var) := - factor(!!rlang::sym(chrom_var), - ordered = T, - levels = c(as.character(seq(1:22)), "X", "Y"))) |> - dplyr::arrange(!!rlang::sym(chrom_var), !!rlang::sym(pos_var)) |> - dplyr::mutate(!!rlang::sym(chrom_var) := - as.character(!!rlang::sym(chrom_var))) + if (nrow(vcf_df) == 0){ + return(vcf_df) + } + vcf_df <- vcf_df |> + dplyr::mutate( + !!rlang::sym(chrom_var) := + factor(!!rlang::sym(chrom_var), + ordered = T, + levels = c(as.character(seq(1:22)), "X", "Y"))) |> + dplyr::arrange( + !!rlang::sym(chrom_var), + !!rlang::sym(pos_var)) |> + dplyr::mutate( + !!rlang::sym(chrom_var) := + as.character(!!rlang::sym(chrom_var))) + + return(vcf_df) } diff --git a/pcgrr/man/get_ordinary_chromosomes.Rd b/pcgrr/man/exclude_non_chrom_variants.Rd similarity index 77% rename from pcgrr/man/get_ordinary_chromosomes.Rd rename to pcgrr/man/exclude_non_chrom_variants.Rd index 75c3bd0c..ad0c5668 100644 --- a/pcgrr/man/get_ordinary_chromosomes.Rd +++ b/pcgrr/man/exclude_non_chrom_variants.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils.R -\name{get_ordinary_chromosomes} -\alias{get_ordinary_chromosomes} 
+\name{exclude_non_chrom_variants} +\alias{exclude_non_chrom_variants} \title{Function that excludes genomic aberrations from non-nuclear chromosomes} \usage{ -get_ordinary_chromosomes(vcf_df, chrom_var = "CHROM") +exclude_non_chrom_variants(vcf_df, chrom_var = "CHROM") } \arguments{ \item{vcf_df}{data frame} diff --git a/pcgrr/pkgdown/index.md b/pcgrr/pkgdown/index.md index eca43797..c7bdcdca 100644 --- a/pcgrr/pkgdown/index.md +++ b/pcgrr/pkgdown/index.md @@ -29,6 +29,10 @@ PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](http://ca ### Top News +- *July 2024*: **2.0.1 release** + - patch with bug fix for mitochondrial input variants ([pr245](https://github.com/sigven/pcgr/pull/245)) + - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) + - *June 2024*: **2.0.0 release** - Details in [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) - Massive reference data bundle upgrade, new report layout, oncogenicity classification++ diff --git a/pcgrr/vignettes/CHANGELOG.Rmd b/pcgrr/vignettes/CHANGELOG.Rmd index fa2ef1ee..a1c85bd6 100644 --- a/pcgrr/vignettes/CHANGELOG.Rmd +++ b/pcgrr/vignettes/CHANGELOG.Rmd @@ -46,6 +46,11 @@ sigven <- user("sigven") pdiakumis <- user("pdiakumis") ``` +## v2.0.1 + +- Date: **2024-07-07** +- Fixed bug for chrM variants in input - not properly annotated by VEP, and not correctly processed in `pcgrr`. Any mitochondrial variants found in input VCF are now removed during VCF pre-processing. 
+ ## v2.0.0 - Date: **2024-06-26** diff --git a/pcgrr/vignettes/annotation_resources.Rmd b/pcgrr/vignettes/annotation_resources.Rmd index 41be3124..2fd79ad1 100644 --- a/pcgrr/vignettes/annotation_resources.Rmd +++ b/pcgrr/vignettes/annotation_resources.Rmd @@ -36,11 +36,11 @@ __Genomic biomarkers__ Genomic biomarkers included in PCGR are limited to the following: -* Evidence items for specific markers in CIViC must be *accepted* (*submitted* evidence items are not considered) -* Markers reported at the gene level (e.g. __BRAF mutation__, __BRCA1 oncogenic mutation__) -* Markers reported at the variant level (e.g. __BRAF p.V600E__) +* Evidence items for specific markers in CIViC must be *accepted* (*submitted* evidence items are not considered or shown) +* Markers reported at the exact variant level (e.g. __BRAF p.V600E__, __MET c.3028+1G>T__, __g.7:140753336A>T__) * Markers reported at the codon level (e.g. __KRAS p.G12__) -* Markers reported at the exon/gene level (e.g. __KIT exon 11 mutation__, __BRCA1/2 oncogenic mutations__) +* Markers reported at the exon level (e.g. __KIT exon 11 mutation__, __EGFR exon 19 deletion__) +* Markers reported at the gene level (e.g. __BRAF mutation__, __TP53 loss-of-function mutation__, __BRCA1 oncogenic mutation__) * Within the [Cancer bioMarkers database (CGI)](https://www.cancergenomeinterpreter.org/biomarkers), only markers collected from FDA/NCCN guidelines, scientific literature, and clinical trials are included (markers collected from conference abstracts etc. 
are not included) * Copy number gains/losses diff --git a/scripts/cpsr_validate_input.py b/scripts/cpsr_validate_input.py index 1b4d8b37..62aebd2c 100755 --- a/scripts/cpsr_validate_input.py +++ b/scripts/cpsr_validate_input.py @@ -180,9 +180,11 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_assembly_dir cmd_vcf1 = f'bcftools view {input_vcf} | bgzip -cf > {temp_files["vcf_2"]} && tabix -p vcf {temp_files["vcf_2"]} && ' + \ f'bcftools sort --temp-dir {output_dir} -Oz {temp_files["vcf_2"]} > {temp_files["vcf_3"]} 2> {bcftools_simplify_log}' + \ f' && tabix -p vcf {temp_files["vcf_3"]}' - logger.info('Extracting variants on autosomal/sex/mito chromosomes only (1-22,X,Y, M/MT) with bcftools') - # Keep only autosomal/sex/mito chrom (handle hg38 and hg19), sub chr prefix - chrom_to_keep = [str(x) for x in [*range(1,23), 'X', 'Y', 'M', 'MT']] + logger.info('Extracting variants on autosomal/sex/mito chromosomes only (1-22,X,Y) with bcftools') + # Keep only autosomal/sex chromosomes, sub chr prefix + # Note: M/MT variants are skipped - requires additional cache/handling from VEP, + # see e.g. 
https://github.com/Ensembl/ensembl-vep/issues/464 + chrom_to_keep = [str(x) for x in [*range(1,23), 'X', 'Y',]] chrom_to_keep = ','.join([*['chr' + chrom for chrom in chrom_to_keep], *[chrom for chrom in chrom_to_keep]]) cmd_vcf2 = f'bcftools view --regions {chrom_to_keep} {temp_files["vcf_3"]} | sed \'s/^chr//\' > {temp_files["vcf_1"]}' @@ -198,6 +200,7 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_assembly_dir command_decompose = f'vt decompose -s {temp_files["vcf_1"]} > {temp_files["vcf_4"]} 2> {vt_decompose_log}' check_subprocess(logger, command_decompose, debug) else: + logger.info('All sites seem to be decomposed - skipping decomposition of multiallelic sites') command_copy = f'cp {temp_files["vcf_1"]} {temp_files["vcf_4"]}' check_subprocess(logger, command_copy, debug) diff --git a/scripts/pcgr_validate_input.py b/scripts/pcgr_validate_input.py index 153d6435..1d2e56e6 100755 --- a/scripts/pcgr_validate_input.py +++ b/scripts/pcgr_validate_input.py @@ -192,13 +192,15 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, output_dir, sample_id, keep_unco variant_id = f"{rec.CHROM}:{POS}_{rec.REF}->{alt}" multiallelic_list.append(variant_id) - logger.info('Extracting variants on autosomal/sex/mito chromosomes only (1-22,X,Y, M/MT) with bcftools') + logger.info('Extracting variants on autosomal/sex chromosomes only (1-22,X,Y) with bcftools') # bgzip + tabix required for sorting cmd_vcf1 = f'bcftools view {input_vcf} | bgzip -cf > {temp_files["vcf_2"]} && tabix -p vcf {temp_files["vcf_2"]} && ' + \ f'bcftools sort --temp-dir {output_dir} -Oz {temp_files["vcf_2"]} > {temp_files["vcf_3"]} 2> {bcftools_simplify_log} && ' + \ f'tabix -p vcf {temp_files["vcf_3"]}' - # Keep only autosomal/sex/mito chrom (handle hg38 and hg19), remove FORMAT metadata lines, keep cols 1-8, sub chr prefix - chrom_to_keep = [str(x) for x in [*range(1,23), 'X', 'Y', 'M', 'MT']] + # Keep only autosomal/sex chrom, remove FORMAT metadata lines, keep cols 1-8, sub chr 
prefix + # Note: any M/MT variants listed in input are skipped - requires additional cache/handling from VEP, + # see e.g. https://github.com/Ensembl/ensembl-vep/issues/464 + chrom_to_keep = [str(x) for x in [*range(1,23), 'X', 'Y']] chrom_to_keep = ','.join([*['chr' + chrom for chrom in chrom_to_keep], *[chrom for chrom in chrom_to_keep]]) cmd_vcf2 = f'bcftools view --regions {chrom_to_keep} {temp_files["vcf_3"]} | egrep -v \'^##FORMAT=\' ' + \ f'| cut -f1-8 | sed \'s/^chr//\' > {temp_files["vcf_1"]}' @@ -215,7 +217,7 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, output_dir, sample_id, keep_unco command_decompose = f'vt decompose -s {temp_files["vcf_1"]} > {validated_vcf} 2> {vt_decompose_log}' check_subprocess(logger, command_decompose, debug) else: - logger.info('All sites seem to be decomposed - skipping decomposition!') + logger.info('All sites seem to be decomposed - skipping decomposition of multiallelic sites') check_subprocess(logger, f'cp {temp_files["vcf_1"]} {validated_vcf}', debug) # need to keep uncompressed copy for vcf2maf.pl if selected @@ -230,8 +232,17 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, output_dir, sample_id, keep_unco i = i + 1 if len(vcf.seqnames) == 0 or i == 0: logger.info('') - logger.info("Input VCF contains NO valid variants after VCF cleaning - quitting workflow") + logger.info("Input VCF contains NO valid variants on autosomal/sex chromosomes after VCF cleaning - quitting workflow") logger.info('') + + if not debug: + remove_file(temp_files["vcf_1"]) + remove_file(temp_files["vcf_2"]) + remove_file(temp_files["vcf_3"]) + remove_file(temp_files["vcf_2"] + str('.tbi')) + remove_file(temp_files["vcf_3"] + str('.tbi')) + remove_file(bcftools_simplify_log) + remove_file(vt_decompose_log) exit(1) if not debug: