From feaaea731adaab78265967123dde76d1c7fe7a8c Mon Sep 17 00:00:00 2001 From: Ahhgust Date: Thu, 27 Apr 2023 13:06:31 -0500 Subject: [PATCH] Now supports extracting gencall scores (GCALL) as real numbers --- FormatFactory.py | 7 +++++ GencallFormatReal.py | 64 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 GencallFormatReal.py diff --git a/FormatFactory.py b/FormatFactory.py index c302924..1eb4cdc 100644 --- a/FormatFactory.py +++ b/FormatFactory.py @@ -1,5 +1,6 @@ from GenotypeFormat import GenotypeFormat from GencallFormat import GencallFormat +from GencallFormatReal import GencallFormatReal from BAlleleFreqFormat import BAlleleFreqFormat from LogRRatioFormat import LogRRatioFormat @@ -31,6 +32,8 @@ def __init__(self, no_samples, formats_to_include, logger): self._format_classes.append(GenotypeFormat) if GencallFormat.get_id() in formats_to_include: self._format_classes.append(GencallFormat) + if GencallFormatReal.get_id() in formats_to_include: + self._format_classes.append(GencallFormatReal) if BAlleleFreqFormat.get_id() in formats_to_include: self._format_classes.append(BAlleleFreqFormat) if LogRRatioFormat.get_id() in formats_to_include: @@ -40,6 +43,7 @@ def __init__(self, no_samples, formats_to_include, logger): def get_possible_formats(): valid_classes = [GenotypeFormat.get_id(), GencallFormat.get_id(), + GencallFormatReal.get_id(), BAlleleFreqFormat.get_id(), LogRRatioFormat.get_id()] return valid_classes @@ -98,6 +102,9 @@ def create_formats(self, gtc): if GencallFormat.get_id() in self._formats_to_include: result.append(GencallFormat( self._logger, gtc.get_genotype_scores())) + if GencallFormatReal.get_id() in self._formats_to_include: + result.append(GencallFormatReal( + self._logger, gtc.get_genotype_scores())) if BAlleleFreqFormat.get_id() in self._formats_to_include: result.append(BAlleleFreqFormat( self._logger, gtc.get_ballele_freqs())) diff --git a/GencallFormatReal.py b/GencallFormatReal.py new file mode 100644 index 0000000..558ace5 --- /dev/null +++ b/GencallFormatReal.py @@ -0,0 +1,64 @@ +from vcf.parser import _Format + +# Edited by August Woerner; borrowed (heavily) from GencallFormat.py + +class GencallFormatReal(object): + """ + Generate GQ format information for VCF + """ + + def __init__(self, logger, gencall_scores): + """ + Create new GencallFormatReal object + + Args: + logger (logging.Logger): Logging object for errors/warnings + gencall_scores (list(float)): List of all gencall scores for all loci across sample + + Returns + GencallFormatReal + """ + self._logger = logger + self._gencall_scores = gencall_scores + + @staticmethod + def get_id(): + return "GCALL" + + @staticmethod + def get_description(): + return "GenCall score as a real number. For merged multi-locus entries, max(GenCall) score is reported." + + @staticmethod + def get_format_obj(): + return _Format( + GencallFormatReal.get_id(), 1, "Float", GencallFormatReal.get_description()) + + def _get_max_gencall_score(self, bpm_records): + """ + Look up gencall scores for entries defined by bpm_records and report + the minimum. + + Args: + bpm_records iter(BPMRecord) : Iterable of BPM records + + Returns + float : MAX gen call score + """ + return max(self._gencall_scores[record.index_num] for record in bpm_records) + + def generate_sample_format_info(self, bpm_records, vcf_record, sample_name): + """ + Returns sample format info (GQ score) + + Args: + bpm_records (iter(BPMRecord)) : BPM records + vcf_record (vcf._Record): VCF record + sample_name (string): Name of sample + + Returns: + float: MAX GenCall score, as a real number [0,1]. + """ + gencall_score = self._get_max_gencall_score(bpm_records) + #phred_quality_score = int(round(-10*log10(min(1.0, max(0.00001, 1 - min_gencall_score))))) + return gencall_score