diff --git a/README.rst b/README.rst index 6ca0a38..fd83760 100644 --- a/README.rst +++ b/README.rst @@ -38,60 +38,60 @@ Basic tombo installation (python 2.7 and 3.4+ support) Quick Start =========== -Re-squiggle raw nanopore read files and call 5mC and 6mA sites. +This quick start guides the steps to perform some common modified base detection analyses using the Tombo command line interface. -Then, for 5mA calls, output genome browser `wiggle format file `_ and, for 6mA calls, plot raw signal around most significant locations. +The first step in any Tombo analysis is to re-squiggle (raw signal to reference sequence alignment) raw nanopore reads. This creates an index and stores the information necessary to perform downstream analyses. -:: +In this example, an E. coli sample is tested for dam and dcm methylation (CpG model also available for human analysis). Using these results, raw signal is plotted at the most significantly modified dcm positions and the dam modified base predictions are output to a `wiggle `_ file for use in downstream processing or visualization in a genome browser. 
- # skip this step if FAST5 files already contain basecalls - tombo preprocess annotate_raw_with_fastqs --fast5-basedir path/to/fast5s/ \ - --fastq-filenames basecalls1.fastq basecalls2.fastq \ - --sequencing-summary-filenames seq_summary1.txt seq_summary2.txt \ - --processes 4 +:: tombo resquiggle path/to/fast5s/ genome.fasta --processes 4 --num-most-common-errors 5 tombo detect_modifications alternative_model --fast5-basedirs path/to/fast5s/ \ - --statistics-file-basename sample.alt_modified_base_detection \ - --per-read-statistics-basename sample.alt_modified_base_detection \ - --alternate-bases 5mC 6mA --processes 4 - - # produces "estimated fraction of modified reads" genome browser files - # for 5mC testing - tombo text_output browser_files --statistics-filename sample.alt_modified_base_detection.5mC.tombo.stats \ - --file-types dampened_fraction --browser-file-basename sample.alt_modified_base_detection.5mC - # and 6mA testing (along with coverage bedgraphs) - tombo text_output browser_files --statistics-filename sample.alt_modified_base_detection.6mA.tombo.stats \ - --fast5-basedirs path/to/fast5s/ --file-types dampened_fraction coverage \ - --browser-file-basename sample.alt_modified_base_detection.6mA - - # plot raw signal at most significant 6mA locations + --statistics-file-basename native.e_coli_sample \ + --alternate-bases dam dcm --processes 4 + + # plot raw signal at most significant dcm locations tombo plot most_significant --fast5-basedirs path/to/fast5s/ \ - --statistics-filename sample.alt_modified_base_detection.6mA.tombo.stats \ - --plot-standard-model --plot-alternate-model 6mA \ - --pdf-filename sample.most_significant_6mA_sites.pdf + --statistics-filename native.e_coli_sample.dcm.tombo.stats \ + --plot-standard-model --plot-alternate-model dcm \ + --pdf-filename sample.most_significant_dcm_sites.pdf + + # produces wig file with estimated fraction of modified reads at each valid reference site + tombo text_output browser_files 
--statistics-filename native.e_coli_sample.dam.tombo.stats \ + --file-types dampened_fraction --browser-file-basename native.e_coli_sample.dam + # also produce successfully processed reads coverage file for reference + tombo text_output browser_files --fast5-basedirs path/to/fast5s/ \ + --file-types coverage --browser-file-basename native.e_coli_sample -Detect any deviations from expected signal levels for canonical bases to investigate any type of modification. +While motif models (``CpG``, ``dcm`` and ``dam``; most accurate) and all-context specific alternate base models (``5mC`` and ``6mA``; more accurate) are preferred, Tombo also allows users to investigate other or even unknown base modifications. + +Here are two example commands running the ``de_novo`` method (detect deviations from expected canonical signal levels) and the ``level_sample_compare`` method (detect deviation in signal levels between two samples of interest; works best with high coverage). :: - tombo resquiggle path/to/fast5s/ genome.fasta --processes 4 --num-most-common-errors 5 tombo detect_modifications de_novo --fast5-basedirs path/to/fast5s/ \ - --statistics-file-basename sample.de_novo_modified_base_detection \ - --per-read-statistics-basename sample.de_novo_modified_base_detection \ - --processes 4 + --statistics-file-basename sample.de_novo_detect --processes 4 + tombo text_output browser_files --statistics-filename sample.de_novo_detect.tombo.stats \ + --browser-file-basename sample.de_novo_detect --file-types dampened_fraction + + tombo detect_modifications level_sample_compare --fast5-basedirs path/to/fast5s/ \ + --control-fast5-basedirs path/to/control/fast5s/ --minimum-test-reads 50 \ + --processes 4 --statistics-file-basename sample.level_samp_comp_detect + tombo text_output browser_files --statistics-filename sample.level_samp_comp_detect.tombo.stats \ + --browser-file-basename sample.level_samp_comp_detect --file-types statistic + +.. 
- # produces "estimated fraction of modified reads" genome browser files from de novo testing - tombo text_output browser_files --statistics-filename sample.de_novo_modified_base_detection.tombo.stats \ - --browser-file-basename sample.de_novo_modified_base_detection --file-types dampened_fraction + See more complete tutorials on the `documentation page `_. === RNA === -All Tombo commands work for RNA data as well, but a transcriptome reference sequence must be provided for spliced transcripts. +All Tombo commands work for direct RNA nanopore reads as well, but a transcriptome reference sequence must be provided for spliced transcripts. -The reasons for this decision and other tips for processing RNA data within the Tombo framework can be found in the `RNA section `_ of the detailed Tombo documentation. +Tips for processing direct RNA reads within the Tombo framework can be found in the `RNA section `_ of the detailed Tombo documentation. ===================== Further Documentation @@ -99,7 +99,7 @@ Further Documentation Run ``tombo -h`` to see all Tombo command groups and run ``tombo [command-group] -h`` to see all commands within each group. -Detailed documentation for all Tombo commands and algorithms can be found at https://nanoporetech.github.io/tombo/ +Detailed documentation for all Tombo commands and algorithms can be found on the `tombo documentation page `_. 
======== Citation diff --git a/docs/_images/dampened_fraction.png b/docs/_images/dampened_fraction.png index f61d379..cc04a9e 100644 Binary files a/docs/_images/dampened_fraction.png and b/docs/_images/dampened_fraction.png differ diff --git a/docs/_images/stat_dist.png b/docs/_images/stat_dist.png index 2d77e7d..aad8de6 100644 Binary files a/docs/_images/stat_dist.png and b/docs/_images/stat_dist.png differ diff --git a/docs/_images/stat_dist_null.png b/docs/_images/stat_dist_null.png new file mode 100644 index 0000000..749f7a9 Binary files /dev/null and b/docs/_images/stat_dist_null.png differ diff --git a/docs/examples.rst b/docs/examples.rst index c48f69b..d7e76d4 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -2,17 +2,17 @@ Tombo Commands ************** -Below are minimal use case examples. For more detail on each commands' options and further algorithm details, please see the corresponding documentation sections. +This page contains brief descriptions of the most common Tombo commands. For more detail on each command's options and further algorithm details, please see the corresponding documentation sections. ------------------------------------------ Re-squiggle (Raw Signal Genomic Alignment) ------------------------------------------ -The re-squiggle algorithm aligns raw signal (electric current nanopore measurements) to genomic/transcriptomic sequence. +The re-squiggle algorithm aligns raw signal (electric current nanopore measurements) to reference genomic or transcriptomic sequence. -One of the major assumptions of the re-squiggle algorithm is that the provided reference sequence is correct. Thus for poorly assembled genomes or divergent samples, an assembly polishing step (possibly from the same data/sample) may improve results. +One of the major assumptions of the re-squiggle algorithm is that the provided reference sequence is correct. 
Thus for a poorly assembled reference or divergent sample, an assembly polishing step (possibly from the same data/sample) may improve results. -The ``resquiggle`` command will add infomation (the mapped genomic location and the raw signal to sequence assignment) to the read files provided (in FAST5 format), as well as producing an index file for more efficient file access in downstream commands. +The ``resquiggle`` command will add information (the mapped reference location and the raw signal to sequence assignment) to the read files provided (in FAST5 format), as well as producing an index file for more efficient file access in downstream commands. .. important:: @@ -34,7 +34,7 @@ For more details see the :doc:`re-squiggle documentation `. Modified Base Detection ----------------------- -Tombo provides three methods for the investigation of modified bases (within the ``detect_modifications`` command group). Each method has different advantages and requirements. +Tombo provides four methods (including two types of sample comparison) for the investigation of modified bases (within the ``tombo detect_modifications`` command group). Each method has different advantages and requirements. ---- @@ -45,76 +45,45 @@ Tombo provides three methods for the investigation of modified bases (within the ---- -All modified base detection methods poduce per-read, per-genomic position test statistics (which can be saved via the ``--per-read-statistics-basename`` option). A threshold is then applied to these statistics to produce an estimate for the fraction of reads that appear modified at each genomic location. +All modified base detection methods, except the ``level_sample_compare`` method, produce per-read, per-reference position test statistics (which can be saved via the ``--per-read-statistics-basename`` option). A threshold is then applied to these statistics to produce an estimate for the fraction of reads that appear modified at each reference location. 1. 
**Specific alternative base detection (recommended)** - Run using ``tombo detect_modifications alternative_model`` command. - - This method identifies signal that deviates from the canonical base expected signal level while matching a specific alternative base expected signal level. - - This method produces a statistic similar to a log likelihood ratio, but scaled to be more robust to outlier signal assignments (similar to `Tukey's biweight function `_). - - Alternative DNA models are currently available for 5-methylcytosine (5mC) and N6-methyladenosine (6mA) in all sequence contexts. - - An alternative RNA model is available for 5mC. + - This method identifies sites where signal matches a specific alternative base expected signal levels better than the canonical expected levels producing a statistic similar to a log likelihood ratio. + - All-context alternative DNA models are currently available for 5-methylcytosine (5mC) and N6-methyladenosine (6mA; not currently available for RNA). + - More accurate motif specific models are available for dam and dcm methylation (found in E. coli) and CpG methylation (found in human samples). 2. **De novo canonical model comparison** - Run using ``tombo detect_modifications de_novo`` command. - - This method compares re-squiggled signal to the default canonical model. - - While this method may produce significant false positive and negative results per-read, it produces the best results for many statistical measures per-genomic location (fraction of modified bases across a set of reads). + - This method identifies sites where signal deviates from the expected canonical signal levels. + - While this method has the highest error rates, it can be used effectively on any sample and is particularly useful for motif discovery for motif-specific modifications (e.g. bacterial samples). -3. **Canonical (control) sample comparison** +3. **Sample comparison** - - Run using ``tombo detect_modifications sample_compare`` command. 
- - This method performs a hypothesis test against the distribution estimated from the control sample at each base. - - This method requires the production of a second set of reads containing only the 4 canonical bases (e.g PCR for DNA of IVT for RNA). + - These two methods require the production of a second set of reads for comparison. -.. - - Both the control sample comparison and the *de novo* methods may not identify the exact modified base location (as the shifted signal does not always center on a modified base) and gives no information as to the identity of a modified base. - -The result of all ``detect_modifications`` calls will be a binary statistics file(s), which can be passed to other Tombo commands. - -For more details see the :doc:`modified base detection documentation `. - -Specific Alternative Base Method -================================ - -In order to specifically detect 5mC and 6mA, use the ``detect_modifications alternative_model`` command. - -This will compute a statistic similar to a log likelihood ratio using the default canonical model and the 5mC and 6mA alternative DNA models provided with Tombo. + a. ``level_sample_compare`` -This is the perferred method for modified base detection if a model is available for your biological modification of interest, as it identifies the exact location of the modified base and reduces false positives for spurious shifts in signal. + - Run using ``tombo detect_modifications level_sample_compare`` command. + - This method performs either a ks-test (default), u-test or t-test to identify sites where signal levels deviate between two samples. + - This method is ideal for high coverage samples (in order to accurately estimate the effect size measures) and comparison of two potentially non-canonical samples (e.g. a KO experiment). -.. 
code-block:: bash - - tombo detect_modifications alternative_model --fast5-basedirs \ - --alternate-bases 5mC 6mA --statistics-file-basename sample_alt_model - -*De novo* Non-canonical Base Method -=================================== - -In order to perform *de novo* non-canonical base detection, use the ``detect_modifications de_novo`` command. - -This will perform a hypothesis test against the default canonical base model provided with Tombo. Note that this method is quite error prone and may result in a high false positive rate on a per-read basis, but may be of use in a research and development setting. This method also has the lowest requirement, consisting of only a set of reads potentially containing modifications and a reference sequence. - -.. code-block:: bash + b. ``model_sample_compare`` - tombo detect_modifications de_novo --fast5-basedirs \ - --statistics-file-basename sample_de_novo_detection + - Run using ``tombo detect_modifications model_sample_compare`` command. + - This uses a canonical control sample to adjust the expected signal levels due to un-modeled effects. + - This adjusted model is then used to test for modifications as in the de novo method. + - This was the ``sample_compare`` method prior to version 1.5. -Canonical Sample Comparison Method -================================== - -In order to execute the canonical sample comparison method, use the ``detect_modifications sample_compare`` command. - -This will perform a hypothesis test against the signal level observed from the control sample (provided via ``--control-fast5-basedirs`` option) at each genomic position. This method (like the ``de_novo`` method) does not always identify the exact modified base position. +.. -As of version 1.4, this method uses the canonical base model as a prior for control sample distribution estimation drastically improving results, particularly for low coverage samples. To test only against the canonical sample use the ``--sample-only-estimates`` flag. 
The prior weights for the estimated mean and standard deviation can be set using the ``--model-prior-weights`` option. + Both the sample comparison and the *de novo* methods may not identify the exact modified base location (as the shifted signal does not always center on a modified base) and give no information as to the identity of a modified base. -.. code-block:: bash +The result of all ``tombo detect_modifications`` calls will be a binary statistics file(s), which can be passed to other Tombo commands. - tombo detect_modifications sample_compare --fast5-basedirs \ - --control-fast5-basedirs \ - --statistics-file-basename sample_canonical_compare +For more details see the :doc:`modified base detection documentation `. ----------- Text Output @@ -123,24 +92,27 @@ Text Output Genome Browser File Output ========================== -In order to output the results of re-squiggling and statistical testing in a genome browser compatible format (either `wiggle format `_ or `bedgraph format `_), the ``tombo text_output genome_browser`` command is provided. +In order to output re-squiggle and/or modified base detection results in a genome browser compatible format (either `wiggle format `_ or `bedgraph format `_), the ``tombo text_output browser_files`` command is provided. .. code-block:: bash tombo text_output browser_files --fast5-basedirs \ - --statistics-filename sample_alt_model.5mC.tombo.stats \ - --browser-file-basename sample_alt_model --file-types dampened_fraction coverage + --statistics-filename sample_alt_model.CpG.tombo.stats \ + --browser-file-basename sample_alt_model --file-types dampened_fraction coverage .. hint:: - Other ``--file-types`` available are ``fraction``, ``valid_coverage``, ``signal``, ``signal_sd``, ``dwell`` and ``difference``. 
+ All ``--file-types`` available are: - The ``dampened_fraction`` option adds psuedo-counts to the detected number of un-modified and modified reads at each tested location (as specified by the ``--coverage-dampen-counts`` option), while the ``fraction`` option returns the raw fraction of modified reads at any genomic site from ``detect_modifications`` results. The ``dampen_fraction`` output is intended to allow the inclusion of low coverage regions in downstream analysis without causing potentially false site at the top of rank lists. Visualize different values of the ``--coverage-dampen-counts`` option with the included ``scripts/test_beta_priors.R`` script. + - ``fraction``, ``dampened_fraction``, and ``valid_coverage`` derived from a (non-``level_sample_compare``) statistics file + - ``statistic`` derived from a ``level_sample_compare`` statistics files + - ``coverage`` derived from the Tombo index (fast) + - ``signal``, ``signal_sd``, ``dwell``, and ``difference`` derived from read FAST5 files (slow) -Genome Sequence Output -====================== +Reference Sequence Output +========================= -For modified base analysis pipelines (e.g. motif detection), it may be useful to output the genomic sequence surrounding locations with the largest fraction of modified reads. The ``text_output signif_sequence_context`` command is provided for this purpose. +For modified base analysis pipelines (e.g. motif detection), it may be useful to output the reference sequence surrounding the most likely modified sites. The ``text_output signif_sequence_context`` command is provided for this purpose. .. code-block:: bash @@ -159,11 +131,11 @@ For more details see the :doc:`text output documentation `. Plotting Commands ----------------- -Tombo provides many plotting functions for the visualization of modified bases and raw nanopore signal in general. +Tombo provides many plotting functions for the visualization of modified bases within raw nanopore signal. 
-Most plotting commands are genome-anchored. That is the raw signal is plotted as the re-squiggle algorithm has assigned it to the genome. Thus each read may contain a different number of raw observations assigned to each genomic base. For regions with higher coverage, several over-plotting options are available. For those options producing a distribution, these are taken over each reads average signal assigned to a base. This requires extraction of these levels from all relevant FAST5 files and thus can be slow for very deep coverage regions. +Most plotting commands are reference-anchored. That is, the normalized raw signal is plotted as the re-squiggle algorithm has assigned it to the reference sequence. -Each genome anchored plotting command allows for the selection of genomic positions based on generally applicable criterion. +Each reference-anchored plotting command allows for the selection of reference positions based on generally applicable criteria. .. code-block:: bash @@ -176,6 +148,10 @@ Each genome anchored plotting command allows for the selection of genomic positi --genome-locations chromosome:1000 chromosome:2000:- \ --genome-fasta genome.fasta +.. note:: + + For regions with higher coverage, several over-plotting options are available. For those options producing a distribution, these are taken over each read's average signal assigned to a base. This requires extraction of these levels from all relevant FAST5 files and thus can be slow for very deep coverage regions. + For more details see the :doc:`plotting documentation `. -------------- @@ -194,14 +170,4 @@ Read filtering commands can be useful to extract the most out out of a set of re tombo filter raw_signal_matching --fast5-basedirs path/to/native/rna/fast5s/ \ --signal-matching-score 1.0 -.. hint:: - - Hint: Save a set of filters for later use by copying the Tombo index file: ``cp path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index save.native.tombo.index``. 
To re-set to a set of saved filters after applying further filters simply replace the index file: ``cp save.native.tombo.index path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index``. - -.. - - For more read filtering commands see the `Tombo filter documentation here `_. - -.. tip:: - - For additional command details, see the specific commands documentation section. +For more details see the :doc:`filter documentation `. diff --git a/docs/filtering.rst b/docs/filtering.rst index 5fd2592..96e0ca8 100644 --- a/docs/filtering.rst +++ b/docs/filtering.rst @@ -4,6 +4,10 @@ Read Filtering Commands Read filtering commands can be useful to extract the most out out of a set of reads for modified base detection. Read filtering commands effect only the Tombo index file, and so filters can be cleared or applied iteratively without re-running the re-squiggle command. Five filters are currently made available (``genome_locations``, ``raw_signal_matching``, ``q_score``, ``level_coverage`` and ``stuck``). +.. hint:: + + Hint: Save a set of filters for later use by copying the Tombo index file: ``cp path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index save.native.tombo.index``. To re-set to a set of saved filters after applying further filters simply replace the index file: ``cp save.native.tombo.index path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index``. + --------------------------------- ``tombo filter genome_locations`` --------------------------------- diff --git a/docs/index.rst b/docs/index.rst index 77cc6b0..9194355 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,62 +29,58 @@ Basic tombo installation (python 2.7 and 3.4+ support) pip install numpy pip install ont-tombo[full] -See :doc:`examples` for common workflows. +See :doc:`tutorials` for common workflows. =========== Quick Start =========== -Re-squiggle raw nanopore read files and call 5mC and 6mA sites. 
+This **quick start** guides the steps to perform some common modified base detection analyses using the Tombo command line interface. -Then, for 5mA calls, output genome browser `wiggle format file `_ and, for 6mA calls, plot raw signal around most significant locations. +The first step in any Tombo analysis is to re-squiggle (raw signal to reference sequence alignment) raw nanopore reads. This creates an index and stores the information necessary to perform downstream analyses. -:: +In this example, an E. coli sample is tested for dam and dcm methylation (present in lab E. coli; CpG model also available for human analysis). Using these results, raw signal is plotted at the most significantly modified dcm positions and the dam results are output to a `wiggle `_ file for use in downstream processing or visualization in a genome browser. - # skip this step if FAST5 files already contain basecalls - tombo preprocess annotate_raw_with_fastqs --fast5-basedir path/to/fast5s/ \ - --fastq-filenames basecalls1.fastq basecalls2.fastq \ - --sequencing-summary-filenames seq_summary1.txt seq_summary2.txt \ - --processes 4 +:: - tombo resquiggle path/to/fast5s/ genome.fasta --processes 4 + tombo resquiggle path/to/fast5s/ genome.fasta --processes 4 --num-most-common-errors 5 tombo detect_modifications alternative_model --fast5-basedirs path/to/fast5s/ \ - --statistics-file-basename sample.alt_modified_base_detection \ - --per-read-statistics-basename sample.alt_modified_base_detection \ - --alternate-bases 5mC 6mA --processes 4 + --statistics-file-basename native.e_coli_sample \ + --alternate-bases dam dcm --processes 4 - # produces "estimated fraction of modified reads" genome browser files - # for 5mC testing - tombo text_output browser_files --statistics-filename sample.alt_modified_base_detection.5mC.tombo.stats \ - --file-types dampened_fraction --browser-file-basename sample.alt_modified_base_detection.5mC - # and 6mA testing (along with coverage bedgraphs) - tombo 
text_output browser_files --statistics-filename sample.alt_modified_base_detection.6mA.tombo.stats \ - --fast5-basedirs path/to/fast5s/ --file-types dampened_fraction coverage\ - --browser-file-basename sample.alt_modified_base_detection.6mA - - # plot raw signal at most significant 6mA locations + # plot raw signal at most significant dcm locations tombo plot most_significant --fast5-basedirs path/to/fast5s/ \ - --statistics-filename sample.alt_modified_base_detection.6mA.tombo.stats \ - --plot-standard-model --plot-alternate-model 6mA \ - --pdf-filename sample.most_significant_6mA_sites.pdf + --statistics-filename native.e_coli_sample.dcm.tombo.stats \ + --plot-standard-model --plot-alternate-model dcm \ + --pdf-filename sample.most_significant_dcm_sites.pdf + + # produces "estimated fraction of modified reads" genome browser files + tombo text_output browser_files --statistics-filename native.e_coli_sample.dam.tombo.stats \ + --file-types dampened_fraction --browser-file-basename native.e_coli_sample.dam + # also produce successfully processed reads coverage file for reference + tombo text_output browser_files --fast5-basedirs path/to/fast5s/ \ + --file-types coverage --browser-file-basename native.e_coli_sample + +While motif models (``CpG``, ``dcm`` and ``dam``; most accurate) and all-context specific alternate base models (``5mC`` and ``6mA``; more accurate) are preferred, Tombo also allows users to investigate other or even unknown base modifications. -Detect any deviations from expected signal levels for canonical bases to investigate any type of modification. +Here are two example commands running the ``de_novo`` method (detect deviations from expected canonical base signal levels) and the ``level_sample_compare`` method (detect deviation in signal levels between two samples of interest; works best with high (>50X) coverage). 
:: - tombo resquiggle path/to/fast5s/ genome.fasta --processes 4 tombo detect_modifications de_novo --fast5-basedirs path/to/fast5s/ \ - --statistics-file-basename sample.de_novo_modified_base_detection \ - --per-read-statistics-basename sample.de_novo_modified_base_detection \ - --processes 4 + --statistics-file-basename sample.de_novo_detect --processes 4 + tombo text_output browser_files --statistics-filename sample.de_novo_detect.tombo.stats \ + --browser-file-basename sample.de_novo_detect --file-types dampened_fraction - # produces sample.de_novo_modified_base_detection.dampened_fraction.[plus|minus].wig files - tombo text_output browser_files --statistics-filename sample.de_novo_modified_base_detection.tombo.stats \ - --browser-file-basename sample.de_novo_modified_base_detection --file-types dampened_fraction + tombo detect_modifications level_sample_compare --fast5-basedirs path/to/fast5s/ \ + --control-fast5-basedirs path/to/control/fast5s/ --minimum-test-reads 50 \ + --processes 4 --statistics-file-basename sample.level_samp_comp_detect + tombo text_output browser_files --statistics-filename sample.level_samp_comp_detect.tombo.stats \ + --browser-file-basename sample.level_samp_comp_detect --file-types statistic .. note:: - All of these commands work for RNA data as well, but a transcriptome reference sequence must be provided for spliced transcripts. + All Tombo commands work for direct RNA nanopore reads as well, but a transcriptome reference sequence must be provided for spliced transcripts. Run ``tombo -h`` to see all Tombo command groups, run ``tombo [command-group] -h`` to see all commands within each group and run ``tombo [command-group] [comand] -h`` for help with arguments to each Tombo command. @@ -94,7 +90,7 @@ Detect any deviations from expected signal levels for canonical bases to investi Naming ------ -Tombo Ahi is a Japanese name for albacore (the name of the Oxford Nanopore Technologies basecaller). 
So use albacore to identify canonical bases and then use Tombo to detect more exotic, non-canonical bases. -------- Contents @@ -103,6 +99,7 @@ Contents .. toctree:: :maxdepth: 2 + tutorials examples resquiggle modified_base_detection diff --git a/docs/model_training.rst b/docs/model_training.rst index 4ce1b4a..6e21342 100644 --- a/docs/model_training.rst +++ b/docs/model_training.rst @@ -2,11 +2,11 @@ Model Training (Advanced Users Only) ************************************ -Model training is made available via several Tombo commands, but should be used with care as these methods can be very sensetive to the samples used. Commands relevant to model training are found within the ``tombo build_model`` command group. The commands are ``estimate_reference`` for estimating a canonical bases model, ``estimate_alt_reference`` for estimation of a non-canonical alternative base model, and ``event_resquiggle`` for re-squiggling reads without a model (requires event-based basecaller results). +Model training is made available via several Tombo commands, but should be used with care as these methods can be very sensitive to the samples used. Commands relevant to model training are found within the ``tombo build_model`` command group. The commands are ``estimate_reference`` for estimating a canonical bases model, ``estimate_alt_reference`` for estimation of a non-canonical alternative base model, ``estimate_motif_alt_reference`` for estimation of a more accurate motif-specific non-canonical alternate base model, and ``event_resquiggle`` for re-squiggling reads without a model (requires event-based base caller results). .. 
note:: - Model training produces a binary Tombo model file similar to those included in the Tombo software (found in the code repository here ``tombo/tombo_models``). User-created strandard Tombo models can be used in re-squiggling, modified base detection and plotting commands using the advanced ``--tombo-model-filename`` option. This option is generally for advanced users training their own models, so this option is not shown in the command line help documentation. Similarly user-created alternative models can be passed to plotting commands via the hidden ``--alternate-model-filename`` option and passed to any ``tombo detect_modifications`` command via the advanced ``--alternate-model-filenames`` option. + Model training produces a binary Tombo model file (as those packaged with Tombo found in the code repository in ``tombo/tombo_models``). User-created Tombo models can be used in re-squiggling, modified base detection and plotting commands using the advanced ``--tombo-model-filename`` option. This option is generally for advanced users training their own models, so this option is not shown in the command line help documentation, but is available for all applicable commands. Similarly user-created alternative models can be passed to plotting commands via the hidden ``--alternate-model-filename`` option and passed to applicable commands via the advanced ``--alternate-model-filenames`` option. ======================================== ``tombo build_model estimate_reference`` @@ -16,9 +16,9 @@ The ``tombo build_model estimate_reference`` command is provided to estimate a T To estimate a canonical model, first genomic base levels are parsed from reads as assigned by a re-squiggle command (either ``tombo build_model event_resquiggle`` or ``tombo resquiggle`` processed reads are acceptable) and grouped by their genomic base assignment. By default, the median and standard deviation of the current level over reads covering each genomic position is computed. 
The ``--estimate-mean`` option will trigger this to be computed as a mean instead, though this can be sensetive to outlier read signal assignment and is thus not recommended. -All genomic current levels are then grouped based on the genomic k-mer sequence at that location. This k-mer is defined by the ``--upstream-bases`` and ``--downstream-bases`` options. Note that the modeled k-mer will be one longer than the sum of these two options as the k-mer includes the *dominant*, central position as well. The central position generally has the strongest correlation with the current signal level as can be seen with the ``plot kmer`` command. +All genomic current levels are then grouped based on the reference sequence local k-mer. The k-mer is defined by the ``--upstream-bases`` and ``--downstream-bases`` options. Note that the modeled k-mer will be one longer than the sum of these two options as the k-mer includes the *dominant*, central position as well. The central position generally has the strongest correlation with the current signal level as can be seen with the ``plot kmer`` command. -The reference signal level and spread for each k-mer are then estimated by taking the median of the signal level and mean of the standard deviation over all observations of each k-mer across the genome. By default, a single global standard deviation is taken as the median over all k-mers. The ``--kmer-specific-sd`` option is provided in order to estimate a seperate standard deviation for each k-mer, but is not recommended as this can have deleterious effects on Tombo analyses. In particular, k-mer specific standard deviation estimates can produce poor re-squiggle results due to signal being "packed" into high SD k-mers. +The reference signal level and spread for each k-mer are then estimated by taking the median of the signal level and mean of the standard deviation over all observations of each k-mer across the genome. 
By default, a single global standard deviation is taken as the median over all k-mers. The ``--kmer-specific-sd`` option is provided in order to estimate a separate standard deviation for each k-mer, but is not recommended as this can have negative effects on some Tombo analyses. In particular, k-mer specific standard deviation estimates can produce poor re-squiggle results due to signal being "packed" into high SD k-mers. Note that these levels are an approximation for the expected signal levels at a position, but consistent deviations from this model indicate that nanopore signal is modulated at some rate by longer range sequence effects. These values are stored in the output file in the binary HDF5 format and can be passed to any Tombo command that takes a Tombo model file. @@ -26,43 +26,55 @@ Several options are supplied in order to ensure more robust parameter estimates The model estimation command is capable of using mutiple processes via the ``--multiprocess-region-size`` and ``--processes`` options with similar behavior as these options in the ``tombo detect_modifications`` command. The multi-processing only applies to the genome position level computation and not the global model estimation stage; as such changes in multi-processing options will not change resulting models. -============================================ -``tombo build_model estimate_alt_reference`` -============================================ +============================ +Alternative Model Estimation +============================ --------------------------- Alternative Reference Goals --------------------------- -One of the main goals of the Tombo suite of tools is to make alternative model estimation more accessible. Key to this goal is the estimation of an alternative model from a relatively simple to prodcue biological sample. A significant additional goal is the estimation of a model capable of detecting an alternative base in all sequence contexts.
+One goal of the Tombo suite of tools is to make alternative model estimation more accessible. Key to this goal is the estimation of an alternative model from a relatively simple to produce biological sample. A significant additional goal is the estimation of a model capable of detecting an alternative base in all sequence contexts. -In order to address these goals, the sample required for alternative model estimation must contain the four canonical bases along with a **single, known, alternative base incorporated randomly instead of one canonical base** into a sample with a known genome (referred to as the "*alternative sample*" below). The rate of incorporation for the alternative base should ideally be between 15% and 35%, though a larger range may be acceptable. Key to this method is that the exact known location of alternative base incorporation is not needed, though the base must incorporate in place of only a single canonical base (referred to as the "*swap base*" below and specified with the ``--alternate-model-base`` option to ``tombo build_model estimate_alt_reference``). +In order to address these goals, two alternative model estimation procedures are provided. The first method estimates a model from a set of known modifications (either from alternative evidence or from a known modification motif). The second method attempts to estimate an alternative model over all contexts, but generally produces a less accurate model. -The creation of such a sample for the estimation of the included 5-methylcytosine (5mC) model was completed by introducing 25% (ratio to canonical dCTP) 5-methyl-dCTP into a standard PCR reaction in E. coil. Note that a standard PCR'ed (or otherwise produced canonical bases only) sample is also required for alternative model estimation (referred to as the "*standard sample*" below).
For the included N6-methyladenosine (6mA) model, the sample was produced using an in vitro methylase thus exemplifying the flexibility of the alternative model estimation method to different sample preparation techniques. These samples were then re-squiggled and processed with the ``tombo build_model estimate_alt_reference`` command to produce the included 5mC and 6mA models. +----------------------------------------------- +Motif-specific Alternative Reference Estimation +----------------------------------------------- + +The motif-specific model estimation procedure (available via ``tombo build_model estimate_motif_alt_reference`` command) requires a sample containing modified bases at either known locations or completely modified within a known sequence motif. If the sample is assumed to contain modifications at only some motif sites (e.g. human CpG methylation), the ``--valid-locations-filename`` option is provided. Otherwise the model will be fit from all motif sites. + +For motif-specific models, the estimation procedure is quite similar to the canonical model estimation method, except that instead of using all sites with sufficient coverage, only known modified sites are included in the model. As such these models can be much more accurate than the all-contexts models. + +When these models are used within the ``tombo detect_modifications alternative_model`` command only sites at the motif specified will output modified base predictions.
---------------------------------------- -Alternative Reference Estimation Method ---------------------------------------- +--------------------------------------------- +All-contexts Alternative Reference Estimation +--------------------------------------------- + +The all-context model estimation procedure (available via ``tombo build_model estimate_alt_reference`` command) requires a sample containing the four canonical bases along with a **single, known, alternative base incorporated randomly instead of one canonical base** with a known genome (referred to as the "*alternative sample*" below). The rate of alternative base incorporation should ideally be between 15% and 35%, though a larger range may be acceptable. Key to this method is that the exact known location of alternative base incorporation is not needed, though the base must incorporate in place of only a single canonical base (referred to as the "*swap base*" below and specified with the ``--alternate-model-base`` option to ``tombo build_model estimate_alt_reference``). + +The creation of such a sample for the estimation of the included 5-methylcytosine (5mC) model was completed by introducing 25% (ratio to canonical dCTP) 5-methyl-dCTP into a standard PCR reaction in E. coli. Note that a standard PCR'ed (or otherwise produced canonical bases only) sample is also required for alternative model estimation (referred to as the "*standard sample*" below). For the included N6-methyladenosine (6mA) model, the sample was produced using an in vitro methylase thus exemplifying the flexibility of the alternative model estimation method to different sample preparation techniques. These samples were then re-squiggled and processed with the ``tombo build_model estimate_alt_reference`` command to produce the included 5mC and 6mA models.
Base Level Extraction -^^^^^^^^^^^^^^^^^^^^^ +""""""""""""""""""""" Given the above descsribed standard and alternative samples, the alternative model estimation procedure begins with the extraction of the current signal level from a number of reads from both samples. These signal levels are grouped by the genomic k-mer at the location assigned by the re-squiggle algorithm. Importantly, in contrast to standard reference estimation, the signal is not averaged or otherwise processed at the genomic position level. This is because each swap base genomic position contains some proportion of canonical and alternative bases. Reads continue to be processed until every k-mer has at least ``--minimum-kmer-observations`` unique event observations. For PCR'ed samples in paricular, the ``tombo filter level_coverage`` command can help speed up this processing step if the sample coverage is highly variable. In order to save on the memory footprint, event levels are no longer stored once 10,000 obervations have been made for a particular k-mer. Signal Level Density Estimation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +""""""""""""""""""""""""""""""" Once enough observations have been parsed for each k-mer, a kernel density estimate is computed for each k-mer within the standard and alternative samples. This kernel density estimate can be controled with the ``--kernel-density-bandwidth`` option. The density estimates can be stored by specifying the ``--save-density-basename`` option, and this is highly recommended as the event extraction can be a long process. Future estimation efforts can then load these density estimates using the ``--alternate-density-filename`` and ``--control-density-filename`` options. Additionally, the ``scripts/debug_est_alt.R`` script can produce some useful visualizations from these files. 
Alternative Base Density Isolation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +"""""""""""""""""""""""""""""""""" After standard and alternative kernel density estimation, an algorithm is applied to isolate the alternative distribution. This algorithm assumes that a portion of each alternative current level density represents the canonical only density. Thus when this portion of the alternative density is "subtracted" away, the alternative base distribution is all that is left. Alternative Base Incorporation Rate -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +""""""""""""""""""""""""""""""""""" The first step in this process is to estimate the fraction of each k-mer alternative density composed of canonical signal levels. In order to estimate this value, the ratio of the highest peak of the standard density and the closest peak in the alternative sample density is computed for all k-mers including exactly one swap base. Before this ratio computation, alternative densities is shifted due to scaling issues for highly modified samples. This shift is estimated from the emperical signal levl distributions at each non-swap-base-containing k-mer and is fitted with a quadratic function. @@ -78,14 +90,14 @@ Most of these k-mers are likely to shift the signal only slightly (though this m ---- Canonical Density "Subtraction" -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +""""""""""""""""""""""""""""""" Once the alternative base incorporation rate has been obtained, the alternative base expected level is computed by first isolating the alternative base density. This computation simply involoves subtracting, from the alternative sample kernel density estimate, the canonical kernel density estimate scaled by the alternative base incorporation rate. Any position where the scaled canonical density is greater than the alternative sample density is set to zero. Each k-mer's expected signal level is then taken as the weighted mean computed from the isolated alternative density. 
The spread for each k-mer is taken as the globally estimated standard deviation from the canonical model estimation as spread measures from the isolated distribution are not robust. For k-mers not containing any swap bases, the standard model expected level is taken. For kmers containing more than one swap base, the canonical distribution scaling factor is adjusted appropraitely assuming that each swap base has the same estimated incorporation rate. This is why only single swap base k-mers are used in the incorporation rate estimation stage. Alternative Model Output -^^^^^^^^^^^^^^^^^^^^^^^^ +"""""""""""""""""""""""" The alternative model is then saved to the file specified with the ``--alternate-model-filename`` option. Also specified is the ``--alternate-model-name`` option, which should be a short name describing the alternative base. When ``tombo detect_modifications`` is run with this alternative model, the results are saved with this short name included in the output Tombo statsitics filename. diff --git a/docs/modified_base_detection.rst b/docs/modified_base_detection.rst index 8c5f0be..7b9b720 100644 --- a/docs/modified_base_detection.rst +++ b/docs/modified_base_detection.rst @@ -2,7 +2,7 @@ Modified Base Detection *********************** -Tombo enables three methods for detecting shifts in current signal level, indicative of non-canonical bases. These three methods allow researchers to investigate non-canonical bases given any sample type, while enabling more accurate detection of specific modifications when applicable. +Tombo enables four methods (including two sample comparison methods) for detecting shifts in raw current signal level, indicative of non-canonical bases. These four methods allow researchers to investigate non-canonical bases given any sample type, while enabling more accurate detection of specific modifications when applicable. 
---- @@ -13,18 +13,22 @@ Tombo enables three methods for detecting shifts in current signal level, indica ---- -All three methods are accessed by the ``tombo detect_modifications`` command group as described below. +All four methods are accessed by the ``tombo detect_modifications`` command group as described below. **TL;DR**: -* To identify 5-methylcytosine (5mC; DNA or RNA) and N6-methyladenosine (6mA; DNA only), run ``tombo detect_modifications alternative_model`` with the ``--alternate-bases 5mC 6mA`` option -* For more experimental *de novo* modified base detection simply run ``tombo detect_modifications de_novo`` with just a set of reads -* For modified base detection via comparison to a control sample (e.g. PCR or IVT) run ``tombo detect_modifications sample_compare`` with a control set of reads (``--control-fast5-basedirs``) -* The ``tombo detect_modifications`` command will produce a binary file (not intended for use outside the Tombo framework) +* Motif-specific models (new in version 1.5; E. coli dcm and dam and CpG methylation motifs available) provide the most accurate modified base detection and are thus the preferred method where applicable. + + - Access these models via the ``tombo detect_modifications alternative_model`` command with the ``--alternate-bases dam dcm CpG`` option. +* All-context alternate models (less accurate than motif models) identify 5-methylcytosine (5mC; DNA or RNA) and N6-methyladenosine (6mA; DNA only) in any sequence context, run ``tombo detect_modifications alternative_model`` with the ``--alternate-bases 5mC 6mA`` option. +* For more experimental *de novo* modified base detection run ``tombo detect_modifications de_novo`` providing only a set of reads to compare with the canonical base model +* For modified base detection via comparison to a control sample (e.g. 
PCR or IVT) run ``tombo detect_modifications model_sample_compare`` with a control set of reads (``--control-fast5-basedirs``) +* For modified base detection via comparison to any second sample run ``tombo detect_modifications level_sample_compare`` with a second set of reads (``--alternate-fast5-basedirs``) +* Each ``tombo detect_modifications`` command will produce a binary file (not intended for use outside the Tombo framework) - To extract useful text files see the ``tombo text_output`` commands - To visualize raw signal around significant regions use the ``tombo plot most_significant`` command - - To assess testing results around a known motif use the ``tombo plot motif_with_stats``, ``tombo plot roc``, and ``tombo plot per_read_roc`` commands + - To assess testing results given a ground truth (motif or previously identified sites) use the ``tombo plot motif_with_stats``, ``tombo plot roc``, ``tombo plot sample_compare_roc``, ``tombo plot per_read_roc`` and ``tombo plot sample_compare_per_read_roc`` commands .. hint:: @@ -33,9 +37,9 @@ All three methods are accessed by the ``tombo detect_modifications`` command gro Specific Alternate Base Detection (Recommended) =============================================== -In order to specifically detect 5mC and/or 6mA, use the ``tombo detect_modifications alternative_model`` command. This command computes a statistic similar to a log likelihood ratio (LLR) but dynamically scaled to be more robust to outlier signal levels. This statistic is computed for each "swap base" within each read provided (e.g. each cytosine for 5mC detection or each adenine for 6mA detection). +In order to detect a specific non-canonical base, use the ``tombo detect_modifications alternative_model`` command. This command identifies sites where signal matches the expected levels for an alternate base better than the canonical expected levels. 
This command computes a statistic similar to a log likelihood ratio (LLR) but dynamically scaled to be more robust to outlier signal levels. This statistic is computed for each "swap base" within each read provided (e.g. each cytosine for 5mC detection or each adenine for 6mA detection). -This statistic is computed by scaling the LLR by the normal likelihood function with the same variance and mean halfway between the canonical and alternative expected signal levels. Three additional scaling factors are added to this function in order to give greater weight to sequence contexts with larger differences between the canonical and alternative expected signal levels, which inherently provide more power to distinguish the canonical and alternative base signal levels. These parameters are also set so that values are on relatively the same scale as a log likelihood ratio for setting ``--single-read-threshold`` values. Default values for the scale factors below are :math:`S_f = 4`, :math:`S_{f2} = 3` and :math:`S_p = 0.3`, which produce the functions shown in the figure below. Users can experiment with the effect of these parameters with the provided ``scripts/outlier_robust_llr.R`` script. +This statistic is computed by scaling the LLR by the normal likelihood function with the same variance and mean halfway between the canonical and alternate expected signal levels. Three additional scaling factors are added to this function in order to give greater weight to sequence contexts with larger differences between the canonical and alternate expected signal levels, which inherently provide more power to distinguish the canonical and alternate base signal levels. These parameters are also set so that values are on relatively the same scale as a log likelihood ratio for setting ``--single-read-threshold`` values. Default values for the scale factors below are :math:`S_f = 4`, :math:`S_{f2} = 3` and :math:`S_p = 0.3`, which produce the functions shown in the figure below. 
Users can experiment with the effect of these parameters with the provided ``scripts/outlier_robust_llr.R`` script. .. math:: @@ -52,11 +56,11 @@ In order to compute a standard log likelihood ratio, use the ``--standard-log-li .. figure:: _images/outlier_robust_llr.gif :align: center - Tombo outlier-robust versus standard likelihood ratio statistic over varied differences between canonical and alternative expected signal levels. + Tombo outlier-robust versus standard likelihood ratio statistic over varied differences between canonical and alternate expected signal levels. ---- -This statistic is computed and summed over all positions where the base of interest is included in the modeled k-mer. The default DNA model is a 6-mer, so the signal at the six surrounding genomic bases contribute to the resulting statistic at any one position. For example, for 5mC detection within in a TGGTA **C** GTCCG context, the signal will be tested against expected canonical and alternative 5mC levels at the following locations:: +This statistic is computed and summed over all positions where the base of interest is included in the modeled k-mer. The default DNA model is a 6-mer, so the signal at the six surrounding genomic bases contribute to the resulting statistic at any one position. For example, for 5mC detection within in a TGGTA **C** GTCCG context, the signal will be tested against expected canonical and alternate 5mC levels at the following locations:: TGGTA **C** GTCCG ----------------- @@ -67,69 +71,77 @@ This statistic is computed and summed over all positions where the base of inter A **C** GTCC **C** GTCCG -New alternative base models will be added as they are trained and validated internally. This is the perferred method for modified base detection if a model is available for your biological sample of interest as the exact modification position is identified. +New alternate base models will be added as they are trained and validated internally. 
This is the preferred method for modified base detection if a model is available for your biological sample of interest as the exact modification position is identified. -.. code-block:: bash +Motif-specific models were added in version 1.5 and provide more accurate results based on the model estimation procedure. These motif-specific models will also help to improve the all-context models in the future. Motif-specific models are available for E. coli dam and dcm methylation as well as CpG methylation. - tombo detect_modifications alternative_model --fast5-basedirs \ - --alternate-bases 5mC 6mA --statistics-file-basename sample.alt_model +Motif-specific models are much more user-friendly to train as seen in the :doc:`model_training` section. -.. hint:: +.. code-block:: bash - Users may also train their own alternative base Tombo models and test against these with the advanced ``--alternate-model-filenames`` option. See more details in the :doc:`model_training` section. + tombo detect_modifications alternative_model --fast5-basedirs \ + --alternate-bases CpG --statistics-file-basename sample.alt_model De novo Non-canonical Base Method ================================= In order to perform *de novo* non-canonical base detection, use the ``tombo detect_modifications de_novo`` command. This method is ideal for unknown modification motif detection when using in combination with the ``tombo text_output signif_sequence_context`` command and motif detection software (e.g. `MEME `_). -For each read at each position, this method performs a hypothesis test against the canonical model based on the genomic sequence. Note that this method can be quite error prone and may result in a high false positive rate, especially on a per-read basis. This method also has the lowest barrier to entry, requiring only a set of reads and a reference sequence, allowing any nanopore researcher to start investigating potentially any type of modified base.
+For each read at each position, this method performs a hypothesis test against the canonical model based on the genomic sequence. Note that this method can be quite error prone and may result in a high false positive rate. This method has the advantage of being the lowest barrier to entry, requiring only a set of reads and a reference sequence, allowing any nanopore researcher to start investigating potentially any type of modified base. .. code-block:: bash tombo detect_modifications de_novo --fast5-basedirs \ --statistics-file-basename sample.de_novo -Canonical Sample Comparison Method -================================== +Canonical Sample Comparison Methods +=================================== -In order to perform *canonical sample comparison* modified base detection, use the ``tombo detect_modifications sample_compare`` command with a second set of reads from the same biological sample containing only canonical bases (e.g. PCR for DNA or IVT for RNA) via the ``--control-fast5-basedirs`` option. +As of version 1.5, Tombo provides two sample comparison methods for modified base detection (``model_sample_compare`` and ``level_sample_compare``). -For each sample read, this will perform a hypothesis test against a distribution estimated from the signal levels observed from the control sample reads at each genome position. As of version 1.4, this method uses the canonical base model as a prior for this estimated distribution improving results for low coverage regions (disable canonical prior with the ``--sample-only-estimates`` option or lower the prior impact on estimates by lowering the default ``--model-prior-weights`` values). +The ``model_sample_compare`` method (equivalent to ``sample_compare`` method from Tombo versions <1.5) uses a control set of reads (e.g. PCR for DNA or IVT for RNA; provided via the ``--control-fast5-basedirs`` option) to adjust the canonical model for un-modeled local effects. 
This locally adjusted model is then used as in the ``de_novo`` method to identify deviations from this expected level. The amount of adjustment based on the observed levels can be controlled with the ``--model-prior-weights`` option, which essentially sets a number of pseudo-observations supporting the canonical model. Use the ``--sample-only-estimates`` option to estimate the local expected level only from observed reads (recommended only for high coverage samples). + +The ``level_sample_compare`` method (new in version 1.5) compares two sets of reads to identify inequality in signal level distributions. This method, unlike the other three detection methods, does not perform per-read testing, but compares the two groups of signal levels at each reference position. This method applies either a KS-test (default), U-test or T-test and saves either an effect size statistic (default) or significance p-value. For each test the effect size statistics are the D-statistic for the KS-test, the common language effect size for the U-test (transformed to ``abs(0.5 - S) * 2`` to result in a 0 to 1 scale with 1 indicating a modification) and Cohen's D for the T-test. + +It is recommended that a higher ``--minimum-test-reads`` value be set for the ``level_sample_compare`` command (default is 50) in order to obtain a reliable estimate for the effect size and avoid high false positive rates. This method can be the most reliable for many direct RNA applications where a comparison sample is available. .. code-block:: bash - tombo detect_modifications sample_compare --fast5-basedirs \ + tombo detect_modifications model_sample_compare --fast5-basedirs \ --control-fast5-basedirs \ - --statistics-file-basename sample.compare_sample + --statistics-file-basename sample.model_compare_sample + + tombo detect_modifications level_sample_compare --fast5-basedirs \ + --alternate-fast5-basedirs \ + --statistics-file-basename sample.level_compare_sample .. 
note:: - Due to the nature of nanopore sequencing, the genomic context surrounding the read head effect that current at any position. Thus shifts in signal due to a modified base may occur at several positions to either side of the true modified location. In order to account for this the canonical sample and *de novo* modfied base detection methods accept the ``--fishers-method-context`` option which combines test values, using `Fisher's Method `_, over a moving window across the genome. This can help to center significant values on modified base positions. The default value for this parameter is 1, but reasonable results can be obtained for values between 0 and 3. + Due to the nature of nanopore sequencing, the context surrounding the read head affects the electric current observed at any position. Thus shifts in signal due to a modified base may occur at several positions to either side of the true assigned modified base location. In order to account for this, the canonical sample and *de novo* modified base detection methods accept the ``--fishers-method-context`` option which combines test values, using `Fisher's Method `_, over a moving window across each read. For the ``level_sample_compare`` method the statistics are averaged over this window. This can help to center significant values on true modified base positions. The default value for this parameter is 1, but reasonable results can be obtained for values between 0 and 3. Aggregating Per-read Statistics =============================== -All of the above methods compute per-read, per-genome location test statistics. In order to facilitate research at the genomic location level, these per-read statistics are combined at each genomic location by applying a global threshold identifying each read as supporting a canonical or alternative base. This results in a fraction of reads indicating a modified base at each genomic location.
This global threshold may consist of a single threshold value or a pair of values (where test statistics between the values do not contribute to the estimated fraction of modified reads). +All of the above methods (except the ``level_sample_compare`` method) compute per-read, per-genome location test statistics. In order to facilitate research at the reference location level, these per-read statistics are combined at each reference position by applying a global threshold identifying each read as supporting a canonical or alternate base. This results in a fraction of reads indicating a modified base at each reference position. This global threshold may consist of a single threshold value or a pair of values (where test statistics between the values do not contribute to the estimated fraction of modified reads). All ``tombo detect_modifications`` methods enable output of per-read test statistics (``--per-read-statistics-basename``). Tombo also provides the ``tombo detect_modifications aggregate_per_read_stats`` command in order to apply different global threshold values to per-read statistics without re-computing these statistics. Note it is not possible to change other testing parameters from this command (e.g. ``--fishers-method-context``). Dampened Fraction Estimates =========================== -At low coverage locations the fraction of modified reads estimates can be poor. Thus the ``--coverage-dampen-counts`` option is provided in order to dampen the estimated fraction of modified reads at low coverage locations. This allows easier use of the fraction statistic in downstream analysis. +At low coverage locations the fraction of modified reads estimates can be poor. Thus the ``--coverage-dampen-counts`` option is provided in order to dampen the estimated fraction of modified reads at low coverage locations. This allows easier use of the dampened fraction statistic in downstream analysis. 
- The fraction estimate includes pseudo-counts added to the un-modified and modified read counts (as specified by the ``--coverage-dampen-counts`` option) - This is equivalent to using a beta prior when estimating the fraction of reads modified at each position - Test the effect of different dampen counts using the ``scripts/test_beta_priors.R`` (the default values are shown below) - - The raw fraction is still included in the statistics file (access from python API) + - The raw fraction is still included in the statistics file as well ---- .. figure:: _images/dampened_fraction.png :align: center - Heatmap showing the resulting dampened farction of modified reads given the default ``--coverage-dampen-counts`` values over range of coverage and number of un-modified reads. + Heatmap showing the resulting dampened fraction of modified reads over a range of coverage and number of un-modified reads for the default ``--coverage-dampen-counts 2 0`` values. ---- @@ -147,13 +159,9 @@ For all modified base detection methods, the result is a binary Tombo statistics While the Tombo statistics file is meant to be a binary file not processed by outside tools its contents are described here for completeness. Access to this file is recommended through the ``tombo.tombo_helper.TomboStats`` object in the Tombo python API. -.. important:: - - All other optional arguments to the ``tombo.tombo_stats.TomboStats`` constructor should be left as ``None``; setting these values will delete the file and construct a blank per-read statistics file. - -The Tombo statistics file is in `HDF5 format `_. Attributes at the root level are 1) ``stat_type`` indicating which testing method was used (``model_compare``, ``de_novo`` or ``sample_compare``), 2) ``block_size`` indicating the number of genomic bases in each statistics block and 3) `Cov_Threshold`` containing the coverage threshold applied to this file. +The Tombo statistics file is in `HDF5 format `_. 
Attributes at the root level are 1) ``stat_type`` indicating which testing method was used (``model_compare``, ``de_novo``, ``model_sample_compare``, or ``level_sample_compare``), 2) ``block_size`` indicating the number of genomic bases in each statistics block and 3) ``Cov_Threshold`` containing the coverage threshold applied to this file (except for ``level_sample_compare`` files). -Blocks of statistics are stored in the ``Statistic_Blocks`` group. Within this group, each block of statistics is found within a group named ``Group_NNN``. Each group contains attributes for the block ``start``, ``chrm`` and ``strand``. The ``block_stats`` data set contains the per-location statistics records. Each record contains the following attributes: ``frac``, ``pos``, ``chrm``, ``strand``, ``cov``, ``control_cov``, and ``valid_cov``. +Blocks of statistics are stored in the ``Statistic_Blocks`` group. Within this group, each block of statistics is found within a group named ``Group_NNN``. Each group contains attributes for the block ``start``, ``chrm`` and ``strand``. The ``block_stats`` data set contains the per-location statistics records. Each record contains the following attributes: ``damp_frac``, ``frac``, ``pos``, ``chrm``, ``strand``, ``cov``, ``control_cov``, and ``valid_cov``. For ``level_sample_compare`` files the ``damp_frac`` and ``frac`` values are replaced by the ``stat`` value. ``frac`` contains the fraction of valid (not including per-read statistics within the interval specified by ``--single_read_threshold``) reads at this genomic position identified as the standard base. 
diff --git a/docs/plotting.rst b/docs/plotting.rst index a071acf..3a02518 100644 --- a/docs/plotting.rst +++ b/docs/plotting.rst @@ -41,6 +41,8 @@ Plots are also enabled to visualize the different testing frameworks available i Control these plots with these options: ``--control-fast5-basedirs``, ``--plot-standard-model``, ``--plot-alternate-model 5mC``, ``--tombo-model-filename``, and ``--alternate-model-filename``. +For alternative model plotting (``--plot-alternate-model``), note that, as of v1.5, alternative models are specified for a specific position within each label. Thus a global alternative model cannot be plotted. As such, the alternative model is only displayed around the central base in each plot and only when applicable (i.e. the central base fits the model motif). So the alternative levels may not appear for some regions when a plot specifies the alternative model option. The alternative model levels should always be plotted when plotting using the ``tombo plot most_significant`` command and specifying the same model used in the ``tombo detect_modifications`` call. + ---- .. figure:: _images/sample_comp.png @@ -125,24 +127,26 @@ This command identifies a number (defined by ``--num-statistics``) of genomic re Other Plotting Commands ----------------------- -K-mer Level Distributions -^^^^^^^^^^^^^^^^^^^^^^^^^ +K-mer Signal Levels +^^^^^^^^^^^^^^^^^^^ -In order to investigate the k-mer signal current levels of a particular set of reads, the ``tombo plot kmer`` command is provided. This plot extracts the observed signal levels from a set of reads and groups the signal by the local genomic sequence context (k-mer) and plots the resulting distributions of signal levels. +In order to investigate the signal levels of a particular set of reads, the ``tombo plot kmer`` command is provided. 
This plot extracts the observed signal levels from a set of reads and groups the signal by the local genomic sequence context (k-mer) and plots the resulting distributions of signal levels. ---- .. figure:: _images/kmer_levels.png :align: center - Example k-mer current level distribution plot + Example current level distribution plot ---- ROC Curves ^^^^^^^^^^ -In order to validate the performance of modified base detection results at a known sequence motif, the ``tombo plot roc`` command is provided. This command takes a Tombo statistics file, corresponding motif descriptions and the genome FASTA file. The "area under the curve" (AUC) for each motif is printed and the precision-recall curve is also plotted for each motif on the second page of the resulting PDF. Note that only genomic positions with the canonical base of interest are included in the results from this command (since the alternative model only makes calls at these positions). +In order to validate the performance of modified base detection results at a known sequence motif or set of ground truth locations, a set of ground truth evaluation plotting commands are provided (``roc``, ``sample_compare_roc``, ``per_read_roc``, and ``sample_compare_per_read_roc``). Each of these commands produces both a receiver operating characteristic (ROC) curve and a Precision-Recall curve. Each command also outputs a table of area under the curve (AUC) and mean average precision (mean AP) for all provided ground truth sets. + +For the ``roc`` and ``per_read_roc`` commands ground truth sites are compared to modified base statistics at all other "swap base" sites in the genome. Users should be aware that for more specific motifs the AUC statistic may be somewhat misleading and is not directly comparable from one motif to the next. The ``sample_compare_roc`` and ``sample_compare_per_read_roc`` commands instead use a separate control sample as the ground truth and include only sites at the ground truth motif. 
This creates balanced validation metrics that are more comparable between two motifs and/or modifications. Below is an example command and resulting plot for identifying the known dam and dcm methylase contexts in E. coli using all three provided testing methods. @@ -169,7 +173,7 @@ Below is an example command and resulting plot for identifying the known dam and ---- -It is also possible to compute and plot validation results on a per-read basis from a Tombo per-read statistics file. Along with ROC and precision-recall curves, this command also plots a distribution of test statistics for motif-matching and non-motif-matching sites for each motif provided (see figure below). These plots can be very useful in picking a ``--single-read-threshold`` for use in either the ``detect_modifications`` or ``aggregate_per_read_stats`` commands. +For the per-read ROC plots, along with ROC and precision-recall curves these commands plot a distribution of test statistics for motif-matching and non-motif-matching sites for each motif provided (see figure below). These plots can be very useful in picking a ``--single-read-threshold`` for use in either the ``detect_modifications`` or ``aggregate_per_read_stats`` commands. .. code-block:: bash diff --git a/docs/resquiggle.rst b/docs/resquiggle.rst index dc00916..82a5574 100644 --- a/docs/resquiggle.rst +++ b/docs/resquiggle.rst @@ -2,9 +2,9 @@ Re-squiggle Algorithm ********************* -The electric current signal level data produced from a nanopore read is referred to as a squiggle. Base calling this squiggle information generally contains some errors compared to a reference sequence. The re-squiggle algorithm defines a new assignment from squiggle to genomic sequence, hence a re-squiggle. +The electric current signal level data produced from a nanopore read is referred to as a squiggle. Base calling this squiggle information generally contains some errors compared to a reference sequence. 
The re-squiggle algorithm defines a new assignment from squiggle to reference sequence, hence a re-squiggle. -The re-squiggle algorithm is the basis for the Tombo framework. The re-squiggle algorithm takes as input a read file (in FAST5 format) containing raw signal and associated base calls. The base calls are mapped to a genome or transcriptome reference and then the raw signal is assigned to the genomic sequence based on an expected current level model. +The re-squiggle algorithm is the basis for the Tombo framework. The re-squiggle algorithm takes as input a read file (in FAST5 format) containing raw signal and associated base calls. The base calls are mapped to a genome or transcriptome reference and then the raw signal is assigned to the reference sequence based on an expected current level model. **TL;DR**: @@ -13,13 +13,13 @@ The re-squiggle algorithm is the basis for the Tombo framework. The re-squiggle - The reference sequence may be previously known or discovered from this sample. -* Importantly, the reference sequence is assumed to be correct, so polishing to create a personalized genome may improve performance, particularly for samples divergent from the reference or poorly assembled genomes. +* Importantly, the reference sequence is assumed to be correct, so polishing to create a personalized reference may improve performance, particularly for a divergent sample or poorly assembled reference. * Raw read FAST5 files must contain basecalls. - Add basecalls from a set of FASTQs to raw read files with the ``tombo preprocess annotate_raw_with_fastqs`` command. - - Read files need not contain ``Events`` data (as output with ``fast5`` mode from albacore). + - Read files need *NOT* contain ``Events`` data (as output with ``fast5`` mode from albacore). -* Tombo currently only supports both DNA and RNA data (including R9.4 and R9.5; 1D and 1D2 data; R9.*.1 chemistries). Other data may produce sub-optimal results (e.g. R7 data). 
+* Tombo currently only supports both DNA and RNA data (including R9.4 and R9.5; 1D and 1D2 data; R9.*.1 chemistries). Other data may produce sub-optimal results (e.g. R9.0 or R7 data). * DNA and RNA reads will be detected automatically and processed accordingly (set explicitly with ``--dna`` or ``--rna``). - Tombo does not perform spliced mapping. Thus a transcriptime reference must be passed to the re-squiggle command for RNA samples. For futher details on Tombo RNA processing see the :doc:`rna` section. diff --git a/docs/text_output.rst b/docs/text_output.rst index c16656f..bfa94f8 100644 --- a/docs/text_output.rst +++ b/docs/text_output.rst @@ -2,7 +2,7 @@ Text Outputs ************ -Two text outputs are available from Tombo: +Tombo provides two text outputs: 1. Genome Broser Files - Genome browser compatible per-genomic-base statistics 2. Fasta - Genomic sequence output surrounding identified modified base sites @@ -16,18 +16,28 @@ Several statistics are available for output: * ``coverage`` - The coverage level for mapped and validly re-squiggled reads * ``valid_coverage`` - The coverage level for reads that are mapped, validly re-squiggled and outside the interval specified by ``--single-read-threshold`` specified in a ``--statistics-filename``. 
-* ``dampened_fraction`` - The estimated fraction of significantly modified reads -* ``fraction`` - The raw fraction of significantly modified reads +* ``dampened_fraction`` - The estimated fraction of significantly modified reads (non-``level_sample_compare`` modified base detection methods only) +* ``fraction`` - The raw fraction of significantly modified reads (non-``level_sample_compare`` modified base detection methods only) +* ``statistic`` - Statistic produced from ``level_sample_compare`` method * ``signal`` - The mean signal level across all reads mapped to this location * ``signal_sd`` - The mean signal standard deviation across all reads mapped to this location (not available unless ``--include-event-stdev`` was provided in ``tombo resquiggle`` command) * ``dwell`` - The mean number of raw observations observed assigned to this location * ``difference`` - The difference in normalized signal level between a sample and control set of reads +.. hint:: + + The ``dampened_fraction`` output adds pseudo-counts to the detected number of un-modified and modified reads at each tested location (as specified by the ``--coverage-dampen-counts`` option), while the ``fraction`` option returns the raw fraction of modified reads at any reference site from ``detect_modifications`` results. The ``dampened_fraction`` output is intended to allow the inclusion of low coverage regions in downstream analysis without causing potentially false positive sites at the top of rank lists. Visualize different values of the ``--coverage-dampen-counts`` option with the included ``scripts/test_beta_priors.R`` script. + +Motif Filtering Output +~~~~~~~~~~~~~~~~~~~~~~ + +The ``tombo text_output browser_files`` command contains options ``--motif-descriptions`` and ``--genome-fasta`` enabling computed statistics output to be restricted to only those locations at known/putative motif-centered modifications. These options apply to the ``fraction``, ``dampened_fraction`` and ``valid_coverage`` file types. 
+ .. note:: - ``signal``, ``signal_sd``, ``dwell`` and ``difference`` require each reads' event level data to be extracted from the raw read files and thus may be quite slow. ``coverage``, ``valid_coverage``, ``fraction`` , and ``dampened_fraction`` can be extracted simply from the tombo statistics files, which is much faster. + ``signal``, ``signal_sd``, ``dwell`` and ``difference`` require each reads' event level data to be extracted from the raw read files and thus may be quite slow. ``valid_coverage``, ``fraction`` , ``dampened_fraction`` and ``statistic`` can be extracted from the tombo statistics files and ``coverage`` from the Tombo index, which is much faster. - The ``signal``, ``signal_sd``, ``dwell`` and ``difference`` outputs all require the ``--fast5-basedirs`` option, the ``valid_coverage``, ``fraction`` , and ``dampened_fraction`` outputs require the ``--statistics-filename`` option, and ``coverage`` output requires one or the other. + The ``signal``, ``signal_sd``, ``dwell`` and ``difference`` outputs all require the ``--fast5-basedirs`` option, the ``valid_coverage``, ``fraction`` , ``dampened_fraction`` and ``statistic`` outputs require the ``--statistics-filename`` option, and ``coverage`` output requires one or the other. Files will be output to individual wiggle files (two per statistic for plus and minus genomic strand) in the following format ``[wiggle-basename].[wiggle-type].[sample|control]?.[plus|minus].wig`` diff --git a/docs/tutorials.rst b/docs/tutorials.rst new file mode 100644 index 0000000..ed53917 --- /dev/null +++ b/docs/tutorials.rst @@ -0,0 +1,195 @@ +*************** +Tombo Tutorials +*************** + +This page contains several tutorials with commands to perform suggested Tombo analysis pipelines. + +.. contents:: :local: + +Human CpG Methylation Detection +------------------------------- + +Detect CpG methylation and output to browser file wiggle format. 
+ +:: + + # re-squiggle raw reads + tombo resquiggle path/to/fast5s/ reference.fasta \ + --processes 4 --num-most-common-errors 5 + + # run modified base detection + tombo detect_modifications alternative_model \ + --fast5-basedirs path/to/fast5s/ \ + --statistics-file-basename human_cpg_testing \ + --alternate-bases CpG --processes 4 + + # output to genome browser compatible format + tombo text_output browser_files \ + --fast5-basedirs path/to/fast5s/ \ + --statistics-filename human_cpg_testing.CpG.tombo.stats \ + --file-types coverage dampened_fraction \ + --browser-file-basename human_cpg_testing + +Resulting files: + + - human_cpg_testing.coverage.plus.bedgraph + - human_cpg_testing.coverage.minus.bedgraph + - human_cpg_testing.CpG.dampened_fraction_modified_reads.plus.wig + - human_cpg_testing.CpG.dampened_fraction_modified_reads.minus.wig + +can be loaded into a genome browser for visualization or processed with bioinformatic tools taking the wiggle format. + +De novo Bacterial Modified Base Detection +----------------------------------------- + +This tutorial walks through the steps to estimate a methylase recognition motif as well as identifying the most likely modified base within that motif. The tutorial then walks through the steps to train a more accurate motif model and use the model to call modified sites in a new sample. Finally, Tombo provides tools to validate the accuracy for this model. 
+ +:: + + # re-squiggle raw reads + tombo resquiggle path/to/fast5s/ reference.fasta \ + --processes 4 --num-most-common-errors 5 + + # run de novo modified base detection for motif discovery + tombo detect_modifications de_novo \ + --fast5-basedirs path/to/fast5s/ \ + --statistics-file-basename de_novo_testing \ + --processes 4 + + # output reference sequence around most significantly modified sites + tombo text_output signif_sequence_context \ + --fast5-basedirs path/to/fast5s/ \ + --statistics-filename de_novo_testing.tombo.stats \ + --sequences-filename de_novo_testing.fasta + + # use command line meme (conda install -c bioconda meme) to estimate modified motif(s) + meme -oc de_novo_testing.meme -dna \ + -mod zoops de_novo_testing.fasta + +MEME will produce enriched motifs given the sites identified by Tombo de novo modified base detection. These motifs can be used to search public databases for known methylase recognition motifs and the exact modified base identity. + +While the Tombo framework cannot identify the type of modified base present within a motif context, the relative position of the modified base within that motif can be estimated using the ``tombo plot motif_with_stats`` command. This plot will identify the sites in the reference with the most significantly modified base testing values within a set distance of the designated motif and plot the generated statistics centered on this motif. For example if one had found the E. coli dcm ``CCWGG`` motif the ``tombo plot motif_with_stats`` command could be run as follows: + +:: + + tombo plot motif_with_stats --fast5-basedirs path/to/fast5s/ \ + --motif CCWGG --genome-fasta reference.fasta \ + --statistics-filename de_novo_testing.tombo.stats \ + --pdf-filename de_novo_testing.pdf + +This would produce a plot like this: + +.. figure:: _images/stat_dist.png + :align: center + +---- + +Note that the most significant sites identified for this plot need not be within the motif provided. 
So if a random motif (not containing a methylase recognition motif in the sample of interest) is provided there should be no enrichment for significant sites within the motif. For example the ``CAW`` motif produces the following results: + +.. figure:: _images/stat_dist_null.png + :align: center + +---- + +Following this analysis a more accurate Tombo motif model can be produced in order to test for this specific modification in this or new samples. + +:: + + # build model at discovered CCWGG motif and test on this sample + tombo build_model estimate_motif_alt_reference \ + --fast5-basedirs path/to/fast5s/ \ + --alternate-model-filename novel_de_novo.tombo.model \ + --alternate-model-name novel_mod --motif-description CCWGG:2 \ + --processes 4 + tombo detect_modifications alternative_model \ + --fast5-basedirs path/to/fast5s/ \ + --statistics-file-basename novel_de_novo \ + --alternate-model-filename novel_de_novo.tombo.model --processes 4 + + # use this model to call modifications in a new control sample + tombo resquiggle path/to/control/fast5s/ reference.fasta \ + --processes 4 --num-most-common-errors 5 + tombo detect_modifications alternative_model \ + --fast5-basedirs path/to/control/fast5s/ \ + --statistics-file-basename novel_de_novo.control \ + --alternate-model-filename novel_de_novo.tombo.model --processes 4 + +Finally the quality of this model can be tested by comparing the results to a control sample which does not contain modifications (e.g. PCR). + +:: + + # validate model with ground truth comparison + tombo plot sample_compare_roc --genome-fasta reference.fasta \ + --statistics-filenames novel_de_novo.novel_mod.tombo.stats \ + --control-statistics-filenames novel_de_novo.control.novel_mod.tombo.stats \ + --motif-descriptions CCWGG:2:"Novel De novo Modification" \ + --pdf-filename novel_de_novo.samp_comp_roc.pdf + +This command will output quality metrics (AUC and mean AP) as well as plotting the ROC curve and precision-recall curve. 
+ +Build Motif-specific Model from Ground Truth +-------------------------------------------- + +In this tutorial a model is built from a sample containing modifications at known locations (e.g. bisulfite sequencing). + +.. Note:: + + In order to estimate a Tombo alternate base motif model, the sample must contain a valid reference observation for each k-mer (as defined by ``--upstream-bases`` and ``--downstream-bases``), and ideally many observations for each k-mer. + + Note also that these estimated k-mer levels are approximations of the expected signal levels and that larger sequence contexts likely modulate the signal levels in most cases. + +In this example, we have a sample with a modification occurring at the second base in TT dimers (as defined by the ``--motif-description TT:2`` option) and the modification exists specifically at the sites defined within the provided ``--valid-locations-filename modified_locations.bed`` file. + +:: + + tombo resquiggle path/to/fast5s/ reference.fasta \ + --processes 4 --num-most-common-errors 5 + + # build model specifying valid modification locations + tombo build_model estimate_motif_alt_reference \ + --fast5-basedirs path/to/fast5s/ \ + --alternate-model-filename known_TT_mod.tombo.model \ + --alternate-model-name TT_mod --motif-description TT:2 \ + --valid-locations-filename modified_locations.bed \ + --processes 4 + +As in the previous de novo bacterial tutorial, the quality of this model can be tested by comparing the results to a control sample which does not contain modifications (e.g. PCR). 
+ +:: + + # test on modified and control samples + tombo detect_modifications alternative_model \ + --fast5-basedirs path/to/fast5s/ \ + --statistics-file-basename modified_sample \ + --alternate-model-filename known_TT_mod.tombo.model --processes 4 + tombo detect_modifications alternative_model \ + --fast5-basedirs path/to/control/fast5s/ \ + --statistics-file-basename control_sample \ + --alternate-model-filename known_TT_mod.tombo.model --processes 4 + + # validate model with ground truth comparison + tombo plot sample_compare_roc --genome-fasta reference.fasta \ + --statistics-filenames modified_sample.TT_mod.tombo.stats \ + --control-statistics-filenames control_sample.TT_mod.tombo.stats \ + --motif-descriptions TT:2:"Known TT Modification" \ + --pdf-filename TT_mod.samp_comp_roc.pdf + +RNA Level Testing +----------------- + +RNA modifications can be more subtle to detect in raw nanopore data, so the ``tombo detect_modifications level_sample_compare`` compare method has been added in version 1.5. This tutorial walks through running this analysis and some considerations when doing so. + +Running the ``level_sample_compare`` command is quite similar to other modified base detection methods: + +:: + + tombo detect_modifications level_sample_compare \ + --fast5-basedirs path/to/fast5s/ \ + --alternate-fast5-basedirs path/to/comparison/fast5s/ \ + --statistics-file-basename level_testing \ + --statistic-type ks --processes 4 + +There are several considerations when running this command: + + - This command computes effect size measures by default (as statistical measures are generally too dependent on sequencing depth, but are available via the ``--store-p-value`` option). Thus a higher read coverage threshold generally provides more robust results (``--minimum-test-reads`` has a default of ``50`` for ``level_sample_compare`` command). 
+ - The effect size measure produced should not be interpreted as a fraction of modified bases or confidence metric for a modified base at a site (though effect sizes for KS and U tests are in the range from 0 to 1). Different modifications within different sequence contexts will produce different effect sizes under ideal conditions. But given higher read depth, peaks in this value within a sample can be useful, especially for hypothesis generation. diff --git a/scripts/debug_params.R b/scripts/debug_params.R index 2ccdde6..9d7c63a 100644 --- a/scripts/debug_params.R +++ b/scripts/debug_params.R @@ -1,59 +1,74 @@ -library(dplyr) -library(ggplot2) -library(ggridges) -library(ggbeeswarm) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(ggplot2)) +suppressPackageStartupMessages(library(ggridges)) ## set _DEBUG_PARAMS = True in resquiggle.py ## example run for min_obs_per_base testing: -##for i in {0..6}; do -## testParam=`echo $i | awk '{print ($1 * 1) + 2}'` -## tombo resquiggle param_test_reads/ genome.fasta --segmentation-parameters 5 $testParam 5 --signal-align-parameters 4.2 4.2 1200 1.75 5.0 --processes 4 +##for i in {0..2}; do +## testParam=`echo $i | awk '{print ($1 * 1) + 1}'` +## tombo resquiggle param_test_reads/ genome.fasta \ +## --segmentation-parameters 5 3 $testParam 5 \ +## --signal-align-parameters 4.2 4.2 300 1500 20.0 40 750 2500 250 \ +## --processes 4 ##done > param_values.txt -stat <- 'skip_pen' - -dat <- read.table('param_values.txt') -colnames(dat) <- c('running_window', 'min_obs_per_base', 'mean_obs_per_event', - 'match_evalue', 'skip_pen', 'bandwidth', - 'read_name', 'mean_score') -dat$mean_obs_per_event <- factor(dat$mean_obs_per_event) -dat$running_window <- factor(dat$running_window) -dat$min_obs_per_base <- factor(dat$min_obs_per_base) -dat$match_evalue <- factor(dat$match_evalue) -dat$skip_pen <- factor(dat$skip_pen) -dat$bandwidth <- factor(dat$bandwidth) - -dat <- dat %>% 
group_by(mean_obs_per_event, min_obs_per_base, running_window, - match_evalue, skip_pen, bandwidth, read_name) %>% + +dat <- read.table('param_values.txt', header=TRUE) + +## update with output from tombo.resquiggle._write_params_debug +colnames(dat) <- c( + 'running_window', 'min_obs_per_base', 'raw_min_obs_per_base', + 'mean_obs_per_event', 'match_evalue', 'skip_pen', 'bandwidth', + 'read_name', 'mean_score') + +## filter out save bandwidth reads if bandwidth was not tested here +if(length(unique(dat$bandwidth)) == 2){ + dat <- dat %>% filter(bandwidth == min(as.numeric(dat$bandwidth))) +} + +#3 convert params to factors and get stat that was investigated in this run +param_names <- setdiff(colnames(dat), c('mean_score', 'read_name')) +for(param_name in param_names){ + dat[,param_name] <- factor(dat[,param_name]) +} +stat <- param_names[which.max(sapply(param_names, function(param_name) + length(unique(dat[,param_name]))))] + +## take min score over same read with same params (over re-scalings) +dat <- dat %>% group_by_at(c(param_names, 'read_name')) %>% summarize(mean_score=min(mean_score)) +## filter for reads included in all parameter groups rdat <- dat %>% group_by(read_name) %>% summarize(nreads=n()) maxNReads <- rdat$read_name[which(rdat$nreads == max(rdat$nreads))] fdat <- dat %>% filter(read_name %in% maxNReads) -dat %>% group_by_at(stat) %>% - summarize(med=median(mean_score), mean=mean(mean_score)) %>% - print.data.frame(digits=6) -fdat %>% group_by_at(stat) %>% - summarize(med=median(mean_score), mean=mean(mean_score)) %>% - print.data.frame(digits=6) +## compute and print mean and median stats for plotting +sumDat <- dat %>% group_by_at(stat) %>% + summarize(med=median(mean_score), mean=mean(mean_score)) +sumFDat <- fdat %>% group_by_at(stat) %>% + summarize(med=median(mean_score), mean=mean(mean_score)) -minMed <- dat %>% group_by_at(stat) %>% summarize(med=median(mean_score)) %>% - summarize(foo=min(med)) %>% .$foo -minMedF <- fdat %>% 
group_by_at(stat) %>% summarize(med=median(mean_score)) %>% - summarize(foo=min(med)) %>% .$foo +sumDat %>% print.data.frame(digits=6) +sumFDat %>% print.data.frame(digits=6) pdf(paste0('param_values.', stat, '.pdf'), width=10) -ggplot(dat, aes_string(x=stat, y='mean_score', color=stat)) + - geom_hline(aes(yintercept=minMed)) + - geom_beeswarm(alpha=0.3, cex=0.5) + - stat_summary(fun.y=median, color='red', geom='point', size=2) + - stat_summary(fun.y=mean, color='orange', geom='point', size=2) + - theme_bw() + theme(axis.text.x=element_text(angle=60, hjust=1)) -ggplot(fdat, aes_string(x=stat, y='mean_score', color=stat)) + - geom_hline(aes(yintercept=minMedF)) + - geom_beeswarm(alpha=0.3, cex=0.5) + - stat_summary(fun.y=median, color='red', geom='point', size=2) + - stat_summary(fun.y=mean, color='orange', geom='point', size=2) + - theme_bw() + theme(axis.text.x=element_text(angle=60, hjust=1)) +ggplot(dat, aes_string(y=stat, x='mean_score', fill=stat)) + + geom_vline(aes(xintercept=min(sumDat$med))) + + geom_density_ridges(alpha=0.3, cex=0.5) + + geom_point(aes_string(x='med', y=stat), + color='red', size=5, data=sumDat) + + geom_point(aes_string(x='mean', y=stat), + color='orange', size=5, data=sumDat) + + theme_minimal() + theme(legend.position="none") + + coord_cartesian(xlim=quantile(dat$mean_score, c(0.01, 0.98))) +ggplot(fdat, aes_string(y=stat, x='mean_score', fill=stat)) + + geom_vline(aes(xintercept=min(sumFDat$med))) + + geom_density_ridges(alpha=0.3, cex=0.5) + + geom_point(aes_string(x='med', y=stat), + color='red', size=5, data=sumFDat) + + geom_point(aes_string(x='mean', y=stat), + color='orange', size=5, data=sumFDat) + + theme_minimal() + theme(legend.position="none") + + coord_cartesian(xlim=quantile(fdat$mean_score, c(0.01, 0.98))) foo <- dev.off() diff --git a/tombo/R_scripts/debugRawDP.R b/tombo/R_scripts/debugRawDP.R new file mode 100644 index 0000000..818a2cc --- /dev/null +++ b/tombo/R_scripts/debugRawDP.R @@ -0,0 +1,42 @@ +minDPScore <- 
-250 + +plotRawDP <- function(zDat, fwdDat, tbDat, sigDat){ + for(reg in unique(zDat$Region)){ + tbReg <- tbDat[tbDat$Region == reg,] + + regZDat <- zDat[zDat$Region == reg,] + regZDat$Score[regZDat$Score < minDPScore] <- minDPScore + zP <- ggplot(regZDat) + + geom_tile(aes(x=EventPos, y=SeqPos, fill=Score)) + + scale_fill_gradient2( + high='#67001f', mid='#ffffbf', low='#1a1a1a', + midpoint=mean(range(regZDat$Score))) + + geom_line(aes(x=EventPos, y=SeqPos), + data=tbReg, color='steelblue') + + theme_minimal() + ylab('Genomic Sequence') + + theme(axis.title.x=element_blank(), axis.text.x=element_blank()) + + ggtitle(tbDat$Region[1]) + + xlim(min(sigDat$Pos) - 1, max(sigDat$Pos) + 1) + + regFwdDat <- fwdDat[fwdDat$Region == reg,] + regFwdDat$Score[regFwdDat$Score < minDPScore] <- minDPScore + fP <- ggplot(regFwdDat) + + geom_tile(aes(x=EventPos, y=SeqPos, fill=Score)) + + scale_fill_gradient2( + high='#67001f', mid='#ffffbf', low='#1a1a1a', + midpoint=mean(range(regFwdDat$Score))) + + geom_line(aes(x=EventPos, y=SeqPos), + data=tbReg, color='steelblue') + + theme_minimal() + ylab('Genomic Sequence') + + theme(axis.title.x=element_blank(), axis.text.x=element_blank()) + + ggtitle(tbDat$Region[1]) + + xlim(min(sigDat$Pos) - 1, max(sigDat$Pos) + 1) + + sP <- ggplot(sigDat) + geom_line(aes(x=Pos, y=Signal, color=0)) + + theme_minimal() + xlab('Position') + + xlim(min(sigDat$Pos) - 1, max(sigDat$Pos) + 1) + + print(plot_grid(zP, sP, align='v', ncol=1, rel_heights=c(5,1))) + print(plot_grid(fP, sP, align='v', ncol=1, rel_heights=c(5,1))) + } +} diff --git a/tombo/R_scripts/plotModelComp.R b/tombo/R_scripts/plotModelComp.R index 3ffd1ab..7404959 100644 --- a/tombo/R_scripts/plotModelComp.R +++ b/tombo/R_scripts/plotModelComp.R @@ -22,8 +22,10 @@ plotModelComp <- function(sigDat, quantDat, boxDat, eventDat, unique(as.character(eventDat$Region)))) for(reg_i in regions){ reg_model_dat <- modelDat[modelDat$Region==reg_i,] + reg_has_alt <- FALSE if(!is.null(altModelDat)){ - 
reg_alt_model_dat <- altModelDat[modelDat$Region==reg_i,] + reg_alt_model_dat <- altModelDat[altModelDat$Region==reg_i,] + reg_has_alt = nrow(reg_alt_model_dat) > 0 } reg_base_dat <- baseDat[baseDat$Region==reg_i,] title <- TitleDat[TitleDat$Region==reg_i,'Title'] @@ -41,7 +43,7 @@ plotModelComp <- function(sigDat, quantDat, boxDat, eventDat, gPos=rep(psDat$Position[1], nDens), Group=rep(psDat$Region[1], nDens)) }) - if(!is.null(altModelDat)){ + if(reg_has_alt){ altModDensDat <- lapply(split( reg_alt_model_dat, paste0(reg_alt_model_dat$Position, reg_alt_model_dat$Strand)), @@ -78,7 +80,7 @@ plotModelComp <- function(sigDat, quantDat, boxDat, eventDat, geom_polygon(aes(x=Position, y=Signal, group=gPos), data=normDensDat, fill='black', alpha=0.4, size=0, show.legend=FALSE) - if(!is.null(altModelDat)){ + if(reg_has_alt){ altNormDensDat <- do.call( rbind.data.frame, lapply(altModDensDat, function(posDens){ diff --git a/tombo/R_scripts/plotMotifStats.R b/tombo/R_scripts/plotMotifStats.R index 08188c8..98b7d73 100644 --- a/tombo/R_scripts/plotMotifStats.R +++ b/tombo/R_scripts/plotMotifStats.R @@ -130,7 +130,7 @@ plotMotifStats <- function(PlotDat, BaseDat, StatsDat, axis.ticks.x=element_blank(), axis.title.x=element_blank(), panel.grid.minor.y=element_blank()) + - ylab('Est. 
Fraction Modified'))) + ylab('Fraction Modified or -log10(p-value)'))) maxWidth <- do.call(grid::unit.pmax, sapply(ps, function(x) x$widths[1:4])) ps <- lapply(ps, function(p){ diff --git a/tombo/R_scripts/plotPerReadStats.R b/tombo/R_scripts/plotPerReadStats.R index efca2e7..f0a638c 100644 --- a/tombo/R_scripts/plotPerReadStats.R +++ b/tombo/R_scripts/plotPerReadStats.R @@ -1,5 +1,5 @@ # set thresholds for plotting tile -pointMaxReads <- 30 +pointMaxReads <- 40 pointMaxBases <- 200 textLim <- 150 @@ -36,7 +36,8 @@ plotPerReadStats <- function(StatData, OrdData, baseDat, boxCenter, arePvals){ p <- p + geom_tile(aes(x=Position, y=Read, fill=Stats)) } else { p <- p + geom_point(aes(x=Position, y=Read, fill=Stats), - stroke=0, color='#969696', size=5, shape=21) + stroke=0, color='#969696', size=5, shape=21, + alpha=0.5) } lhRatioMax <- max(abs(regDat$Stats)) breaks <- seq(-lhRatioMax, lhRatioMax, length.out=5) diff --git a/tombo/R_scripts/plotROC.R b/tombo/R_scripts/plotROC.R index d2eccfe..6834048 100644 --- a/tombo/R_scripts/plotROC.R +++ b/tombo/R_scripts/plotROC.R @@ -3,6 +3,6 @@ plotROC <- function(rocDat){ geom_path(aes(x=FP, y=TP, color=Comparison)) + theme_bw() + xlab('False Positive Rate') + ylab('True Positive Rate')) print(ggplot(rocDat) + - geom_path(aes(x=Precision, y=TP, color=Comparison)) + theme_bw() + - xlab('Precision') + ylab('Recall')) + geom_path(aes(x=TP, y=Precision, color=Comparison)) + theme_bw() + + xlab('Recall') + ylab('Precision')) } diff --git a/tombo/R_scripts/plotROCPerRead.R b/tombo/R_scripts/plotROCPerRead.R index 3d35ca1..1e572c9 100644 --- a/tombo/R_scripts/plotROCPerRead.R +++ b/tombo/R_scripts/plotROCPerRead.R @@ -7,8 +7,8 @@ plotROCPerRead <- function(rocDat, denStats){ geom_path(aes(x=FP, y=TP, color=Comparison)) + theme_bw() + xlab('False Positive Rate') + ylab('True Positive Rate')) print(ggplot(rocDat) + - geom_path(aes(x=Precision, y=TP, color=Comparison)) + theme_bw() + - xlab('Precision') + ylab('Recall')) + 
geom_path(aes(x=TP, y=Precision, color=Comparison)) + theme_bw() + + xlab('Recall') + ylab('Precision')) for(modName in names(denStats)){ denStats[[modName]]$stat[denStats[[modName]]$stat > lhRatioMax] <- lhRatioMax @@ -17,6 +17,7 @@ plotROCPerRead <- function(rocDat, denStats){ print(ggplot(denStats[[modName]]) + geom_density(aes(x=stat, fill=motif_match), alpha=0.5, color='white', size=0.01) + - theme_bw() + ggtitle(modName)) + theme_bw() + ggtitle(modName) + + scale_fill_discrete(name="Ground Truth\nModified")) } } diff --git a/tombo/__init__.py b/tombo/__init__.py index 54aa32a..66f7eeb 100644 --- a/tombo/__init__.py +++ b/tombo/__init__.py @@ -9,7 +9,7 @@ .. note:: - Effort will be made to maintain this API interface introduced at Tombo version 1.3.1, but major structural changes to the Tombo framework may require changes to some API interface components. Such changes will be noted in github release notes where applicable. + Effort will be made to maintain this API interface introduced at Tombo version 1.4, but major structural changes to the Tombo framework may require changes to some API interface components. Such changes will be noted in github release notes where applicable. 
------------------- Python API Examples diff --git a/tombo/__main__.py b/tombo/__main__.py index bbae7f2..5a947c2 100644 --- a/tombo/__main__.py +++ b/tombo/__main__.py @@ -57,10 +57,15 @@ def main(args=None): ('alternative_model', 'Test for shifts in raw signal which match ' + 'those of a specific known non-canonical base.', _option_parsers.get_alt_test_signif_parser()), - ('sample_compare', 'Test for shifts in raw signal against signal ' + - 'levels derived from a canonical base only sample (PCR/IVT).', + ('model_sample_compare', 'Test for shifts in raw signal ' + + 'against levels estimated from a canonical/control ' + + 'sample (PCR/IVT) at each reference position.', _option_parsers.get_samp_comp_test_signif_parser()), - ('aggregate_per_read_stats','Aggregate Tombo per-read statistics ' + + ('level_sample_compare', 'Test for shifts in raw signal against ' + + 'signal level distributions from a canonical/control sample ' + + '(PCR/IVT) at each reference position.', + _option_parsers.get_group_comp_test_signif_parser()), + ('aggregate_per_read_stats', 'Aggregate Tombo per-read statistics ' + 'to produce a genomic base statistics file.', _option_parsers.get_aggregate_per_read_parser()), ]), @@ -74,39 +79,6 @@ def main(args=None): 'modified genomic locations.', _option_parsers.get_write_signif_diff_parser()), ]), - ('plot', 'Save plots to visualize raw nanopore signal or ' + - 'testing results.', [ - ('max_coverage', - 'Plot raw signal in regions with maximal coverage.', - _option_parsers.get_max_cov_parser()), - ('genome_locations', - 'Plot raw signal at defined genomic locations.', - _option_parsers.get_genome_loc_parser()), - ('motif_centered', - 'Plot raw signal at a specific motif.', - _option_parsers.get_motif_loc_parser()), - ('max_difference', - 'Plot raw signal where signal differs most between two ' + - 'read groups.', _option_parsers.get_max_diff_parser()), - ('most_significant', - 'Plot raw signal at most modified locations.', - 
_option_parsers.get_signif_diff_parser()), - ('motif_with_stats', - 'Plot example signal and statistic distributions around a ' + - 'motif of interst.', _option_parsers.get_signif_motif_parser()), - ('per_read', - 'Plot per-read modified base probabilities.', - _option_parsers.get_per_read_parser()), - ('roc','Plot ROC curve from known motif(s).', - _option_parsers.get_roc_parser()), - ('per_read_roc','Plot per-read ROC curve from known motif(s).', - _option_parsers.get_per_read_roc_parser()), - ('kmer','Plot signal distributions acorss kmers.', - _option_parsers.get_kmer_dist_parser()), - ('cluster_most_significant', - 'Clustering traces at bases with most significant stats.', - _option_parsers.get_cluster_signif_diff_parser()), - ]), ('build_model', 'Create canonical and alternative base Tombo models.', [ ('estimate_reference', 'Estimate reference tombo model derived from the provided reads.', @@ -115,6 +87,9 @@ def main(args=None): 'a sample containing canonical bases spiked with a single ' + 'non-standard base.', _option_parsers.get_est_alt_ref_parser()), + ('estimate_motif_alt_reference', 'Estimate alternative tombo ' + + 'model from a sample containing modified bases within a known ' + + 'sequence motif.', _option_parsers.get_est_motif_alt_ref_parser()), ('estimate_scale', 'Estimate a global scaling parameter from a ' + 'sub-set of reads.', _option_parsers.get_estimate_scale_parser()), @@ -123,11 +98,60 @@ def main(args=None): _option_parsers.get_event_resquiggle_parser()), ]), ] + plot_commands = ( + 'plot', 'Save plots to visualize raw nanopore signal or ' + + 'testing results.', [ + ('Genome Location (Standard)', + (('max_coverage', + 'Plot raw signal in regions with maximal coverage.', + _option_parsers.get_max_cov_parser()), + ('genome_locations', + 'Plot raw signal at defined genomic locations.', + _option_parsers.get_genome_loc_parser()), + ('motif_centered', + 'Plot raw signal at a specific motif.', + _option_parsers.get_motif_loc_parser()), + 
('max_difference', + 'Plot raw signal where signal differs most between two ' + + 'read groups.', _option_parsers.get_max_diff_parser()), + ('most_significant', + 'Plot raw signal at most modified locations.', + _option_parsers.get_signif_diff_parser()), + )), + ('Genome Location (Other)', ( + ('motif_with_stats', + 'Plot example signal and statistic distributions around a ' + + 'motif of interest.', _option_parsers.get_signif_motif_parser()), + ('per_read', + 'Plot per-read modified base probabilities.', + _option_parsers.get_per_read_parser()), + )), + ('Ground Truth Performance Evaluation', + (('roc','Plot ROC curve from known motif(s).', + _option_parsers.get_roc_parser()), + ('sample_compare_roc', + 'Plot ROC curve comparing two samples at known motif(s).', + _option_parsers.get_control_roc_parser()), + ('per_read_roc','Plot per-read ROC curve from known motif(s).', + _option_parsers.get_per_read_roc_parser()), + ('sample_compare_per_read_roc', + 'Plot per-read ROC curve comparing two samples at known motif(s).', + _option_parsers.get_control_per_read_roc_parser()), + )), + ('Other', + (('kmer','Plot signal distributions across kmers.', + _option_parsers.get_kmer_dist_parser()), + ('cluster_most_significant', + 'Clustering traces at bases with most significant stats.', + _option_parsers.get_cluster_signif_diff_parser())) + )]) + desc = ('Tombo command groups (additional help available ' 'within each command group):\n' + '\n'.join([ '\t{0: <25}{1}'.format(grp_name, grp_help) - for grp_name, grp_help, _ in rsqgl_help + nested_commands])) + for grp_name, grp_help, _ in rsqgl_help + nested_commands + + [plot_commands,]])) parser = argparse.ArgumentParser( prog='tombo', description='********** Tombo *********\n\nTombo is a suite of tools ' + @@ -163,6 +187,21 @@ def main(args=None): subparser_cmd = grp_subparser.add_parser( cmd_name, parents=[cmd_parser,], add_help=False) + # add plot commands in groups for easier command determination + plot_desc = '\n\n'.join([ 
+ grp + '\n' + '\n'.join([ + '\t{0: <30}{1}'.format(cmd, cmd_help) + for cmd, cmd_help, _ in cmds]) + for grp, cmds in plot_commands[2]]) + plot_parser = service_subparsers.add_parser( + 'plot', formatter_class=SubcommandHelpFormatter, description=plot_desc) + plot_subparser = plot_parser.add_subparsers( + title='plot', dest="action_command") + for grp, cmds in plot_commands[2]: + for cmd_name, cmd_help, cmd_parser in cmds: + subparser_cmd = plot_subparser.add_parser( + cmd_name, parents=[cmd_parser,], add_help=False) + try: save_args = args args = parser.parse_args(args) @@ -211,6 +250,8 @@ def main(args=None): tombo_stats._est_ref_main(args) elif args.action_command == 'estimate_alt_reference': tombo_stats._est_alt_ref_main(args) + elif args.action_command == 'estimate_motif_alt_reference': + tombo_stats._est_motif_alt_ref_main(args) elif args.action_command == 'estimate_scale': tombo_stats._estimate_scale_main(args) else: diff --git a/tombo/_c_dynamic_programming.pyx b/tombo/_c_dynamic_programming.pyx index b4bd750..e1f3a9b 100644 --- a/tombo/_c_dynamic_programming.pyx +++ b/tombo/_c_dynamic_programming.pyx @@ -70,7 +70,7 @@ def c_reg_z_scores( # clip positions from the end of each base for idx in range(reg_len): base_i = base_range[reg_len - idx - 1] - b_sig_end = r_b_starts[min(reg_end - 1, base_i + max_base_shift + 1)] + b_sig_end = r_b_starts[min(reg_end, base_i + max_base_shift + 1)] # clip observations from the end of a base if there is no # possible traceback path through that location if (prev_end_set and @@ -91,8 +91,8 @@ def c_reg_z_scores( c_base_z_scores(r_sig[b_sig_start:b_sig_end], r_ref_means[base_i], r_ref_sds[base_i], do_winsorize_z, np_max_half_z_score), ( - b_sig_start-r_b_starts[reg_start], - b_sig_end-r_b_starts[reg_start]))) + b_sig_start - r_b_starts[reg_start], + b_sig_end - r_b_starts[reg_start]))) return reg_scores diff --git a/tombo/_c_helper.pyx b/tombo/_c_helper.pyx index 0b1c5a3..8b5b3f3 100644 --- a/tombo/_c_helper.pyx +++ 
b/tombo/_c_helper.pyx @@ -35,8 +35,9 @@ def c_mean_std(np.ndarray[DTYPE_t] values): v_var += (values[idx] - v_mean)**2 return v_mean, sqrt(v_var / v_len) -def c_new_mean_stds(np.ndarray[DTYPE_t] norm_signal not None, - np.ndarray[DTYPE_INT_t] new_segs not None): +def c_new_mean_stds( + np.ndarray[DTYPE_t] norm_signal not None, + np.ndarray[DTYPE_INT_t] new_segs not None): cdef DTYPE_INT_t n_segs = new_segs.shape[0] - 1 cdef np.ndarray[DTYPE_t] means_arr = np.empty(n_segs, dtype=DTYPE) cdef np.ndarray[DTYPE_t] stds_arr = np.empty(n_segs, dtype=DTYPE) @@ -55,8 +56,9 @@ def c_new_mean_stds(np.ndarray[DTYPE_t] norm_signal not None, stds_arr[idx] = sqrt(curr_var / seg_len) return means_arr, stds_arr -def c_new_means(np.ndarray[DTYPE_t] norm_signal not None, - np.ndarray[DTYPE_INT_t] new_segs not None): +def c_new_means( + np.ndarray[DTYPE_t] norm_signal not None, + np.ndarray[DTYPE_INT_t] new_segs not None): cdef DTYPE_INT_t n_segs = new_segs.shape[0] - 1 cdef np.ndarray[DTYPE_t] means_arr = np.empty(n_segs, dtype=DTYPE) cdef DTYPE_t curr_sum @@ -117,8 +119,9 @@ def c_valid_cpts_w_cap( return cpts -def c_valid_cpts(np.ndarray[DTYPE_t] raw_signal, DTYPE_INT_t min_base_obs, - DTYPE_INT_t running_stat_width): +def c_valid_cpts( + np.ndarray[DTYPE_t] raw_signal, DTYPE_INT_t min_base_obs, + DTYPE_INT_t running_stat_width): cdef np.ndarray[DTYPE_t] raw_cumsum = np.cumsum( np.concatenate([[0.0], raw_signal])) # get difference between all neighboring running_stat_width regions diff --git a/tombo/_default_parameters.py b/tombo/_default_parameters.py index a570261..c3965c4 100644 --- a/tombo/_default_parameters.py +++ b/tombo/_default_parameters.py @@ -15,6 +15,9 @@ ALTERNATE_MODELS = { DNA_SAMP_TYPE + '_5mC':'tombo.DNA.5mC.model', DNA_SAMP_TYPE + '_6mA':'tombo.DNA.6mA.model', + DNA_SAMP_TYPE + '_dcm':'tombo.DNA.dcm.model', + DNA_SAMP_TYPE + '_dam':'tombo.DNA.dam.model', + DNA_SAMP_TYPE + '_CpG':'tombo.DNA.CpG.model', RNA_SAMP_TYPE + '_5mC':'tombo.RNA.5mC.model', } @@ -26,10 
+29,11 @@ # table containing default segmentation parameters for different sample types # 1) running neighboring window width for segmentation scoring # 2) minimum observations per genomic base -# 3) mean number of observations per event during segmentation +# 3) raw re-squiggle minimum observations per genomic base +# 4) mean number of observations per event during segmentation SEG_PARAMS_TABLE = { - RNA_SAMP_TYPE:(12, 5, 15), - DNA_SAMP_TYPE:(5, 3, 5), + RNA_SAMP_TYPE:(12, 6, 2, 15), + DNA_SAMP_TYPE:(5, 3, 1, 5), } # table containing default signal to sequence assignment parameters @@ -187,7 +191,7 @@ NANOPOLISH_CENTRAL_POS = 2 # default values for dampened fraction computations -COV_DAMP_COUNTS = [2, 0.5] +COV_DAMP_COUNTS = [2, 0] # store N arrays during stat computation before re-computing the # most significant array diff --git a/tombo/_event_resquiggle.py b/tombo/_event_resquiggle.py index 91c1244..3ad54f5 100644 --- a/tombo/_event_resquiggle.py +++ b/tombo/_event_resquiggle.py @@ -333,7 +333,7 @@ def resquiggle_read( if seg_params is None: seg_params = SEG_PARAMS_TABLE[RNA_SAMP_TYPE] if rna else \ SEG_PARAMS_TABLE[RNA_SAMP_TYPE] - (running_stat_width, min_obs_per_base, _) = seg_params + (running_stat_width, min_obs_per_base, _, _) = seg_params else: running_stat_width, min_obs_per_base = seg_params diff --git a/tombo/_option_parsers.py b/tombo/_option_parsers.py index 514c8de..8994f73 100644 --- a/tombo/_option_parsers.py +++ b/tombo/_option_parsers.py @@ -92,6 +92,10 @@ 'type':unicode, 'nargs':'+', 'help':'Set of directories containing fast5 files for control reads, ' + 'containing only canonical nucleotides.'}) +altfast5dir_opt=('--alternate-fast5-basedirs', { + 'type':unicode, 'nargs':'+', + 'help':'Set of directories containing fast5 files for alternate set of ' + + 'reads.'}) corrgrp_opt=('--corrected-group', { 'type':unicode, 'default':'RawGenomeCorrected_000', @@ -198,19 +202,55 @@ 'type':unicode, 'nargs':'+', 'help':'Binary files containing per-read 
statistics from ' + 'statistical testing.'}) +prctrlstats_opt=('--per-read-control-statistics-filenames', { + 'type':unicode, 'nargs':'+', + 'help':'Binary files containing per-read control statistics from ' + + 'statistical testing.'}) statfns_opt=('--statistics-filenames', { 'type':unicode, 'nargs':'+', - 'help':"Files to load base by base statistics."}) -motifdesc_opt=('--motif-descriptions', { + 'help':"Files to load genomic base anchored statistics."}) +ctrlstatfns_opt=('--control-statistics-filenames', { + 'type':unicode, 'nargs':'+', 'help':'Files to load genomic base anchored ' + + 'statistics from a control sample.'}) +validlocs_opt=('--valid-locations-filename', { + 'type':unicode, + 'help':"Bed format file containing single base locations of valid sites. " + + "Should contain 6 fields including strand. E.g. modified base locations."}) +moddescs_opt=('--modified-locations', { + 'type':unicode, 'nargs':'+', + 'help':'Modification description and bed format files containing ' + + 'single base locations of ground truth modified sites. Bed files should ' + + 'contain 6 fields including strand. Format descriptions as ' + + '"mod_name:locs.bed". Example: "CpG bisulfite":bisulfite_locs.bed'}) +unmodlocs_opt=('--unmodified-locations', { + 'type':unicode, 'nargs':'+', + 'help':'Bed format files containing single base locations of ground ' + + 'truth unmodified sites. Bed files should contain 6 fields including ' + + 'strand.'}) +motifdesc_opt=('--motif-description', { + 'type':unicode, + 'help':'Motif containing alternate-base. All positions with this motif ' + + 'should be modified (or filtered with [--valid-locations-filename]). ' + + 'Format descriptions as: "motif:mod_pos". mod_pos indicates the ' + + 'alternate-base within the motif (1-based index). 
Example: "CG:1" to ' + + 'train a CpG methylation model.'}) +motifdescs_opt=('--motif-descriptions', { 'type':unicode, 'nargs':'+', 'help':'Ground truth, motif centered, modified base descriptions for ' + 'computing ROC and PR curves. Each statistics file is associated with ' + 'a set of motif descriptions. Format descriptions as: "motif:mod_pos:name' + - '[::motif2:mod_pos2:name2...]". The mod_pos indicated the modified base ' + + '[::motif2:mod_pos2:name2...]". mod_pos indicates the alternate-base ' + 'within the motif (1-based index). Example: CCWGG:2:"dcm 5mC"::GATC:2:' + '"dam 6mA" would assess the performance of a single Tombo statistics ' + 'file for identification of E. coli dam and dcm methylation.'}) +motifdescsimp_opt=('--motif-descriptions', { + 'type':unicode, 'nargs':'+', + 'help':'Ground truth, motif centered, modified base descriptions for ' + + 'output filtering. Format descriptions as: "motif:mod_pos:name". The ' + + 'mod_pos indicates the modified base within the motif (1-based index). ' + + 'Example: CCWGG:2:dcm_5mC GATC:2:dam_6mA would filter output for ' + + 'identification of E. coli dam and dcm methylation.'}) ############################ @@ -335,8 +375,9 @@ segpars_opt=('--segmentation-parameters', { 'type':int, 'nargs':len(next(iter(SEG_PARAMS_TABLE.values()))), 'help':'Specify parameters for segmentation 1) running neighboring ' + - 'windows width 2) minimum raw observations per genomic base 3) mean raw ' + - 'observations per event. Sample type defaults: ' + + 'windows width 2) minimum raw observations per genomic base, 3) raw ' + + 're-squiggle min obs per base 4) mean raw observations per event. ' + + 'Sample type defaults: ' + ' || '.join((bst + ' : ' + ' '.join(map(str, params))) for bst, params in SEG_PARAMS_TABLE.items())}) hidsegpars_opt=('--segmentation-parameters', { @@ -422,6 +463,11 @@ 'default':False, 'action':'store_true', 'help':'Include reads that partially overlap the specified region. 
' + 'Default: Only include reads completely contained in a specified region'}) +storepval_opt=('--store-p-value', { + 'default':False, 'action':'store_true', + 'help':'Store p-value instead of effect-size statistic. Statistics are ' + + 'D-statistic (KS-test), deviation from even common language effect size ' + + "(u-test), and Cohen's D (t-test)."}) readmean_opt=('--read-mean', { 'default':False, 'action':'store_true', @@ -593,11 +639,14 @@ ftypes_opt=('--file-types', { 'type':unicode, 'default':['coverage', ], 'nargs':'+', 'choices':['coverage', 'valid_coverage', 'fraction', 'dampened_fraction', - 'signal', 'signal_sd', 'dwell', 'difference'], + 'signal', 'signal_sd', 'dwell', 'difference', 'statistic'], 'help':'Data types of genome browser files to produce. Produced coverage ' + 'files are in bedGraph format, while all other file types will be in ' + 'wiggle format (https://genome.ucsc.edu/goldenpath/help/wiggle.html). ' + 'Default: "coverage"'}) +stype_opt=('--statistic-type', { + 'type':unicode, 'default':'ks', 'choices':['ks', 'u', 't'], + 'help':'Type of statistical test to apply. Default: "ks"'}) dna_opt=('--dna', { 'dest':'seq_sample_type', 'action':'store_const', 'const':DNA_SAMP_TYPE, @@ -855,7 +904,7 @@ def get_est_ref_parser(): def get_est_alt_ref_parser(): parser = argparse.ArgumentParser( - description='Estimate alternative k-mer reference model for use ' + + description='Estimate alternate-base k-mer reference model for use ' + 'in testing for specific modification types. [--fast5-basedirs] ' + 'should contain a sample spiked with a single known randomly ' + 'incorporated base.', add_help=False) @@ -894,6 +943,39 @@ def get_est_alt_ref_parser(): return parser +def get_est_motif_alt_ref_parser(): + parser = argparse.ArgumentParser( + description='Estimate motif-specific alternate-base k-mer reference ' + + 'model. 
[--fast5-basedirs] should contain a sample with a specific ' + + 'alternate-base occurring at a specific motif and ideally only ' + + 'canonical bases otherwise. These models often perform better than ' + + 'models trained from samples with random alternate-base incorporation.', + add_help=False) + req_args = parser.add_argument_group('Required Arguments') + req_args.add_argument(atbmod_opt[0], required=True, **atbmod_opt[1]) + req_args.add_argument(altname_opt[0], required=True, **altname_opt[1]) + req_args.add_argument(motifdesc_opt[0], required=True, **motifdesc_opt[1]) + req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) + + mod_args = parser.add_argument_group( + 'Modeling Arguments (Should match canonical model)') + mod_args.add_argument(upstrmbs_opt[0], **upstrmbs_opt[1]) + mod_args.add_argument(dnstrmbs_opt[0], **dnstrmbs_opt[1]) + + filt_args = parser.add_argument_group('Filtering Argument') + filt_args.add_argument(minkmer_opt[0], default=1, **minkmer_opt[1]) + filt_args.add_argument(validlocs_opt[0], **validlocs_opt[1]) + filt_args.add_argument(minreads_opt[0], default=10, **minreads_opt[1]) + filt_args.add_argument(covthresh_opt[0], default=100, **covthresh_opt[1]) + + multi_args = parser.add_argument_group('Multiprocessing Arguments') + multi_args.add_argument(mpreg_opt[0], **mpreg_opt[1]) + multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1]) + + fast5_args, misc_args, parser = add_default_args(parser) + + return parser + def get_estimate_scale_parser(): parser = argparse.ArgumentParser( description='Estimate a global scaling parameter from a ' + @@ -966,8 +1048,8 @@ def get_alt_test_signif_parser(): def get_samp_comp_test_signif_parser(): parser = argparse.ArgumentParser( description='Test for significant shifts in raw nanopore signal ' + - 'away from a control/canonical base only sample (usually ' + - 'PCR for DNA or IVT for RNA).', add_help=False) + 'from levels estimated from a control/canonical base only sample ' + 
+ '(usually PCR for DNA or IVT for RNA).', add_help=False) req_args = parser.add_argument_group('Required Argument') req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) req_args.add_argument(statbsnm_opt[0], required=True, **statbsnm_opt[1]) @@ -992,6 +1074,34 @@ def get_samp_comp_test_signif_parser(): return parser +def get_group_comp_test_signif_parser(): + parser = argparse.ArgumentParser( + description='Test for significant shifts in raw nanopore signal ' + + 'away from a control/canonical base only sample (usually ' + + 'PCR for DNA or IVT for RNA).', add_help=False) + req_args = parser.add_argument_group('Required Argument') + req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) + req_args.add_argument(statbsnm_opt[0], required=True, **statbsnm_opt[1]) + req_args.add_argument( + altfast5dir_opt[0], required=True, **altfast5dir_opt[1]) + + test_args = parser.add_argument_group('Significance Test Arguments') + test_args.add_argument(fmo_opt[0], **fmo_opt[1]) + test_args.add_argument(minreads_opt[0], default=50, **minreads_opt[1]) + test_args.add_argument(stype_opt[0], **stype_opt[1]) + test_args.add_argument(storepval_opt[0], **storepval_opt[1]) + + io_args = parser.add_argument_group('Output Argument') + io_args.add_argument(mstsgnf_opt[0], **mstsgnf_opt[1]) + + multi_args = parser.add_argument_group('Multiprocessing Arguments') + multi_args.add_argument(mpreg_opt[0], **mpreg_opt[1]) + multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1]) + + fast5_args, misc_args, parser = add_default_args(parser) + + return parser + def get_aggregate_per_read_parser(): parser = argparse.ArgumentParser( description='Aggregate per-read statistics to produce a standard ' + @@ -1358,13 +1468,41 @@ def get_roc_parser(): add_help=False) req_args = parser.add_argument_group('Required Argument') req_args.add_argument(statfns_opt[0], required=True, **statfns_opt[1]) - req_args.add_argument(motifdesc_opt[0], required=True, 
**motifdesc_opt[1]) + + gt_args = parser.add_argument_group( + 'Ground Truth Arguments (provide bed files or motifs)') + gt_args.add_argument(moddescs_opt[0], **moddescs_opt[1]) + gt_args.add_argument(unmodlocs_opt[0], **unmodlocs_opt[1]) + gt_args.add_argument(motifdescs_opt[0], **motifdescs_opt[1]) + gt_args.add_argument(fasta_opt[0], **fasta_opt[1]) + + out_args = parser.add_argument_group('Output Arguments') + out_args.add_argument( + pdf_opt[0], default=OUTPUT_BASE + '.roc.pdf', **pdf_opt[1]) + + limit_args = parser.add_argument_group('Down-sampling Arguments') + limit_args.add_argument(allspb_opt[0], **allspb_opt[1]) + limit_args.add_argument(tsl_opt[0], default=5000000, **tsl_opt[1]) + + misc_args, parser = add_misc_args(parser) + + return parser + +def get_control_roc_parser(): + parser = argparse.ArgumentParser( + description='Plot ROC curve comparing a control and native sample ' + + 'at known motif(s).', add_help=False) + req_args = parser.add_argument_group('Required Arguments') + req_args.add_argument(statfns_opt[0], required=True, **statfns_opt[1]) + req_args.add_argument( + ctrlstatfns_opt[0], required=True, **ctrlstatfns_opt[1]) + req_args.add_argument(motifdescs_opt[0], required=True, **motifdescs_opt[1]) req_args.add_argument(fasta_opt[0], required=True, **fasta_opt[1]) out_args = parser.add_argument_group('Output Arguments') - out_args.add_argument(pdf_opt[0], - default=OUTPUT_BASE + '.roc.pdf', - **pdf_opt[1]) + out_args.add_argument( + pdf_opt[0], default=OUTPUT_BASE + '.sample_compare.roc.pdf', + **pdf_opt[1]) limit_args = parser.add_argument_group('Down-sampling Arguments') limit_args.add_argument(allspb_opt[0], **allspb_opt[1]) @@ -1380,8 +1518,13 @@ def get_per_read_roc_parser(): add_help=False) req_args = parser.add_argument_group('Required Argument') req_args.add_argument(prstats_opt[0], required=True, **prstats_opt[1]) - req_args.add_argument(motifdesc_opt[0], required=True, **motifdesc_opt[1]) - req_args.add_argument(fasta_opt[0], 
required=True, **fasta_opt[1]) + + gt_args = parser.add_argument_group( + 'Ground Truth Arguments (provide bed files or motifs)') + gt_args.add_argument(moddescs_opt[0], **moddescs_opt[1]) + gt_args.add_argument(unmodlocs_opt[0], **unmodlocs_opt[1]) + gt_args.add_argument(motifdescs_opt[0], **motifdescs_opt[1]) + gt_args.add_argument(fasta_opt[0], **fasta_opt[1]) limit_args = parser.add_argument_group('Down-sampling Arguments') limit_args.add_argument(spb_opt[0], default=100000, **spb_opt[1]) @@ -1396,6 +1539,30 @@ def get_per_read_roc_parser(): return parser +def get_control_per_read_roc_parser(): + parser = argparse.ArgumentParser( + description='Plot per-read ROC curve comparing a control and ' + + 'native sample at known motif(s).', add_help=False) + req_args = parser.add_argument_group('Required Arguments') + req_args.add_argument(prstats_opt[0], required=True, **prstats_opt[1]) + req_args.add_argument( + prctrlstats_opt[0], required=True, **prctrlstats_opt[1]) + req_args.add_argument(motifdescs_opt[0], required=True, **motifdescs_opt[1]) + req_args.add_argument(fasta_opt[0], required=True, **fasta_opt[1]) + + out_args = parser.add_argument_group('Output Arguments') + out_args.add_argument( + pdf_opt[0], default=OUTPUT_BASE + '.sample_compare.roc.pdf', + **pdf_opt[1]) + + limit_args = parser.add_argument_group('Down-sampling Arguments') + limit_args.add_argument(allspb_opt[0], **allspb_opt[1]) + limit_args.add_argument(tsl_opt[0], default=5000000, **tsl_opt[1]) + + misc_args, parser = add_misc_args(parser) + + return parser + def get_cluster_signif_diff_parser(): parser = argparse.ArgumentParser( description='Cluster signal trace differences at most significant ' + @@ -1442,6 +1609,10 @@ def get_browser_files_parser(): data_args.add_argument(ctrlfast5dir_opt[0], **ctrlfast5dir_opt[1]) data_args.add_argument(statfn_opt[0], **statfn_opt[1]) + motif_args = parser.add_argument_group('Statistic Motif Filter Arguments') + motif_args.add_argument(fasta_opt[0], 
**fasta_opt[1]) + motif_args.add_argument(motifdescsimp_opt[0], **motifdescsimp_opt[1]) + out_args = parser.add_argument_group('Output Arguments') out_args.add_argument(brsrfn_opt[0], **brsrfn_opt[1]) out_args.add_argument(ftypes_opt[0], **ftypes_opt[1]) diff --git a/tombo/_plot_commands.py b/tombo/_plot_commands.py index 08563c0..6e3d045 100644 --- a/tombo/_plot_commands.py +++ b/tombo/_plot_commands.py @@ -57,13 +57,115 @@ #### ROC Curves #### #################### +def prep_accuracy_rates(all_motif_stats): + tp_rates, fp_rates, precisions, mod_names_for_r = [], [], [], [] + if VERBOSE: + sys.stderr.write(' {:<30}{:<6} {:<6}\n'.format( + 'Statistic Type', 'AUC', 'mean AP')) + sys.stderr.write(' {:<30}{:<6} {:<6}\n'.format( + '--------------', '---', '-------')) + for mod_name, mod_stats in all_motif_stats.items(): + # extract motif_match (bool) ordered by stat values + ordered_mod_tf = list(zip(*sorted(mod_stats)))[1] + mod_tp_rate, mod_fp_rate, mod_precision = ts.compute_accuracy_rates( + ordered_mod_tf) + auc = ts.compute_auc(mod_tp_rate, mod_fp_rate) + mean_ap = ts.compute_mean_avg_precison(mod_tp_rate, mod_precision) + if VERBOSE: + sys.stderr.write(' {:<30}{:6.4f} {:6.4f}\n'.format( + mod_name, auc, mean_ap)) + tp_rates.extend(mod_tp_rate) + fp_rates.extend(mod_fp_rate) + precisions.extend(mod_precision) + mod_names_for_r.extend(repeat(mod_name, len(mod_tp_rate))) + + return tp_rates, fp_rates, precisions, mod_names_for_r + def plot_roc( - stats_fns, motif_descs, fasta_fn, pdf_fn, stats_per_block, + stats_fns, motif_descs, fasta_fn, mod_locs_fns, unmod_locs_fns, + pdf_fn, stats_per_block, total_stats_limit): + if motif_descs is None: + if mod_locs_fns is None: + th.error_message_and_exit( + 'Must provide either motifs or bed files describing ground ' + + 'truth modification locations.') + if (len(mod_locs_fns) != len(unmod_locs_fns) or + len(mod_locs_fns) != len(stats_fns)): + th.error_message_and_exit( + 'Must provide exactly one [--modified-locations] and ' 
+ + '[--unmodified-locations] for each statistics file.') + ground_truth_info = [] + for mod_name_fn, unmod_fn in zip(mod_locs_fns, unmod_locs_fns): + mod_name, mod_fn = mod_name_fn.split(':') + ground_truth_info.append(( + th.parse_locs_file(mod_fn), th.parse_locs_file(unmod_fn), + mod_name)) + else: + if len(motif_descs) != len(stats_fns): + th.error_message_and_exit( + 'Must provide exactly one set of motif descriptions for ' + + 'each statistics file.') + if VERBOSE: th.status_message('Parsing motifs.') + ground_truth_info = [ + th.parse_motif_descs(stat_motif_descs) + for stat_motif_descs in motif_descs] + mod_names = [mod_name for stat_motif_descs in ground_truth_info + for _, mod_name in stat_motif_descs] + if len(mod_names) != len(set(mod_names)): + th.error_message_and_exit('Modified base names are not unique.') + + if VERBOSE: th.status_message('Parsing genome.') + genome_index = th.Fasta(fasta_fn) + + all_stats = {} + for stats_fn, stat_gt_info in zip(stats_fns, ground_truth_info): + if not os.path.isfile(stats_fn): + th.warning_message('Statistics file does not exist. Skipping: ' + + stats_fn) + continue + try: + stats = ts.TomboStats(stats_fn) + except Exception as e: + th.warning_message( + 'Unexpected error parsing ' + stats_fn + '. Continuing ' + + 'without processing this file. 
\n\tError code:\n\t\t' + + str(e) + '\n') + continue + if motif_descs is None: + stat_type_stats = stats.compute_ground_truth_stats( + stat_gt_info) + else: + stat_type_stats = stats.compute_motif_stats( + stat_gt_info, genome_index, stats_per_block, + total_stats_limit) + for mod_name, mod_stats in stat_type_stats.items(): + all_stats[mod_name] = mod_stats + stats.close() + + if VERBOSE: th.status_message('Computing accuracy statistics.') + tp_rates, fp_rates, precisions, mod_names_for_r = prep_accuracy_rates( + all_stats) + + if VERBOSE: th.status_message('Plotting.') + rocDat = r.DataFrame({ + 'TP':r.FloatVector(tp_rates), + 'FP':r.FloatVector(fp_rates), + 'Precision':r.FloatVector(precisions), + 'Comparison':r.StrVector(mod_names_for_r)}) + r.r(resource_string(__name__, 'R_scripts/plotROC.R').decode()) + r.r('pdf("' + pdf_fn + '", height=4, width=6)') + r.globalenv[str('plotROC')](rocDat) + r.r('dev.off()') + + return + +def plot_ctrl_samp_roc( + stats_fns, ctrl_fns, motif_descs, fasta_fn, pdf_fn, stats_per_block, total_stats_limit): - if len(motif_descs) != len(stats_fns): + if len(motif_descs) != len(stats_fns) and len(stats_fns) != len(ctrl_fns): th.error_message_and_exit( - 'Must provide exactly one set of motif descriptions for ' + - 'each statistics file.') + 'Must provide exactly one set of motif descriptions and a ' + + 'control sample for each statistics file.') if VERBOSE: th.status_message('Parsing motifs.') motif_descs = [th.parse_motif_descs(stat_motif_descs) @@ -77,11 +179,12 @@ def plot_roc( genome_index = th.Fasta(fasta_fn) all_motif_stats = {} - all_motif_stats_for_r = {} - for stats_fn, stat_motif_descs in zip(stats_fns, motif_descs): - if not os.path.isfile(stats_fn): - th.warning_message('Statistics file does not exist. 
Skipping: ' + - stats_fn) + for stats_fn, ctrl_fn, stat_motif_descs in zip( + stats_fns, ctrl_fns,motif_descs): + if not os.path.isfile(stats_fn) or not os.path.isfile(ctrl_fn): + th.warning_message( + 'Statistics file does not exist. Skipping: ' + + stats_fn + ' ' + ctrl_fn) continue try: stats = ts.TomboStats(stats_fn) @@ -91,68 +194,141 @@ def plot_roc( 'without processing this file. \n\tError code:\n\t\t' + str(e) + '\n') continue - for mod_name, mod_stats in stats.compute_motif_stats( - stat_motif_descs, genome_index, stats_per_block, + try: + ctrl_stats = ts.TomboStats(ctrl_fn) + except Exception as e: + th.warning_message( + 'Unexpected error parsing ' + ctrl_fn + '. Continuing ' + + 'without processing this file. \n\tError code:\n\t\t' + + str(e) + '\n') + continue + + for mod_name, mod_stats in stats.compute_ctrl_motif_stats( + ctrl_stats, stat_motif_descs, genome_index, stats_per_block, total_stats_limit).items(): all_motif_stats[mod_name] = mod_stats stats.close() + ctrl_stats.close() - for mod_name, stats in all_motif_stats.items(): + if VERBOSE: th.status_message('Computing accuracy statistics.') + tp_rates, fp_rates, precisions, mod_names_for_r = prep_accuracy_rates( + all_motif_stats) + + if VERBOSE: th.status_message('Plotting.') + rocDat = r.DataFrame({ + 'TP':r.FloatVector(tp_rates), + 'FP':r.FloatVector(fp_rates), + 'Precision':r.FloatVector(precisions), + 'Comparison':r.StrVector(mod_names_for_r)}) + r.r(resource_string(__name__, 'R_scripts/plotROC.R').decode()) + r.r('pdf("' + pdf_fn + '", height=4, width=6)') + r.globalenv[str('plotROC')](rocDat) + r.r('dev.off()') + + return + +def plot_per_read_roc( + pr_stats_fns, motif_descs, fasta_fn, mod_locs_fns, unmod_locs_fns, + pdf_fn, stats_per_block, total_stats_limit): + if motif_descs is None: + if mod_locs_fns is None: + th.error_message_and_exit( + 'Must provide either motifs or bed files describing ground ' + + 'truth modification locations.') + if (len(mod_locs_fns) != len(unmod_locs_fns) or 
+ len(mod_locs_fns) != len(pr_stats_fns)): + th.error_message_and_exit( + 'Must provide exactly one [--modified-locations] and ' + + '[--unmodified-locations] for each statistics file.') + ground_truth_info = [] + for mod_name_fn, unmod_fn in zip(mod_locs_fns, unmod_locs_fns): + mod_name, mod_fn = mod_name_fn.split(':') + ground_truth_info.append(( + th.parse_locs_file(mod_fn), th.parse_locs_file(unmod_fn), + mod_name)) + else: + if len(motif_descs) != len(pr_stats_fns): + th.error_message_and_exit( + 'Must provide exactly one set of motif descriptions for ' + + 'each statistics file.') + if VERBOSE: th.status_message('Parsing motifs.') + ground_truth_info = [th.parse_motif_descs(stat_motif_descs) + for stat_motif_descs in motif_descs] + mod_names = [mod_name for stat_motif_descs in ground_truth_info + for _, mod_name in stat_motif_descs] + if len(mod_names) != len(set(mod_names)): + th.error_message_and_exit('Modified base names are not unique.') + + if VERBOSE: th.status_message('Parsing genome.') + genome_index = th.Fasta(fasta_fn) + + if VERBOSE: th.status_message('Extracting per-read statistics.') + all_stats = {} + all_stats_for_r = {} + for pr_stats_fn, stat_gt_info in zip(pr_stats_fns, ground_truth_info): + if not os.path.isfile(pr_stats_fn): + th.warning_message('Statistics file does not exist. Skipping: ' + + pr_stats_fn) + continue + try: + pr_stats = ts.PerReadStats(pr_stats_fn) + except Exception as e: + th.warning_message( + 'Unexpected error parsing ' + pr_stats_fn + '. Continuing ' + + 'without processing this file. 
\n\tError code:\n\t\t' + + str(e) + '\n') + continue + if motif_descs is None: + stat_type_stats = pr_stats.compute_ground_truth_stats( + stat_gt_info) + else: + stat_type_stats = pr_stats.compute_motif_stats( + stat_gt_info, genome_index, stats_per_block, + total_stats_limit) + for mod_name, mod_stats in stat_type_stats.items(): + all_stats[mod_name] = mod_stats + pr_stats.close() + + for mod_name, stats in all_stats.items(): unzip_stats = list(zip(*stats)) - all_motif_stats_for_r[mod_name] = r.DataFrame({ + all_stats_for_r[mod_name] = r.DataFrame({ 'stat':r.FloatVector(unzip_stats[0]), 'motif_match':r.BoolVector(unzip_stats[1])}) # python2 rpy2 ListVector can't take unicode keys if sys.version_info[0] < 3: - conv_all_motif_stats_for_r = {} - for k, v in all_motif_stats_for_r.items(): - conv_all_motif_stats_for_r[k.encode()] = v - all_motif_stats_for_r = conv_all_motif_stats_for_r - all_motif_stats_for_r = r.ListVector(all_motif_stats_for_r) + conv_all_stats_for_r = {} + for k, v in all_stats_for_r.items(): + conv_all_stats_for_r[k.encode()] = v + all_stats_for_r = conv_all_stats_for_r + all_stats_for_r = r.ListVector(all_stats_for_r) if VERBOSE: th.status_message('Computing accuracy statistics.') - tp_rates, fp_rates, precisions, mod_names_for_r = [], [], [], [] - if VERBOSE: - sys.stderr.write(' {:<30}{:<6} {:<6}\n'.format( - 'Statistic Type', 'AUC', 'mean AP')) - sys.stderr.write(' {:<30}{:<6} {:<6}\n'.format( - '--------------', '---', '-------')) - for mod_name, mod_stats in all_motif_stats.items(): - # extract motif_match (bool) ordered by stat values - ordered_mod_tf = list(zip(*sorted(mod_stats)))[1] - mod_tp_rate, mod_fp_rate, mod_precision = ts.compute_accuracy_rates( - ordered_mod_tf) - auc = ts.compute_auc(mod_tp_rate, mod_fp_rate) - mean_ap = ts.compute_mean_avg_precison(mod_tp_rate, mod_precision) - if VERBOSE: - sys.stderr.write(' {:<30}{:6.4f} {:6.4f}\n'.format( - mod_name, auc, mean_ap)) - tp_rates.extend(mod_tp_rate) - 
fp_rates.extend(mod_fp_rate) - precisions.extend(mod_precision) - mod_names_for_r.extend(repeat(mod_name, len(mod_tp_rate))) + tp_rates, fp_rates, precisions, mod_names_for_r = prep_accuracy_rates( + all_stats) - if VERBOSE: th.status_message('Plotting.') rocDat = r.DataFrame({ 'TP':r.FloatVector(tp_rates), 'FP':r.FloatVector(fp_rates), 'Precision':r.FloatVector(precisions), 'Comparison':r.StrVector(mod_names_for_r)}) - r.r(resource_string(__name__, 'R_scripts/plotROC.R').decode()) + + if VERBOSE: th.status_message('Plotting.') + r.r(resource_string(__name__, 'R_scripts/plotROCPerRead.R').decode()) r.r('pdf("' + pdf_fn + '", height=4, width=6)') - r.globalenv[str('plotROC')](rocDat) + r.globalenv[str('plotROCPerRead')](rocDat, all_stats_for_r) r.r('dev.off()') return -def plot_per_read_roc( - pr_stats_fns, motif_descs, fasta_fn, pdf_fn, stats_per_block, - total_stats_limit): - if len(motif_descs) != len(pr_stats_fns): +def plot_ctrl_samp_per_read_roc( + pr_stats_fns, pr_ctrl_fns, motif_descs, fasta_fn, pdf_fn, + stats_per_block, total_stats_limit): + if (len(motif_descs) != len(pr_stats_fns) and + len(pr_stats_fns) != len(pr_ctrl_fns)): th.error_message_and_exit( - 'Must provide exactly one set of motif descriptions for ' + - 'each statistics file.') + 'Must provide exactly one set of motif descriptions and a ' + + 'control sample for each statistics file.') if VERBOSE: th.status_message('Parsing motifs.') motif_descs = [th.parse_motif_descs(stat_motif_descs) @@ -165,13 +341,14 @@ def plot_per_read_roc( if VERBOSE: th.status_message('Parsing genome.') genome_index = th.Fasta(fasta_fn) - if VERBOSE: th.status_message('Extracting per-read statistics.') all_motif_stats = {} all_motif_stats_for_r = {} - for pr_stats_fn, stat_motif_descs in zip(pr_stats_fns, motif_descs): - if not os.path.isfile(pr_stats_fn): - th.warning_message('Statistics file does not exist. 
Skipping: ' + - pr_stats_fn) + for pr_stats_fn, pr_ctrl_fn, stat_motif_descs in zip( + pr_stats_fns, pr_ctrl_fns, motif_descs): + if not os.path.isfile(pr_stats_fn) or not os.path.isfile(pr_ctrl_fn): + th.warning_message( + 'Per-read statistics files do not exist. Skipping: ' + + pr_stats_fn + ' ' + pr_ctrl_fn) continue try: pr_stats = ts.PerReadStats(pr_stats_fn) @@ -181,11 +358,21 @@ def plot_per_read_roc( 'without processing this file. \n\tError code:\n\t\t' + str(e) + '\n') continue - for mod_name, mod_stats in pr_stats.compute_motif_stats( - stat_motif_descs, genome_index, stats_per_block, + try: + pr_ctrl_stats = ts.PerReadStats(pr_ctrl_fn) + except Exception as e: + th.warning_message( + 'Unexpected error parsing ' + pr_ctrl_fn + '. Continuing ' + + 'without processing this file. \n\tError code:\n\t\t' + + str(e) + '\n') + continue + + for mod_name, mod_stats in pr_stats.compute_ctrl_motif_stats( + pr_ctrl_stats, stat_motif_descs, genome_index, stats_per_block, total_stats_limit).items(): all_motif_stats[mod_name] = mod_stats pr_stats.close() + pr_ctrl_stats.close() for mod_name, stats in all_motif_stats.items(): unzip_stats = list(zip(*stats)) @@ -202,34 +389,15 @@ def plot_per_read_roc( all_motif_stats_for_r = r.ListVector(all_motif_stats_for_r) if VERBOSE: th.status_message('Computing accuracy statistics.') - tp_rates, fp_rates, precisions, mod_names_for_r = [], [], [], [] - if VERBOSE: - sys.stderr.write(' {:<30}{:<6} {:<6}\n'.format( - 'Statistic Type', 'AUC', 'mean AP')) - sys.stderr.write(' {:<30}{:<6} {:<6}\n'.format( - '--------------', '---', '-------')) - for mod_name, mod_stats in all_motif_stats.items(): - # extract motif_match (bool) ordered by stat values - ordered_mod_tf = list(zip(*sorted(mod_stats)))[1] - mod_tp_rate, mod_fp_rate, mod_precision = ts.compute_accuracy_rates( - ordered_mod_tf) - auc = ts.compute_auc(mod_tp_rate, mod_fp_rate) - mean_ap = ts.compute_mean_avg_precison(mod_tp_rate, mod_precision) - if VERBOSE: - sys.stderr.write(' 
{:<30}{:6.4f} {:6.4f}\n'.format( - mod_name, auc, mean_ap)) - tp_rates.extend(mod_tp_rate) - fp_rates.extend(mod_fp_rate) - precisions.extend(mod_precision) - mod_names_for_r.extend(repeat(mod_name, len(mod_tp_rate))) + tp_rates, fp_rates, precisions, mod_names_for_r = prep_accuracy_rates( + all_motif_stats) + if VERBOSE: th.status_message('Plotting.') rocDat = r.DataFrame({ 'TP':r.FloatVector(tp_rates), 'FP':r.FloatVector(fp_rates), 'Precision':r.FloatVector(precisions), 'Comparison':r.StrVector(mod_names_for_r)}) - - if VERBOSE: th.status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotROCPerRead.R').decode()) r.r('pdf("' + pdf_fn + '", height=4, width=6)') r.globalenv[str('plotROCPerRead')](rocDat, all_motif_stats_for_r) @@ -238,6 +406,7 @@ def plot_per_read_roc( return + ################################### #### K-mer Signal Distribution #### ################################### @@ -582,10 +751,12 @@ def get_r_event_data( if reg_plot_sig != 'Density': continue for strand in ('+', '-'): - if sum(r_data.strand == strand - for r_data in reg_data.reads) == 0: + strand_reads = [ + r_data for r_data in reg_data.reads if r_data.strand == strand] + if len(strand_reads) == 0: continue - reg_events = reg_data.get_base_levels() + reg_events = reg_data.copy(include_reads=False).update( + reads=strand_reads).get_base_levels() for pos, base_read_means in enumerate(reg_events): # skip bases with zero or 1 read as ggplot won't # be able to estimate the density @@ -732,8 +903,7 @@ def get_r_raw_signal_data( th.warning_message( 'Genome resolved raw signal could not be retrieved ' + 'for some reads. 
Ensure that reads have been ' + - 're-squiggled and that all data slot corresponding ' + - 'accordingly.') + 're-squiggled with the specified [--corrected-group].') continue if not genome_centric and r_data.strand == "-": @@ -1284,17 +1454,35 @@ def get_reg_reads(reads, int_start, int_end): std_ref.sds[kmer]) for pos, kmer in enumerate(rev_kmers) if not th.invalid_seq(kmer)])) # if alternative model is supplied add info - if alt_ref is not None: + if alt_ref is not None and len(fwd_kmers) >= alt_ref.kmer_width: + plot_center = len(fwd_kmers) // 2 + fwd_kmer_poss = list(zip( + fwd_kmers[plot_center - alt_ref.central_pos - 1: + plot_center + alt_ref.kmer_width - + alt_ref.central_pos], + range(alt_ref.kmer_width - 1, -1, -1))) + reg_fwd_alt_data = [ + (reg_data.start + (plot_center - pos + alt_ref.central_pos), + alt_ref.means[(kmer, pos)], + alt_ref.sds[(kmer, pos)]) + for kmer, pos in fwd_kmer_poss] if all( + kmer_pos in alt_ref.means + for kmer_pos in fwd_kmer_poss) else [] + rev_kmer_poss = list(zip( + rev_kmers[plot_center - alt_ref.central_pos - 1: + plot_center + alt_ref.kmer_width - + alt_ref.central_pos], + range(alt_ref.kmer_width - 1, -1, -1))) + reg_rev_alt_data = [ + (reg_data.end - (plot_center - pos + alt_ref.central_pos) - 1, + alt_ref.means[(kmer, pos)], + alt_ref.sds[(kmer, pos)]) + for kmer, pos in rev_kmer_poss] if all( + kmer_pos in alt_ref.means + for kmer_pos in rev_kmer_poss) else [] all_reg_alt_model_data.append(( reg_data.reg_id, reg_data.strand, - [(reg_data.start + pos, alt_ref.means[kmer], - alt_ref.sds[kmer]) - for pos, kmer in enumerate(fwd_kmers) - if not th.invalid_seq(kmer)], - [(reg_data.end - pos - 1, alt_ref.means[kmer], - alt_ref.sds[kmer]) - for pos, kmer in enumerate(rev_kmers) - if not th.invalid_seq(kmer)])) + reg_fwd_alt_data, reg_rev_alt_data)) return all_reg_model_data, all_reg_alt_model_data @@ -1618,12 +1806,15 @@ def get_motif_locs(covered_chrms): if chrm not in covered_chrms: continue seq = genome_index.get_seq(chrm) 
for motif_loc in motif.motif_pat.finditer(seq): - motif_locs.append((chrm, motif_loc.start(), '+' - if not motif.is_palindrome else None)) + motif_locs.append(( + chrm, motif_loc.start() + (motif.motif_len // 2) - 1, '+' + if not motif.is_palindrome else None)) # search over negative strand as well if not palindromic if not motif.is_palindrome: for motif_loc in motif.rev_comp_pat.finditer(seq): - motif_locs.append((chrm, motif_loc.start(), '-')) + motif_locs.append(( + chrm, motif_loc.start() + (motif.motif_len // 2) - 1, + '-')) if len(motif_locs) == 0: th.error_message_and_exit( @@ -1861,15 +2052,16 @@ def plot_motif_centered_signif( def get_stat_pos(start, chrm, strand): # need to handle forward and reverse strand stats separately since # reverse strand stats are in reverse order wrt motif - reg_pos_fracs = [] + reg_pos_stats = [] for pos in range(start, start + plot_width): - pos_frac = all_stats.get_pos_frac(chrm, strand, pos) - if pos_frac is not None: - reg_pos_fracs.append((pos - start if strand == '+' else - -1 * (pos - start - plot_width + 1), - pos_frac)) + pos_stat = all_stats.get_pos_stat(chrm, strand, pos) + if pos_stat is not None: + reg_pos_stats.append(( + pos - start if strand == '+' else + -1 * (pos - start - plot_width + 1), + pos_stat)) - return reg_pos_fracs + return reg_pos_stats if VERBOSE: th.status_message('Getting all regions statistics.') stat_locs = [ @@ -2144,21 +2336,44 @@ def plot_main(args): kwargs = dict(fasta_opt + [('pdf_fn', args.pdf_filename), ('motif_descs', args.motif_descriptions), + ('mod_locs_fns', args.modified_locations), + ('unmod_locs_fns', args.unmodified_locations), ('stats_fns', args.statistics_filenames), ('stats_per_block', args.statistics_per_block), ('total_stats_limit', args.total_statistics_limit)]) plot_roc(**kwargs) + elif args.action_command == 'sample_compare_roc': + kwargs = dict(fasta_opt + + [('pdf_fn', args.pdf_filename), + ('motif_descs', args.motif_descriptions), + ('stats_fns', 
args.statistics_filenames), + ('ctrl_fns', args.control_statistics_filenames), + ('stats_per_block', args.statistics_per_block), + ('total_stats_limit', args.total_statistics_limit)]) + plot_ctrl_samp_roc(**kwargs) elif args.action_command == 'per_read_roc': kwargs = dict(fasta_opt + [('pdf_fn', args.pdf_filename), ('motif_descs', args.motif_descriptions), + ('mod_locs_fns', args.modified_locations), + ('unmod_locs_fns', args.unmodified_locations), ('pr_stats_fns', args.per_read_statistics_filenames), ('stats_per_block', args.statistics_per_block), ('total_stats_limit', args.total_statistics_limit)]) plot_per_read_roc(**kwargs) + elif args.action_command == 'sample_compare_per_read_roc': + kwargs = dict(fasta_opt + + [('pdf_fn', args.pdf_filename), + ('motif_descs', args.motif_descriptions), + ('pr_stats_fns', args.per_read_statistics_filenames), + ('pr_ctrl_fns', + args.per_read_control_statistics_filenames), + ('stats_per_block', args.statistics_per_block), + ('total_stats_limit', args.total_statistics_limit)]) + plot_ctrl_samp_per_read_roc(**kwargs) else: th.error_message_and_exit('Invalid tombo sub-command entered. 
' + - 'Should have been caught by argparse.') + 'Should have been caught by argparse.') return diff --git a/tombo/_preprocess.py b/tombo/_preprocess.py index 1fca27e..247e3de 100644 --- a/tombo/_preprocess.py +++ b/tombo/_preprocess.py @@ -187,7 +187,7 @@ def update_warn(warn_val): elif warn_val == _WARN_IO_VAL: if VERBOSE and not been_warned[_WARN_IO_VAL]: bar.write( - _WARN_PREFIX + 'Some read files that could not be accessed.', + _WARN_PREFIX + 'Some read files could not be accessed.', file=sys.stderr) been_warned[_WARN_IO_VAL] = True elif warn_val == _WARN_MISMATCH_VAL: diff --git a/tombo/_text_output_commands.py b/tombo/_text_output_commands.py index 881b1af..8497dc1 100644 --- a/tombo/_text_output_commands.py +++ b/tombo/_text_output_commands.py @@ -43,12 +43,17 @@ POS_SLOT = 'pos' FRAC_SLOT = 'frac' DFRAC_SLOT = 'damp_frac' +STAT_SLOT = 'stat' VCOV_SLOT = 'valid_cov' + FRAC_WIG_TYPE = 'fraction' DFRAC_WIG_TYPE = 'dampened_fraction' +STAT_WIG_TYPE = 'statistic' VCOV_WIG_TYPE = 'valid_coverage' + FRAC_WIG_NAME = 'fraction_modified_reads' DFRAC_WIG_NAME = 'dampened_fraction_modified_reads' +STAT_WIG_NAME = 'statistic' VCOV_WIG_NAME = 'valid_coverage' @@ -87,77 +92,138 @@ def _write_cs_int_data(wig_fp, chrm, cs_poss, cs_vals): return -def write_frac_wigs(all_stats, wig_base, do_frac, do_damp, do_valid_cov): +def write_frac_wigs( + all_stats, wig_base, do_frac, do_damp, do_stats, do_vcov, fasta_fn, + motif_descs): if VERBOSE: th.status_message('Parsing and outputting statistics wiggles.') + filter_motifs = not(fasta_fn is None or motif_descs is None) + if filter_motifs: + genome_index = th.Fasta(fasta_fn) + motifs = [ + motif for motif in th.parse_motif_descs('::'.join(motif_descs))] + else: + motifs = [(None, ''),] + if do_frac: - plus_frac_fp, minus_frac_fp = open_browser_files( - wig_base, '', FRAC_WIG_NAME) + frac_fps = dict( + kv for _, mod_name in motifs + for kv in zip( + (('+', mod_name), ('-', mod_name)), + open_browser_files(wig_base, mod_name, 
FRAC_WIG_NAME))) if do_damp: - plus_damp_fp, minus_damp_fp = open_browser_files( - wig_base, '', DFRAC_WIG_NAME) - if do_valid_cov: - plus_vcov_fp, minus_vcov_fp = open_browser_files( - wig_base, '', VCOV_WIG_NAME) - - (curr_chrm, curr_strand, curr_poss, curr_fracs, curr_damp_fracs, - curr_valid_cov) = (None, None, [], [], [], []) + damp_fps = dict( + kv for _, mod_name in motifs + for kv in zip( + (('+', mod_name), ('-', mod_name)), + open_browser_files(wig_base, mod_name, DFRAC_WIG_NAME))) + if do_stats: + stat_fps = dict( + kv for _, mod_name in motifs + for kv in zip( + (('+', mod_name), ('-', mod_name)), + open_browser_files(wig_base, mod_name, STAT_WIG_NAME))) + if do_vcov: + vcov_fps = dict( + kv for _, mod_name in motifs + for kv in zip( + (('+', mod_name), ('-', mod_name)), + open_browser_files(wig_base, mod_name, VCOV_WIG_NAME))) + + + def write_cs_stats( + curr_chrm, curr_strand, curr_poss, + curr_fracs, curr_dampf, curr_stats, curr_vcov): + curr_poss = np.concatenate(curr_poss) + if do_frac: curr_fracs = np.concatenate(curr_fracs) + if do_damp: curr_dampf = np.concatenate(curr_dampf) + if do_stats: curr_stats = np.concatenate(curr_stats) + if do_vcov: curr_vcov = np.concatenate(curr_vcov) + + if filter_motifs: + chrm_seq = genome_index.get_seq(curr_chrm) + for motif, mod_name in motifs: + if do_frac: + frac_fp = frac_fps[(curr_strand, mod_name)] + motif_fracs = curr_fracs + if do_damp: + damp_fp = damp_fps[(curr_strand, mod_name)] + motif_damp = curr_dampf + if do_stats: + stat_fp = stat_fps[(curr_strand, mod_name)] + motif_stats = curr_stats + if do_vcov: + vcov_fp = vcov_fps[(curr_strand, mod_name)] + motif_vcov = curr_vcov + m_curr_poss = curr_poss + if filter_motifs: + if curr_strand == '-': + chrm_motif_poss = np.array([ + m.start() + motif.motif_len - motif.mod_pos + for m in motif.rev_comp_pat.finditer(chrm_seq)]) + else: + chrm_motif_poss = np.array([ + m.start() + motif.mod_pos - 1 + for m in motif.motif_pat.finditer(chrm_seq)]) + valid_poss = 
np.isin( + curr_poss, chrm_motif_poss, assume_unique=True) + m_curr_poss = curr_poss[valid_poss] + if do_frac: motif_fracs = curr_fracs[valid_poss] + if do_damp: motif_damp = curr_dampf[valid_poss] + if do_stats: motif_stats = curr_stats[valid_poss] + if do_vcov: motif_vcov = curr_vcov[valid_poss] + + # write current chrm/strand data + if do_frac: + _write_cs_data(frac_fp, curr_chrm, m_curr_poss, motif_fracs) + if do_damp: + _write_cs_data(damp_fp, curr_chrm, m_curr_poss, motif_damp) + if do_stats: + _write_cs_data(stat_fp, curr_chrm, m_curr_poss, motif_stats) + if do_vcov: + _write_cs_int_data(vcov_fp, curr_chrm, m_curr_poss, motif_vcov) + + return + + + (curr_chrm, curr_strand, curr_poss, curr_fracs, curr_dampf, curr_stats, + curr_vcov) = (None, None, [], [], [], [], []) for chrm, strand, start, end, block_stats in all_stats: if chrm != curr_chrm or strand != curr_strand: if len(curr_poss) > 0: - curr_poss = np.concatenate(curr_poss) - # write current chrm/strand data - if do_frac: - wig_fp = plus_frac_fp if curr_strand == '+' else minus_frac_fp - _write_cs_data(wig_fp, curr_chrm, curr_poss, - np.concatenate(curr_fracs)) - if do_damp: - wig_fp = plus_damp_fp if curr_strand == '+' else minus_damp_fp - _write_cs_data(wig_fp, curr_chrm, curr_poss, - np.concatenate(curr_damp_fracs)) - if do_valid_cov: - wig_fp = plus_vcov_fp if curr_strand == '+' else minus_vcov_fp - _write_cs_int_data(wig_fp, curr_chrm, curr_poss, - np.concatenate(curr_valid_cov)) + write_cs_stats( + curr_chrm, curr_strand, curr_poss, curr_fracs, + curr_dampf, curr_stats, curr_vcov) # set new chrm and strand and empty lists curr_chrm, curr_strand = chrm, strand - curr_poss, curr_fracs, curr_damp_fracs, curr_valid_cov = ( - [], [], [], []) + curr_poss, curr_fracs, curr_dampf, curr_stats, curr_vcov = ( + [], [], [], [], []) # store block statistics curr_poss.append(block_stats[POS_SLOT]) - if do_frac: - curr_fracs.append(1 - block_stats[FRAC_SLOT]) - if do_damp: - curr_damp_fracs.append(1 - 
block_stats[DFRAC_SLOT]) - if do_valid_cov: - curr_valid_cov.append(block_stats[VCOV_SLOT]) + if do_frac: curr_fracs.append(1 - block_stats[FRAC_SLOT]) + if do_damp: curr_dampf.append(1 - block_stats[DFRAC_SLOT]) + if do_stats: curr_stats.append(all_stats._stat_transform(block_stats)) + if do_vcov: curr_vcov.append(block_stats[VCOV_SLOT]) # write last chrm/strand data if len(curr_poss) > 0: - curr_poss = np.concatenate(curr_poss) - if do_frac: - wig_fp = plus_frac_fp if curr_strand == '+' else minus_frac_fp - _write_cs_data(wig_fp, curr_chrm, curr_poss, - np.concatenate(curr_fracs)) - if do_damp: - wig_fp = plus_damp_fp if curr_strand == '+' else minus_damp_fp - _write_cs_data(wig_fp, curr_chrm, curr_poss, - np.concatenate(curr_damp_fracs)) - if do_valid_cov: - wig_fp = plus_vcov_fp if curr_strand == '+' else minus_vcov_fp - _write_cs_int_data(wig_fp, curr_chrm, curr_poss, - np.concatenate(curr_valid_cov)) + write_cs_stats( + curr_chrm, curr_strand, curr_poss, + curr_fracs, curr_dampf, curr_stats, curr_vcov) if do_frac: - plus_frac_fp.close() - minus_frac_fp.close() + for wig_fp in frac_fps.values(): + wig_fp.close() if do_damp: - plus_damp_fp.close() - minus_damp_fp.close() - if do_valid_cov: - plus_vcov_fp.close() - minus_vcov_fp.close() + for wig_fp in damp_fps.values(): + wig_fp.close() + if do_stats: + for wig_fp in stat_fps.values(): + wig_fp.close() + if do_vcov: + for wig_fp in vcov_fps.values(): + wig_fp.close() return @@ -255,7 +321,7 @@ def write_cov_wig(reads_index, out_base, group_text): def write_all_browser_files( fast5s_dirs, ctrl_fast5s_dirs, corr_grp, bc_subgrps, - stats_fn, wig_base, wig_types): + stats_fn, wig_base, wig_types, motif_descs, fasta_fn): if fast5s_dirs is not None: reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) if reads_index.is_empty(): @@ -301,13 +367,20 @@ def write_all_browser_files( reads_index, chrm_sizes, wig_base, CTRL_NAME, DWELL_WIG_TYPE, DWELL_SLOT) if any(wig_type in wig_types for wig_type in ( - 
FRAC_WIG_TYPE, DFRAC_WIG_TYPE, VCOV_WIG_TYPE)): + FRAC_WIG_TYPE, DFRAC_WIG_TYPE, STAT_WIG_TYPE, VCOV_WIG_TYPE)): if VERBOSE: th.status_message('Loading statistics from file.') all_stats = ts.TomboStats(stats_fn) - write_frac_wigs(all_stats, wig_base, - FRAC_WIG_TYPE in wig_types, - DFRAC_WIG_TYPE in wig_types, - VCOV_WIG_TYPE in wig_types) + if all_stats.is_model_stats and any(( + FRAC_WIG_TYPE in wig_types, DFRAC_WIG_TYPE in wig_types, + VCOV_WIG_TYPE in wig_types)): + th.TomboError('Cannot output fraction, dampened_fraction or ' + + 'valid_coverage for LevelStats statistics.') + if not all_stats.is_model_stats and STAT_WIG_TYPE in wig_types: + th.TomboError('Cannot output stat for ModelStats statistics.') + write_frac_wigs( + all_stats, wig_base, FRAC_WIG_TYPE in wig_types, + DFRAC_WIG_TYPE in wig_types, STAT_WIG_TYPE in wig_types, + VCOV_WIG_TYPE in wig_types, motif_descs, fasta_fn) return @@ -361,7 +434,7 @@ def _browser_files_main(args): 'Must provide a fast5 basedir to output signal, difference, ' + 'coverage, signal_sd and/or length browser files.') if (any(wig_type in args.file_types for wig_type in ( - FRAC_WIG_TYPE, DFRAC_WIG_TYPE, VCOV_WIG_TYPE)) and + FRAC_WIG_TYPE, DFRAC_WIG_TYPE, STAT_WIG_TYPE, VCOV_WIG_TYPE)) and args.statistics_filename is None): th.error_message_and_exit( 'Must provide a statistics filename to output ' + @@ -380,7 +453,8 @@ def _browser_files_main(args): write_all_browser_files( args.fast5_basedirs, args.control_fast5_basedirs, args.corrected_group, args.basecall_subgroups, args.statistics_filename, - args.browser_file_basename, args.file_types) + args.browser_file_basename, args.file_types, args.genome_fasta, + args.motif_descriptions) return diff --git a/tombo/_version.py b/tombo/_version.py index 62fdcfc..336236f 100644 --- a/tombo/_version.py +++ b/tombo/_version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -TOMBO_VERSION = '1.4' +TOMBO_VERSION = '1.5' diff --git a/tombo/resquiggle.py b/tombo/resquiggle.py index 
7bfb473..e202c58 100644 --- a/tombo/resquiggle.py +++ b/tombo/resquiggle.py @@ -90,14 +90,16 @@ # fit debug plot requires r cowplot package to be installed _DEBUG_FIT = False _DEBUG_START_CLIP_FIT = False +# raw signal re-squiggle DP +_DEBUG_RAW_DP = False # don't plot more than one debug type at a time assert sum(( _DEBUG_DP_ENDS, _DEBUG_FIT, _DEBUG_START_CLIP_FIT, - _DEBUG_DP_START, _DEBUG_CLIP_START)) <= 1 + _DEBUG_DP_START, _DEBUG_CLIP_START, _DEBUG_RAW_DP)) <= 1 _DEBUG_PLOTTING = any(( _DEBUG_FIT, _DEBUG_START_CLIP_FIT, _DEBUG_DP_ENDS, _DEBUG_DP_START, - _DEBUG_CLIP_START)) + _DEBUG_CLIP_START, _DEBUG_RAW_DP)) _DRY_RUN = any(( _DEBUG_PARAMS, _DEBUG_BANDWIDTH, _DEBUG_START_BANDWIDTH, _DEBUG_PLOTTING)) @@ -118,6 +120,7 @@ def _write_params_debug( '\t'.join(map(str, ( rsqgl_params.running_stat_width, rsqgl_params.min_obs_per_base, + rsqgl_params.raw_min_obs_per_base, rsqgl_params.mean_obs_per_event, rsqgl_params.match_evalue, rsqgl_params.skip_pen, @@ -196,6 +199,54 @@ def _debug_plot_dp( return +def _debug_raw_dp(z_scores, fwd_pass, read_tb, sig_data, reg_id='0'): + reg_id = unicode(reg_id) + + event_poss, seq_poss, r_z_scores, fwd_scores = [], [], [], [] + for seq_pos, ((s_z_data, (b_e_start, b_e_end)), s_f_data) in enumerate(zip( + z_scores, map(itemgetter(0), fwd_pass))): + for band_pos, score in enumerate(s_z_data): + r_z_scores.append(score) + event_poss.append(band_pos + b_e_start) + seq_poss.append(seq_pos) + for band_pos, score in enumerate(s_f_data): + fwd_scores.append(score) + + zDat = r.DataFrame({ + 'Score':r.FloatVector(r_z_scores), + 'EventPos':r.IntVector(event_poss), + 'SeqPos':r.IntVector(seq_poss), + 'Region':r.StrVector([reg_id,] * len(seq_poss))}) + fwdDat = r.DataFrame({ + 'Score':r.FloatVector(fwd_scores), + 'EventPos':r.IntVector(event_poss), + 'SeqPos':r.IntVector(seq_poss), + 'Region':r.StrVector([reg_id,] * len(seq_poss))}) + + event_poss, seq_poss = [0,], [0,] + for seq_pos, event_pos in enumerate(read_tb): + 
event_poss.append(event_pos - 1) + seq_poss.append(seq_pos) + event_poss.append(event_pos) + seq_poss.append(seq_pos + 1) + event_poss.append(z_scores[-1][1][1] - 1) + seq_poss.append(read_tb.shape[0]) + + tbDat = r.DataFrame({ + 'EventPos':r.IntVector(event_poss), + 'SeqPos':r.IntVector(seq_poss), + 'Region':r.StrVector([reg_id,] * len(seq_poss))}) + + sigDat = r.DataFrame({ + 'Pos':r.IntVector(list(range(sig_data.shape[0]))), + 'Signal':r.FloatVector(sig_data) + }) + + r.r(resource_string(__name__, 'R_scripts/debugRawDP.R').decode()) + r.globalenv[str('plotRawDP')](zDat, fwdDat, tbDat, sigDat) + + return + def _debug_fit( fwd_pass_move, band_event_starts, top_max_pos, z_scores, reg_id, final_score, bandwidth, event_means, r_ref_means, @@ -270,6 +321,9 @@ def _open_debug_pdf(): elif _DEBUG_START_CLIP_FIT: importr(str('cowplot')) r.r('pdf("debug_event_align.start_clip_fit.pdf", width=15, height=5)') + elif _DEBUG_RAW_DP: + importr(str('cowplot')) + r.r('pdf("debug_event_align.raw_dp.pdf", width=11, height=7)') else: th.error_message_and_exit('Must specify which debug plot to open.') @@ -327,7 +381,6 @@ def raw_traceback(reg_fwd_scores, min_obs_per_base): # initilize array to store new segments new_segs = np.empty(len(reg_fwd_scores) - 1, dtype=np.int64) # get first two bases of data for lookups - curr_base_sig = 1 curr_b_data, _, (curr_start, curr_end) = reg_fwd_scores[-1] next_b_data, _, (next_start, next_end) = reg_fwd_scores[-2] new_segs[-1] = c_base_traceback( @@ -379,7 +432,7 @@ def window_too_small(start, end): # windows are expanded by one base and the extra signal factor # to allow some room to search for best path return sig_len <= ((n_events + 1) * - rsqgl_params.min_obs_per_base) * extra_sig_factor + rsqgl_params.raw_min_obs_per_base) * extra_sig_factor def expand_small_windows(all_del_windows): expanded_del_windows = [] @@ -458,14 +511,17 @@ def get_deletion_windows(): reg_z_scores = c_reg_z_scores( norm_signal[sig_start:sig_end], 
dp_res.ref_means[start:end], dp_res.ref_sds[start:end], pseudo_starts, - 0, n_events, n_events, rsqgl_params.min_obs_per_base, + 0, n_events, n_events, rsqgl_params.raw_min_obs_per_base, max_half_z_score=rsqgl_params.max_half_z_score) reg_fwd_scores = raw_forward_pass( - reg_z_scores, rsqgl_params.min_obs_per_base) + reg_z_scores, rsqgl_params.raw_min_obs_per_base) # perform signal based scoring segmentation # - it is ~60X faster than base space reg_segs = raw_traceback( - reg_fwd_scores, rsqgl_params.min_obs_per_base) + sig_start + reg_fwd_scores, rsqgl_params.raw_min_obs_per_base) + sig_start + if _DEBUG_RAW_DP: + _debug_raw_dp(reg_z_scores, reg_fwd_scores, reg_segs - sig_start, + norm_signal[sig_start:sig_end]) if reg_segs.shape[0] != end - start - 1: raise th.TomboError('Invalid segmentation results.') resolved_segs[start+1:end] = reg_segs @@ -669,7 +725,7 @@ def find_seq_start_in_events( if _DEBUG_DP_START: _debug_plot_dp( start_z_scores, start_fwd_pass, start_band_event_starts, - start_fwd_pass_move, top_max_pos, reg_id=reg_id) + start_fwd_pass_move, top_max_pos, reg_id=reg_id, short=True) if _DEBUG_START_BANDWIDTH: _debug_fit( start_fwd_pass_move, start_band_event_starts, top_max_pos, @@ -725,7 +781,7 @@ def find_seq_start_from_clip_basecalls( start_genome_seq = genome_seq[ std_ref.central_pos:num_genome_bases + dnstrm_bases] start_seq = start_clip_bases + start_genome_seq - r_ref_means, r_ref_sds, _, _ = ts.get_ref_from_seq(start_seq, std_ref) + r_ref_means, r_ref_sds = std_ref.get_exp_levels_from_seq(start_seq) seq_len = r_ref_means.shape[0] # now find full sequence to events path using a smaller bandwidth @@ -893,7 +949,8 @@ def plot_debug(shifted_z_scores): return - # if start clip bases are provided, run better start identification algorithm + # if start clip bases are provided, run "cliped bases" start + # identification algorithm if (start_clip_bases is not None and len(genome_seq) > start_clip_params.num_genome_bases): if len(start_clip_bases) < 
std_ref.central_pos: @@ -907,7 +964,7 @@ def plot_debug(shifted_z_scores): start_clip_params.num_genome_bases, reg_id=reg_id) dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 - r_ref_means, r_ref_sds, _, _ = ts.get_ref_from_seq(genome_seq, std_ref) + r_ref_means, r_ref_sds = std_ref.get_exp_levels_from_seq(genome_seq) # trim genome seq to match model-able positions genome_seq = genome_seq[std_ref.central_pos:-dnstrm_bases] seq_len = len(genome_seq) @@ -998,7 +1055,7 @@ def segment_signal( """Normalize and segment raw signal as defined by `rsqgl_params` into `num_events`. Args: - map_res (:class:`tombo.tombo_helper.resquiggleResults`): containing mapping results + map_res (:class:`tombo.tombo_helper.resquiggleResults`): containing mapping results only (attributes after ``read_start_rel_to_raw`` will all be ``None``) num_events (int): number of events to process rsqgl_params (:class:`tombo.tombo_helper.resquiggleParams`): parameters for the re-squiggle algorithm outlier_thresh (float): windsorize signal greater than this value (optional) @@ -1079,7 +1136,7 @@ def resquiggle_read( seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): sequencing sample type (default: DNA) Returns: - :class:`tombo.tombo_helper.resquiggleResults` containing raw signal to genome sequence alignment + :class:`tombo.tombo_helper.resquiggleResults` containing raw signal to genome sequence alignment (note that ``raw_signal`` now contains trimmed and normalized raw signal) """ if all_raw_signal is not None: map_res = map_res._replace(raw_signal = all_raw_signal) @@ -1232,7 +1289,7 @@ def map_read( q_score_thresh (float): basecalling mean q-score threshold (optional; default: 0/no filtering) Returns: - :class:`tombo.tombo_helper.resquiggleResults` containing valid mapping values + :class:`tombo.tombo_helper.resquiggleResults` containing valid mapping values (signal to sequence assignment attributes will be ``None``) """ seq_data = get_read_seq( fast5_data, bc_grp, bc_subgrp, 
seq_samp_type, q_score_thresh) @@ -1297,8 +1354,8 @@ def map_read( genome_seq = genome_seq.decode() if strand == '-': genome_seq = th.rev_comp(genome_seq) - # discordant mapping to sequence extraction is due to reads mapping up to the - # end of a seqeunce record (and don't need to carry around record lens), + # discordant mapping to sequence extraction is due to reads mapping up to + # the end of a seqeunce record (and don't need to carry around record lens), # so don't error on these discordant lengths here #if len(genome_seq) != ref_end - ref_start + std_ref.kmer_width - 1: # raise th.TomboError('Discordant mapped position and sequence') @@ -1524,7 +1581,8 @@ def _resquiggle_worker(*args): def _io_and_mappy_thread_worker( fast5_q, progress_q, failed_reads_q, index_q, bc_grp, bc_subgrps, corr_grp, aligner, outlier_thresh, compute_sd, sig_match_thresh, - obs_filter, seq_samp_type, overwrite, map_conn, q_score_thresh, std_ref): + obs_filter, seq_samp_type, overwrite, map_conn, q_score_thresh, + std_ref): # increase update interval as more reads are provided proc_update_interval = 1 def update_progress(num_processed, proc_update_interval): @@ -1576,6 +1634,8 @@ def update_progress(num_processed, proc_update_interval): aligner, seq_samp_type, map_thr_buf, fast5_fn, num_processed, map_conn, outlier_thresh, compute_sd, obs_filter, index_q, q_score_thresh, sig_match_thresh, std_ref) + except th.TomboError as e: + failed_reads_q.put((str(e), fast5_fn, True)) finally: try: fast5_data.close() @@ -1603,7 +1663,8 @@ def format_fail_summ(header, fail_summ=[], num_proc=0, num_errs=None): (None, '') for _ in range(num_errs - len(summ_errs))]) errs_str = '\n'.join( "{:8.1f}% ({:>7} reads)".format(100 * n_fns / float(num_proc), - n_fns) + " : " + '{:<80}'.format(err) + n_fns) + " : " + '{:<80}'.format( + err) if (n_fns is not None and num_proc > 0) else ' -----' for n_fns, err in summ_errs) return '\n'.join((header, errs_str)) @@ -1879,7 +1940,8 @@ def _resquiggle_main(args): 
"""Main method for resquiggle """ if args.processes > 1 and _DEBUG_PLOTTING: - th.error_message_and_exit('Cannot run multiple processes and debugging.') + th.error_message_and_exit( + 'Cannot run multiple processes and debugging.') if _DEBUG_PLOTTING: th.warning_message( 'Producing de-bug plotting output. Can be very slow and should ' + @@ -1915,7 +1977,8 @@ def _resquiggle_main(args): if VERBOSE: th.status_message('Loading minimap2 reference.') # to be enabled when mappy genome sequence extraction bug is fixed - aligner = mappy.Aligner(str(args.reference), preset=str('map-ont'), best_n=1) + aligner = mappy.Aligner( + str(args.reference), preset=str('map-ont'), best_n=1) if not aligner: th.error_message_and_exit( 'Failed to load reference genome FASTA for mapping.') diff --git a/tombo/tests/shell_tests.sh b/tombo/tests/shell_tests.sh index aa19e5e..55a0d17 100755 --- a/tombo/tests/shell_tests.sh +++ b/tombo/tests/shell_tests.sh @@ -14,6 +14,9 @@ mmiFn="e_coli.K12.NEB5alpha.mmi" genomeLocs='"CP017100.1:1505285" "CP017100.1:2873680"' strandGenomeLocs='"CP017100.1:1505285:+" "CP017100.1:2873680:-"' +modLocsFn='modified_positions.bed' +unmodLocsFn='unmodified_positions.bed' + runHelps=false runResquiggle=true @@ -35,7 +38,8 @@ tombo filter genome_locations -h tombo detect_modifications de_novo -h tombo detect_modifications alternative_model -h -tombo detect_modifications sample_compare -h +tombo detect_modifications model_sample_compare -h +tombo detect_modifications level_sample_compare -h tombo detect_modifications aggregate_per_read_stats -h tombo text_output browser_files -h @@ -159,7 +163,7 @@ tombo plot motif_centered --fast5-basedirs $natDir --motif ATC \ --genome-fasta $genomeFn \ --num-bases 21 --overplot-threshold 1000 --deepest-coverage \ --pdf-filename testing.motif_centered.deepest.1_samp.pdf -tombo plot motif_centered --fast5-basedirs $natDir --motif CCWGG \ +tombo plot motif_centered --fast5-basedirs $natDir --motif NNCCWGG \ --plot-alternate-model 5mC 
--genome-fasta $genomeFn \ --num-bases 21 --overplot-threshold 1000 \ --pdf-filename testing.motif_centered.w_model.pdf @@ -204,7 +208,11 @@ rm test_stats.de_novo.tombo.stats test_stats.2samp.tombo.stats \ test_stats.alt_default_model.5mC.tombo.per_read_stats \ test_stats.alt_default_model.6mA.tombo.per_read_stats \ test_standard.model test_stats.de_novo.new_thresh.tombo.stats \ - test_alt.model test_alt.use_densities.model + test_alt.model test_alt.use_densities.model \ + test_stats.alt_native.motif.dcm.tombo.stats \ + test_stats.alt_native.motif.dam.tombo.stats \ + test_stats.alt_amp_samp.motif.dcm.tombo.stats \ + test_stats.alt_amp_samp.motif.dam.tombo.stats tombo detect_modifications de_novo --fast5-basedirs $natDir \ --minimum-test-reads 5 \ --statistics-file-basename test_stats.de_novo \ @@ -213,16 +221,20 @@ tombo detect_modifications de_novo --fast5-basedirs $natDir \ --minimum-test-reads 5 --single-read-threshold 0.1 0.75 \ --statistics-file-basename test_stats.de_novo.two_way_thresh \ --per-read-statistics-basename test_stats.de_novo.two_way_thresh -tombo detect_modifications sample_compare --fast5-basedirs $natDir \ +tombo detect_modifications model_sample_compare --fast5-basedirs $natDir \ --control-fast5-basedirs $ampDir \ --minimum-test-reads 5 --sample-only-estimates \ --statistics-file-basename test_stats.2samp \ --per-read-statistics-basename test_stats.2samp -tombo detect_modifications sample_compare --fast5-basedirs $natDir \ +tombo detect_modifications model_sample_compare --fast5-basedirs $natDir \ --control-fast5-basedirs $ampDir \ --minimum-test-reads 5 \ --statistics-file-basename test_stats.2samp_w_post \ --per-read-statistics-basename test_stats.2samp_w_post +tombo detect_modifications level_sample_compare --fast5-basedirs $natDir \ + --alternate-fast5-basedirs $ampDir \ + --minimum-test-reads 5 \ + --statistics-file-basename test_stats.2samp_levels tombo detect_modifications alternative_model --fast5-basedirs $natDir \ --alternate-bases 
5mC 6mA \ --statistics-file-basename test_stats.alt_default_model \ @@ -231,7 +243,9 @@ tombo detect_modifications alternative_model --fast5-basedirs $natDir \ --tombo-model-filename $nrModFn \ --alternate-model-filenames $altModFn \ --statistics-file-basename test_stats.alt_model \ - --per-read-statistics-basename test_stats.alt_model + --per-read-statistics-basename test_stats.alt_model + +printf "\n\n********* Testing model estimation. **********\n" tombo build_model estimate_reference --fast5-basedirs $natDir \ --tombo-model-filename test_standard.model \ --upstream-bases 1 --downstream-bases 1 --minimum-kmer-observations 1 @@ -248,6 +262,10 @@ tombo build_model estimate_alt_reference \ --alternate-model-filename test_alt.use_densities.model \ --alternate-model-name 5mC --alternate-model-base C \ --minimum-kmer-observations 1 +tombo build_model estimate_motif_alt_reference --fast5-basedirs $natDir \ + --alternate-model-filename test_alt.motif_model \ + --alternate-model-name dcm --motif-description CCWGG:2 \ + --minimum-kmer-observations 1 printf "\n\n********* Testing aggregate per-read stats **********\n" tombo detect_modifications aggregate_per_read_stats --minimum-test-reads 5 \ @@ -259,12 +277,14 @@ printf "\n\n********* Testing ROC and Precision-Recall plotting **********\n" tombo plot roc --genome-fasta e_coli.K12.NEB5alpha.fasta \ --statistics-filenames test_stats.2samp.tombo.stats \ test_stats.2samp_w_post.tombo.stats \ + test_stats.2samp_levels.tombo.stats \ test_stats.alt_default_model.5mC.tombo.stats \ test_stats.alt_default_model.6mA.tombo.stats \ test_stats.de_novo.tombo.stats test_stats.de_novo.new_thresh.tombo.stats \ --motif-descriptions \ - CCWGG:2:"dcm 5mC Samp Comp"::GATC:2:"dam 6mA Samp Comp" \ + CCWGG:2:"dcm 5mC Samp Comp Model"::GATC:2:"dam 6mA Samp Comp Model" \ CCWGG:2:"dcm 5mC Samp Comp w/ post"::GATC:2:"dam 6mA Samp Comp w/ post" \ + CCWGG:2:"dcm 5mC Samp Comp Levels"::GATC:2:"dam 6mA Samp Comp Levels" \ CCWGG:2:"dcm 5mC Alt Test" 
GATC:2:"dam 6mA Alt Test" \ CCWGG:2:"dcm 5mC De Novo"::GATC:2:"dam 6mA De Novo" \ CCWGG:2:"dcm 5mC De Novo New Thresh"::GATC:2:"dam 6mA De Novo New Thresh" @@ -275,11 +295,77 @@ tombo plot per_read_roc --genome-fasta e_coli.K12.NEB5alpha.fasta \ test_stats.alt_default_model.5mC.tombo.per_read_stats \ test_stats.alt_default_model.6mA.tombo.per_read_stats \ test_stats.de_novo.tombo.per_read_stats --motif-descriptions \ - CCWGG:2:"dcm 5mC Samp Comp"::GATC:2:"dam 6mA Samp Comp" \ + CCWGG:2:"dcm 5mC Samp Comp Model"::GATC:2:"dam 6mA Samp Comp Model" \ CCWGG:2:"dcm 5mC Samp Comp w/ post"::GATC:2:"dam 6mA Samp Comp w/ post" \ CCWGG:2:"dcm 5mC Alt Test" GATC:2:"dam 6mA Alt Test" \ CCWGG:2:"dcm 5mC De Novo"::GATC:2:"dam 6mA De Novo" +printf "\n\n********* Testing Sample Compare ROC and Precision-Recall plotting **********\n" +tombo detect_modifications alternative_model --fast5-basedirs $natDir \ + --alternate-bases dam dcm \ + --statistics-file-basename test_stats.alt_native.motif \ + --per-read-statistics-basename test_stats.alt_native.motif +tombo detect_modifications alternative_model --fast5-basedirs $ampDir \ + --alternate-bases dam dcm \ + --statistics-file-basename test_stats.alt_amp_samp.motif \ + --per-read-statistics-basename test_stats.alt_amp_samp.motif +tombo detect_modifications de_novo --fast5-basedirs $ampDir \ + --minimum-test-reads 5 \ + --statistics-file-basename test_stats.de_novo.amp \ + --per-read-statistics-basename test_stats.de_novo.amp +tombo detect_modifications alternative_model --fast5-basedirs $ampDir \ + --alternate-bases 5mC 6mA \ + --statistics-file-basename test_stats.alt_default_model.amp \ + --per-read-statistics-basename test_stats.alt_default_model.amp +tombo plot sample_compare_roc --genome-fasta e_coli.K12.NEB5alpha.fasta \ + --statistics-filenames \ + test_stats.alt_native.motif.dcm.tombo.stats \ + test_stats.alt_native.motif.dam.tombo.stats \ + test_stats.de_novo.tombo.stats \ + test_stats.alt_default_model.5mC.tombo.stats \ + 
test_stats.alt_default_model.6mA.tombo.stats \ + --control-statistics-filenames \ + test_stats.alt_amp_samp.motif.dcm.tombo.stats \ + test_stats.alt_amp_samp.motif.dam.tombo.stats \ + test_stats.de_novo.amp.tombo.stats \ + test_stats.alt_default_model.amp.5mC.tombo.stats \ + test_stats.alt_default_model.amp.6mA.tombo.stats \ + --motif-descriptions CCWGG:2:"dcm 5mC Alt Test" \ + GATC:2:"dam 6mA Alt Test" \ + CCWGG:2:"dcm 5mC De novo"::GATC:2:"dam 6mA De novo" \ + CCWGG:2:"5mC all-context Alt Test" GATC:2:"6mA all-context Alt Test" \ + --pdf-filename test_stats.samp_comp_roc.pdf +tombo plot sample_compare_per_read_roc \ + --genome-fasta e_coli.K12.NEB5alpha.fasta \ + --per-read-statistics-filenames \ + test_stats.alt_native.motif.dcm.tombo.per_read_stats \ + test_stats.alt_native.motif.dam.tombo.per_read_stats \ + test_stats.de_novo.tombo.per_read_stats \ + test_stats.alt_default_model.5mC.tombo.per_read_stats \ + test_stats.alt_default_model.6mA.tombo.per_read_stats \ + --per-read-control-statistics-filenames \ + test_stats.alt_amp_samp.motif.dcm.tombo.per_read_stats \ + test_stats.alt_amp_samp.motif.dam.tombo.per_read_stats \ + test_stats.de_novo.amp.tombo.per_read_stats \ + test_stats.alt_default_model.amp.5mC.tombo.per_read_stats \ + test_stats.alt_default_model.amp.6mA.tombo.per_read_stats \ + --motif-descriptions CCWGG:2:"dcm 5mC Alt Test" \ + GATC:2:"dam 6mA Alt Test" \ + CCWGG:2:"dcm 5mC De novo"::GATC:2:"dam 6mA De novo" \ + CCWGG:2:"5mC all-context Alt Test" GATC:2:"6mA all-context Alt Test" \ + --pdf-filename test_stats.samp_comp_per_read_roc.pdf + +printf "\n\n********* Testing Known Site ROC and Precision-Recall plotting **********\n" +tombo plot roc \ + --statistics-filenames test_stats.alt_native.motif.dcm.tombo.stats \ + --modified-locations "dcm 5mC":$modLocsFn \ + --unmodified-locations $unmodLocsFn +tombo plot per_read_roc \ + --per-read-statistics-filenames \ + test_stats.alt_native.motif.dcm.tombo.per_read_stats \ + --modified-locations "dcm 
5mC":$modLocsFn \ + --unmodified-locations $unmodLocsFn + printf "\n\n********* Testing mutliple sample statistical testing genome-anchored plotting functions **********\n" tombo plot max_difference --fast5-basedirs $natDir \ --control-fast5-basedirs $ampDir \ @@ -290,6 +376,11 @@ tombo plot most_significant --fast5-basedirs $natDir \ --num-bases 21 --overplot-threshold 1000 \ --statistics-filename test_stats.2samp.tombo.stats \ --pdf-filename testing.most_signif.2samp.pdf +tombo plot most_significant --fast5-basedirs $natDir \ + --control-fast5-basedirs $ampDir \ + --num-bases 21 --overplot-threshold 1000 \ + --statistics-filename test_stats.2samp_levels.tombo.stats \ + --pdf-filename testing.most_signif.2samp_levels.pdf tombo plot most_significant --fast5-basedirs $natDir \ --control-fast5-basedirs $ampDir \ --num-bases 21 --overplot-threshold 1000 \ @@ -316,11 +407,16 @@ tombo plot motif_with_stats --fast5-basedirs $natDir \ --statistics-filename test_stats.2samp.tombo.stats \ --pdf-filename testing.motif_w_stats.2samp.pdf tombo plot motif_with_stats --fast5-basedirs $natDir \ - --plot-alternate-model 5mC --motif CCWGG --genome-fasta $genomeFn \ + --control-fast5-basedirs $ampDir --motif CCWGG \ + --genome-fasta $genomeFn --overplot-threshold 1000 \ + --statistics-filename test_stats.2samp_levels.tombo.stats \ + --pdf-filename testing.motif_w_stats.2samp_levels.pdf +tombo plot motif_with_stats --fast5-basedirs $natDir \ + --plot-alternate-model 5mC --motif NNCCWGG --genome-fasta $genomeFn \ --statistics-filename test_stats.alt_model.5mC.tombo.stats \ --pdf-filename testing.motif_w_stats.alt_model_5mC.pdf tombo plot motif_with_stats --fast5-basedirs $natDir \ - --plot-alternate-model 6mA --motif GATC --genome-fasta $genomeFn \ + --plot-alternate-model 6mA --motif NGATC --genome-fasta $genomeFn \ --statistics-filename test_stats.alt_default_model.6mA.tombo.stats \ --pdf-filename testing.motif_w_stats.alt_model_6mA.alt_dist.pdf @@ -418,6 +514,16 @@ tombo plot 
per_read --fast5-basedirs $natDir \ tombo plot per_read --genome-locations $genomeLocs --num-bases 101 \ --per-read-statistics-filename test_stats.alt_model.5mC.tombo.per_read_stats \ --pdf-filename testing.per_read.wo_seq.pdf +# plot locs that should cluster +tombo detect_modifications alternative_model --fast5-basedirs $ampDir $natDir \ + --alternate-bases dam dcm \ + --statistics-file-basename test_stats.alt_both_samp.motif \ + --per-read-statistics-basename test_stats.alt_both_samp.motif +tombo plot per_read --genome-locations $genomeLocs --num-bases 10001 \ + --per-read-statistics-filename \ + test_stats.alt_both_samp.motif.dcm.tombo.per_read_stats \ + --genome-fasta $genomeFn \ + --pdf-filename testing.per_read.w_alt_motif.pdf printf "\n\n********* Testing text output commands **********\n" tombo text_output signif_sequence_context --fast5-basedirs $natDir $ampDir \ @@ -434,9 +540,15 @@ tombo text_output browser_files --fast5-basedirs $natDir \ --control-fast5-basedirs $ampDir \ --file-types coverage fraction signal signal_sd dwell difference \ --statistics-filename test_stats.2samp.tombo.stats +tombo text_output browser_files --file-types statistic \ + --statistics-filename test_stats.2samp_levels.tombo.stats tombo text_output browser_files --file-types fraction dampened_fraction \ valid_coverage --statistics-filename \ test_stats.de_novo.two_way_thresh.tombo.stats +tombo text_output browser_files --file-types fraction dampened_fraction \ + valid_coverage --statistics-filename \ + test_stats.alt_default_model.5mC.tombo.stats \ + --browser-file-basename test_stats.alt_5mC printf "\n\n********* Testing other plotting commands **********\n" tombo plot cluster_most_significant --fast5-basedirs $natDir \ @@ -448,6 +560,11 @@ tombo plot cluster_most_significant --fast5-basedirs $natDir \ --genome-fasta $genomeFn --num-regions 100 \ --r-data-filename testing.cluster_data.RData \ --statistics-filename test_stats.2samp.tombo.stats +tombo plot cluster_most_significant 
--fast5-basedirs $natDir \ + --control-fast5-basedirs $ampDir \ + --genome-fasta $genomeFn --num-regions 100 \ + --statistics-filename test_stats.2samp_levels.tombo.stats \ + --pdf-filename testing.cluster_most_signif.2samp_levels.pdf tombo plot kmer --fast5-basedirs $natDir \ --num-kmer-threshold 0 \ --pdf-filename testing.kmer_dist.median.all_events.pdf diff --git a/tombo/tombo_helper.py b/tombo/tombo_helper.py index 7869e44..ef88ea5 100644 --- a/tombo/tombo_helper.py +++ b/tombo/tombo_helper.py @@ -17,7 +17,6 @@ import numpy as np np.seterr(all='raise') -from tqdm import tqdm from time import sleep from time import strftime from operator import itemgetter @@ -174,9 +173,9 @@ class scaleValues(namedtuple( class resquiggleParams(namedtuple( 'resquiggleParams', ('match_evalue', 'skip_pen', 'bandwidth', 'max_half_z_score', - 'running_stat_width', 'min_obs_per_base', 'mean_obs_per_event', - 'z_shift', 'stay_pen', 'use_t_test_seg', 'band_bound_thresh', - 'start_bw', 'start_save_bw', 'start_n_bases'))): + 'running_stat_width', 'min_obs_per_base', 'raw_min_obs_per_base', + 'mean_obs_per_event', 'z_shift', 'stay_pen', 'use_t_test_seg', + 'band_bound_thresh', 'start_bw', 'start_save_bw', 'start_n_bases'))): """Re-squiggle parameters Args: @@ -240,7 +239,7 @@ class resquiggleResults(namedtuple( genome_loc (:class:`tombo.tombo_helper.genomeLocation`): genome mapping location genome_seq (str): mapped genome sequence mean_q_score (float): mean basecalling q-score - raw_signal (np.array::np.float64): raw signal (optional) + raw_signal (np.array::np.float64): raw signal (i.e. 
def parse_locs_file(locs_fn):
    """Parse BED file containing genomic locations

    Locations are assumed to be single bases, so the end coordinate is
    ignored. Lines with fewer than six white-space separated fields, or
    with a start field that is not an integer (e.g. header lines), are
    skipped.

    Args:
        locs_fn (str): path to BED file

    Returns:
        Dictionary with (chrm, strand) keys and sorted ``np.array`` of
        0-based positions as values
    """
    raw_locs = defaultdict(set)
    with open(locs_fn) as locs_fp:
        for line in locs_fp:
            try:
                chrm, pos, _, _, _, strand = line.split()[:6]
                # bed specs indicate 0-based start so don't shift here
                pos = int(pos)
            except ValueError:
                # too few fields to unpack or non-integer start position;
                # only catch ValueError so that interrupts and genuine
                # errors are not silently swallowed
                continue
            raw_locs[(chrm, strand)].add(pos)

    return {cs: np.array(sorted(cs_poss))
            for cs, cs_poss in raw_locs.items()}
+ + Key into _partial_pats with: + 1) whether searching at start, end or within a short sequence + 2) length of the partial pattern + + Values are compiled partial pattern and mod_pos within pattern. + Short patterns are lists. + """ + self._partial_pats = {'start':{}, 'end':{}, 'short':{}} + for offset in range(self.mod_pos - 1): + self._partial_pats['start'][ + self.motif_len - offset - 1] = (self._parse_motif( + self.raw_motif[offset + 1:]), self.mod_pos - offset - 1) + for offset in range(self.motif_len - self.mod_pos): + self._partial_pats['end'][ + self.motif_len - offset - 1] = (self._parse_motif( + self.raw_motif[:-(offset + 1)]), self.mod_pos) + for short_len in range(1, self.motif_len): + self._partial_pats['short'][short_len] = [ + (self._parse_motif(self.raw_motif[offset:offset + short_len]), + self.mod_pos - offset) + for offset in range( + max(0, self.mod_pos - short_len), + min(self.motif_len - short_len + 1, self.mod_pos))] + return + def __init__(self, raw_motif, mod_pos=None): """Parse string motif Args: raw_motif (str): sequence motif. supports IUPAC single letter codes (use T for RNA). - mod_pos (int): 0-based position of modified base within the motif + mod_pos (int): 1-based position of modified base within the motif """ + # TODO convert mod_pos to 0-based coordinate + # (1-based is much more error prone) invalid_chars = re.findall( '[^' + ''.join(SINGLE_LETTER_CODE) + ']', raw_motif) if len(invalid_chars) > 0: @@ -515,8 +593,8 @@ def __init__(self, raw_motif, mod_pos=None): # basic motif parsing self.raw_motif = raw_motif self.motif_len = len(raw_motif) - self.motif_pat = self._parse_motif() - self.rev_comp_pat = self._parse_motif(True) + self.motif_pat = self._parse_motif(raw_motif) + self.rev_comp_pat = self._parse_motif(raw_motif, True) self.is_palindrome = self.motif_pat == self.rev_comp_pat @@ -532,6 +610,86 @@ def __init__(self, raw_motif, mod_pos=None): 'Provided modified position is not a single base, which ' + 'is likely an error. 
def find_mod_poss(self, seq):
    """Find all mod-base positions within the sequence.

    Reports full motif occurrences (including occurrences that overlap
    one another) as well as partial motif matches at the beginning and
    end of ``seq`` that include mod_pos.

    Args:
        seq (str): sequence to search

    Returns:
        Sorted list of modified base positions within ``seq``. Positions
        use the same offset convention as ``self.mod_pos`` (full matches
        are reported as match start plus ``self.mod_pos``).
    """
    # no positions in an empty sequence (the short pattern lookup below
    # has no zero-length entry and would raise KeyError)
    if not seq:
        return []

    seq_mod_poss = set()
    max_partial_len = min(len(seq) + 1, self.motif_len)

    # check matches to start of sequence
    for start_len in range(1, max_partial_len):
        try:
            start_pat, start_mod_pos = self._partial_pats[
                'start'][start_len]
        except KeyError:
            continue
        if start_pat.match(seq[:start_len]):
            seq_mod_poss.add(start_mod_pos)

    # check central sequence overlaps
    if len(seq) < self.motif_len:
        for short_pat, short_mod_pos in self._partial_pats[
                'short'][len(seq)]:
            if short_pat.match(seq):
                seq_mod_poss.add(short_mod_pos)
    else:
        # finditer only reports non-overlapping matches, which misses
        # sites for motifs that can overlap themselves (e.g. NGATC in
        # AGATCGATC), so restart the search one base after each match
        search_start = 0
        while True:
            motif_match = self.motif_pat.search(seq, search_start)
            if motif_match is None:
                break
            seq_mod_poss.add(motif_match.start() + self.mod_pos)
            search_start = motif_match.start() + 1

    # check end of seq matches
    for end_len in range(1, max_partial_len):
        try:
            end_pat, end_mod_pos = self._partial_pats['end'][end_len]
        except KeyError:
            continue
        if end_pat.match(seq[-end_len:]):
            seq_mod_poss.add(len(seq) - end_len + end_mod_pos)

    return sorted(seq_mod_poss)
def iter_cov_regs(
        self, cov_thresh, region_size=None, ctrl_reads_index=None):
    """Iterate over regions with coverage greater than or equal to cov_thresh.

    If region_size is provided, regions are rounded out to region_size
    windows and only the window start is yielded (i.e. chrm, strand,
    start), with consecutive duplicate windows yielded only once. If not
    provided (chrm, strand, start, end) is yielded.

    Args:
        cov_thresh (int): minimum coverage for a region to be reported
        region_size (int): window size for rounded output (optional)
        ctrl_reads_index: passed through to ``iter_coverage_regions``
    """
    def round_reg_start(x):
        return int(region_size * np.floor(x / float(region_size)))

    def round_reg_end(x):
        return int(region_size * np.ceil(x / float(region_size)))

    for chrm, strand, cov, starts in self.iter_coverage_regions(
            ctrl_reads_index):
        curr_reg_start = -1
        # indices where coverage crosses the threshold; padding with False
        # on both sides guarantees the change points alternate between
        # region starts (even entries) and region ends (odd entries)
        cov_changes = np.where(np.diff(np.concatenate([
            [False,], np.greater_equal(cov, cov_thresh), [False,]])))[0]
        # pair each start with its own end; pairing every consecutive
        # change point would also report the below-threshold gaps
        # NOTE(review): starts is indexed up to len(cov), so it presumably
        # holds interval boundaries (len(cov) + 1 entries) -- confirm
        # against iter_coverage_regions
        for cov_start_i, cov_end_i in zip(
                cov_changes[::2], cov_changes[1::2]):
            cov_start, cov_end = starts[cov_start_i], starts[cov_end_i]
            if region_size is None:
                yield chrm, strand, cov_start, cov_end
                continue
            for reg_start in range(round_reg_start(cov_start),
                                   round_reg_end(cov_end), region_size):
                # skip window already reported for the previous region
                if reg_start != curr_reg_start:
                    yield chrm, strand, reg_start
                    curr_reg_start = reg_start
""" @@ -2027,6 +2215,7 @@ def get_reads_events(cs_reads): # in RAM at one time read_means = get_single_slot_genome_centric(r_data, 'norm_mean') if read_means is None: continue + if read_means.shape[0] != r_data.end - r_data.start: continue assert read_means.shape[0] == r_data.end - r_data.start, ( 'Read found with mismatching mapping location and ' + 'signal information.') @@ -2084,14 +2273,16 @@ def try_close_prep_err(fast5_data, err_str): analyses_grp = fast5_data['/Analyses'] except: return try_close_prep_err( - fast5_data, 'Analysis group not found at root of FAST5') + fast5_data, + 'Base calls not found in FAST5 (see `tombo preprocess`)') try: # check that the requested basecalls group exsists if bc_grp is not None: analyses_grp[bc_grp] except: return try_close_prep_err( - fast5_data, 'Basecall group not found at [--basecall-group]') + fast5_data, + 'Base calls not found in FAST5 (see `tombo preprocess`)') try: corr_grp_ptr = analyses_grp[corr_grp] diff --git a/tombo/tombo_models/tombo.DNA.5mC.model b/tombo/tombo_models/tombo.DNA.5mC.model index 22afe63..238a986 100644 Binary files a/tombo/tombo_models/tombo.DNA.5mC.model and b/tombo/tombo_models/tombo.DNA.5mC.model differ diff --git a/tombo/tombo_models/tombo.DNA.6mA.model b/tombo/tombo_models/tombo.DNA.6mA.model index bb1b436..3c91632 100644 Binary files a/tombo/tombo_models/tombo.DNA.6mA.model and b/tombo/tombo_models/tombo.DNA.6mA.model differ diff --git a/tombo/tombo_models/tombo.DNA.CpG.model b/tombo/tombo_models/tombo.DNA.CpG.model new file mode 100755 index 0000000..cf51c1c Binary files /dev/null and b/tombo/tombo_models/tombo.DNA.CpG.model differ diff --git a/tombo/tombo_models/tombo.DNA.dam.model b/tombo/tombo_models/tombo.DNA.dam.model new file mode 100644 index 0000000..04073ff Binary files /dev/null and b/tombo/tombo_models/tombo.DNA.dam.model differ diff --git a/tombo/tombo_models/tombo.DNA.dcm.model b/tombo/tombo_models/tombo.DNA.dcm.model new file mode 100644 index 0000000..b5d4165 Binary files 
/dev/null and b/tombo/tombo_models/tombo.DNA.dcm.model differ diff --git a/tombo/tombo_models/tombo.RNA.5mC.model b/tombo/tombo_models/tombo.RNA.5mC.model index 3ba549b..93220d8 100644 Binary files a/tombo/tombo_models/tombo.RNA.5mC.model and b/tombo/tombo_models/tombo.RNA.5mC.model differ diff --git a/tombo/tombo_stats.py b/tombo/tombo_stats.py index 1749353..214aef8 100644 --- a/tombo/tombo_stats.py +++ b/tombo/tombo_stats.py @@ -57,18 +57,21 @@ # list of classes/functions to include in API __all__ = [ - 'TomboStats', 'PerReadStats', 'TomboModel', - 'normalize_raw_signal', 'compute_base_means', 'get_read_seg_score', - 'get_ref_from_seq', 'calc_kmer_fitted_shift_scale', + 'TomboStats', 'ModelStats', 'LevelStats', 'PerReadStats', + 'TomboModel', 'AltModel', 'normalize_raw_signal', 'compute_base_means', + 'get_read_seg_score', 'calc_kmer_fitted_shift_scale', 'load_resquiggle_parameters', 'compute_num_events'] VERBOSE = True _PROFILE_SIGNIF = False +_PROFILE_SIGNIF_STATS_OUT = False +_PROFILE_SIGNIF_PER_READ = False _PROFILE_EST_REF = False _PROFILE_CENTER_REF = False _PROFILE_ALT_EST = False +_PROFILE_MOTIF_ALT_EST = False _DEBUG_EST_STD = False _DEBUG_EST_BW = 0.05 @@ -79,12 +82,25 @@ DNA_BASES = ['A','C','G','T'] HALF_NORM_EXPECTED_VAL = stats.halfnorm.expect() +LOG10_E = np.log10(np.exp(1)) STANDARD_MODEL_NAME = 'standard' SAMP_COMP_TXT = 'sample_compare' DE_NOVO_TXT = 'de_novo' ALT_MODEL_TXT = 'model_compare' +PER_READ_STATS = (SAMP_COMP_TXT, DE_NOVO_TXT, ALT_MODEL_TXT) + +# TODO only ks-test currently implemented +KS_TEST_TXT = 'ks_test' +U_TEST_TXT = 'u_test' +T_TEST_TXT = 't_test' +KS_STAT_TEST_TXT = 'ks_stat_test' +U_STAT_TEST_TXT = 'u_stat_test' +T_STAT_TEST_TXT = 't_stat_test' +LEVEL_STATS_TXTS = ( + KS_TEST_TXT, U_TEST_TXT, T_TEST_TXT, + KS_STAT_TEST_TXT, U_STAT_TEST_TXT, T_STAT_TEST_TXT) ALT_MODEL_SEP_CHAR = '_' @@ -100,7 +116,8 @@ COV_DAMP_COUNTS_H5_NAME = 'Cov_Damp_Counts' COV_THRESH_H5_NAME = 'Cov_Threshold' -# turned off by default (and not 
accessible via command line so hardcoded for now) +# turned off by default (and not accessible via command line so +# hardcoded for now) DEFAULT_TRIM_RNA_PARAMS = th.trimRnaParams( moving_window_size=50, min_running_values=100, thresh_scale=0.7, max_raw_obs=40000) @@ -155,7 +172,7 @@ def get_pairwise_dists(reg_sig_diffs, index_q, dists_q, slide_span=None): """ if slide_span > 0: num_bases=reg_sig_diffs[0].shape[0] - (slide_span * 2) - while not index_q.empty(): + while True: try: index = index_q.get(block=False) except queue.Empty: @@ -183,11 +200,13 @@ def get_pairwise_dists(reg_sig_diffs, index_q, dists_q, slide_span=None): ################################## def compute_base_means(all_raw_signal, base_starts): - """Efficiently compute new base mean values from raw signal and base start positions + """Efficiently compute new base mean values from raw signal and base start + positions Args: all_raw_signal (`np.array`): raw nanopore signal obervation values - base_starts (`np.array::np.int32`): 0-based base start positions within raw signal + base_starts (`np.array::np.int32`): 0-based base start positions within + raw signal Returns: `np.array::np.float64` containing base mean levels @@ -350,14 +369,16 @@ def compute_running_mean_diffs(): def calc_kmer_fitted_shift_scale( prev_shift, prev_scale, r_event_means, r_model_means, r_model_inv_vars=None, method='theil_sen'): - """Use robust Theil-Sen estimator to compute fitted shift and scale parameters based on read sequence + """Use robust Theil-Sen estimator to compute fitted shift and scale + parameters based on read sequence Args: prev_shift (float): previous shift parameter prev_scale (float): previous scale parameter r_ref_means (`np.array::np.float64`): expected base signal levels r_ref_sds (`np.array::np.float64`): expected base signal level sds - r_model_inv_vars (`np.array::np.float64`): expected base signal level inverse variances for method of moments (`mom`) computation + r_model_inv_vars 
(`np.array::np.float64`): expected base signal level + inverse variances for method of moments (`mom`) computation method (str): one of `theil_sen`, `robust`, or `mom` Returns: @@ -466,15 +487,22 @@ def normalize_raw_signal( Args: all_raw_signal (`np.array`): raw nanopore signal obervation values - read_start_rel_to_raw (int): amount of signal to trim from beginning of the signal (default: 0) - read_obs_len (int): length of signal to process from `read_start_rel_to_raw` (default: full length) - norm_type (str): normalization type (`median` (default), `none`, `pA_raw`, `pA`, `median_const_scale`, `robust_median`; ignored is ``scale_values`` provided) + read_start_rel_to_raw (int): amount of signal to trim from beginning of + the signal (default: 0) + read_obs_len (int): length of signal to process from + `read_start_rel_to_raw` (default: full length) + norm_type (str): normalization type (`median` (default), `none`, + `pA_raw`, `pA`, `median_const_scale`, `robust_median`; ignored is + ``scale_values`` provided) outlier_thresh (float): windsorizing threshold (MAD units; default: None) - channel_info (:class:`tombo.tombo_helper.channelInfo`): channel information (optional; only for `pA` and `pA_raw`) - scale_values (:class:`tombo.tombo_helper.scaleValues`): scaling values (optional) + channel_info (:class:`tombo.tombo_helper.channelInfo`): channel + information (optional; only for `pA` and `pA_raw`) + scale_values (:class:`tombo.tombo_helper.scaleValues`): scaling values + (optional) event_means (`np.array`): for `pA` fitted scaling parameters (optional) model_means (`np.array`): for `pA` fitted scaling parameters (optional) - model_inv_vars (`np.array`): for `pA` fitted scaling parameters (optional) + model_inv_vars (`np.array`): for `pA` fitted scaling parameters + (optional) const_scale (float): global scale parameter (optional) Returns: @@ -549,7 +577,9 @@ def normalize_raw_signal( ############################# class TomboModel(object): - """Load, store and access 
Tombo model attributes and sequence-based expected mean and standard deviation levels (median normalization only). + """Load, store and access Tombo model attributes and sequence-based + expected mean and standard deviation levels (median normalization + only). .. automethod:: __init__ """ @@ -572,7 +602,6 @@ def write_model(self, ref_fn): """Write TomboModel to specified file Args: - ref_fn (str): filename to write TomboModel """ # Explicity use btype string names for py3 compatiblity as well as @@ -586,11 +615,7 @@ def write_model(self, ref_fn): with h5py.File(ref_fn, 'w') as ref_fp: ref_fp.create_dataset('model', data=ref_for_file, compression="gzip") ref_fp.attrs['central_pos'] = self.central_pos - if self.alt_base is None: - ref_fp.attrs['model_name'] = STANDARD_MODEL_NAME - else: - ref_fp.attrs['model_name'] = self.alt_name - ref_fp.attrs['alt_base'] = self.alt_base + ref_fp.attrs['model_name'] = STANDARD_MODEL_NAME return @@ -608,18 +633,12 @@ def _parse_tombo_model(self): except (AttributeError, TypeError): pass - try: - alt_base = ref_fp.attrs.get('alt_base') - except: - alt_base = None - try: - alt_base = alt_base.decode() - except (AttributeError, TypeError): - pass - except: - th.error_message_and_exit('Invalid tombo model file provided: ' - + unicode(self.ref_fn)) + th.error_message_and_exit( + 'Invalid Tombo model file provided: ' + unicode(self.ref_fn)) + + if model_name != STANDARD_MODEL_NAME: + th.error_message_and_exit('Non-standard Tombo model provided.') mean_ref = {} sd_ref = {} @@ -631,7 +650,6 @@ def _parse_tombo_model(self): self.means = mean_ref self.sds = sd_ref self.central_pos = central_pos - self.alt_base = alt_base self.name = model_name return @@ -660,12 +678,11 @@ def _parse_text_model(self): self.means = mean_ref self.sds = sd_ref self.central_pos = NANOPOLISH_CENTRAL_POS - self.alt_base = None self.name = STANDARD_MODEL_NAME return - def _load_std_model(self, kmer_ref, central_pos): + def _load_model(self, kmer_ref, central_pos): 
mean_ref = {} sd_ref = {} for kmer, kmer_mean, kmer_std in kmer_ref: @@ -680,7 +697,6 @@ def _load_std_model(self, kmer_ref, central_pos): self.means = mean_ref self.sds = sd_ref self.central_pos = central_pos - self.alt_base = None self.name = STANDARD_MODEL_NAME return @@ -726,10 +742,6 @@ def _get_default_standard_ref_from_files(self, fast5_fns): return - def _check_ref_fn_exists(self): - if not os.path.exists(self.ref_fn): - th.error_message_and_exit('Invalid tombo model file provided.') - def __init__( self, ref_fn=None, is_text_model=False, kmer_ref=None, central_pos=None, seq_samp_type=None, reads_index=None, @@ -738,24 +750,33 @@ def __init__( Args: ref_fn (str): tombo model filename - is_text_model (bool): `ref_fn` is text (e.g. https://github.com/nanoporetech/kmer_models/blob/master/r9.4_180mv_450bps_6mer/template_median68pA.model) - kmer_ref (list): containing 3-tuples 1) k-mer 2) expected level 3) level SD - central_pos (int): base within k-mer to assign signal (only applicable when `kmer_ref` is provided) - seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): sequencing sample type (default: None) - reads_index (:class:`tombo.tombo_helper.TomboReads`): For determining `seq_samp_type` - fast5_fns (list): fast5 read filenames from which to extract read metadata. For determining `seq_samp_type` - minimal_startup (bool): don't compute inverse variances (default True) + is_text_model (bool): `ref_fn` is text (e.g. 
+ https://github.com/nanoporetech/kmer_models/blob/master/r9.4_180mv_450bps_6mer/template_median68pA.model) + kmer_ref (list): containing 3-tuples 1) k-mer 2) expected level 3) + level SD + central_pos (int): base within k-mer to assign signal (only + applicable when `kmer_ref` is provided) + seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): + sequencing sample type (default: None) + reads_index (:class:`tombo.tombo_helper.TomboReads`): For + determining `seq_samp_type` + fast5_fns (list): fast5 read filenames from which to extract read + metadata. For determining `seq_samp_type` + minimal_startup (bool): don't compute inverse variances (default + True) Note: - Order of priority for initialization when multiple model specifications are provided: + Order of priority for initialization when multiple model + specifications are provided: 1) `ref_fn` 2) `kmer_ref` (requires `central_pos`) 3) `seq_samp_type` 4) `reads_index` 5) `fast5_fns` - Last 3 options load a default model file included with Tombo. Last 2 determine the sample type from read metadata. + Last 3 options load a default model file included with Tombo. Last + 2 determine the sample type from read metadata. 
""" if ref_fn is not None: self.ref_fn = th.resolve_path(ref_fn) @@ -768,7 +789,7 @@ def __init__( assert central_pos is not None, ( 'central_pos must be provided is TomboModel is loaded ' + 'with a kmer_ref') - self._load_std_model(kmer_ref, central_pos) + self._load_model(kmer_ref, central_pos) self.seq_samp_type = seq_samp_type else: if seq_samp_type is not None: @@ -786,16 +807,15 @@ def __init__( self._parse_tombo_model() self.kmer_width = len(next(k for k in self.means)) - self.is_std_model = (self.name == STANDARD_MODEL_NAME and - self.alt_base is None) - self.is_alt_model = not self.is_std_model self.inv_var = None if not minimal_startup: self._add_invvar() def reverse_sequence_copy(self): - """Return a copy of model for processing sequence/signal in reverse (default models are all saved in genome sequence forward (5p to 3p) direction) + """Return a copy of model for processing sequence/signal in reverse + (default models are all saved in genome sequence forward (5p to 3p) + direction) """ rev_model = deepcopy(self) rev_model.central_pos = self.kmer_width - self.central_pos - 1 @@ -810,6 +830,297 @@ def reverse_sequence_copy(self): return rev_model + def get_exp_levels_from_seq(self, seq, rev_strand=False): + """Compute expected signal levels for a sequence + + Args: + seq (str): genomic seqeunce to be converted to expected signal + levels + rev_strand (bool): provided sequence is from reverse strand (so + flip return order to genome forward direction) + + Note: + Returned expected signal levels will be trimmed compared to the + passed sequence based on the `std_ref.kmer_width` and + `std_ref.central_pos`. 
+ + Returns: + Expected signal level references + 1) ref_means (`np.array::np.float64`) expected signal levels + 2) ref_sds (`np.array::np.float64`) expected signal level sds + """ + seq_kmers = th.get_seq_kmers(seq, self.kmer_width, rev_strand) + + try: + ref_means = np.array([self.means[kmer] for kmer in seq_kmers]) + ref_sds = np.array([self.sds[kmer] for kmer in seq_kmers]) + except KeyError: + th.error_message_and_exit( + 'Invalid sequence encountered from genome sequence.') + + return ref_means, ref_sds + + def get_exp_levels_from_kmers(self, seq_kmers): + """Look up expected signal levels for a list of k-mers + + Args: + seq_kmers (list): list of k-mers (as returned from + :class:`tombo.tombo_helper.get_seq_kmers`) + + Returns: + Expected signal level references + + 1) ref_means (`np.array::np.float64`) expected signal levels + 2) ref_sds (`np.array::np.float64`) expected signal level sds + """ + try: + ref_means = np.array([self.means[kmer] for kmer in seq_kmers]) + ref_sds = np.array([self.sds[kmer] for kmer in seq_kmers]) + except KeyError: + th.error_message_and_exit( + 'Invalid sequence encountered from genome sequence.') + + return ref_means, ref_sds + + def get_exp_levels_from_seq_with_gaps(self, reg_seq, rev_strand): + # loop over regions without valid sequence (non-ACGT) + reg_ref_means, reg_ref_sds = ( + np.empty(len(reg_seq) - self.kmer_width + 1), + np.empty(len(reg_seq) - self.kmer_width + 1)) + reg_ref_means[:] = np.NAN + reg_ref_sds[:] = np.NAN + prev_ibr_end = 0 + for inv_base_run_m in th.INVALID_BASE_RUNS.finditer(reg_seq): + ibr_start, ibr_end = inv_base_run_m.start(), inv_base_run_m.end() + # if valid region is too short continue + if ibr_start - prev_ibr_end < self.kmer_width: + prev_ibr_end = ibr_end + continue + subreg_ref_means, subreg_ref_sds = self.get_exp_levels_from_seq( + reg_seq[prev_ibr_end:ibr_start]) + reg_ref_means[prev_ibr_end: + ibr_start - self.kmer_width + 1] = subreg_ref_means + reg_ref_sds[prev_ibr_end: + ibr_start - 
self.kmer_width + 1] = subreg_ref_sds + prev_ibr_end = ibr_end + + # if there is valid sequence at the end of a region include it here + if prev_ibr_end <= len(reg_seq) - self.kmer_width: + subreg_ref_means, subreg_ref_sds = self.get_exp_levels_from_seq( + reg_seq[prev_ibr_end:]) + reg_ref_means[prev_ibr_end:] = subreg_ref_means + reg_ref_sds[prev_ibr_end:] = subreg_ref_sds + + if rev_strand: + reg_ref_means = reg_ref_means[::-1] + reg_ref_sds = reg_ref_sds[::-1] + + return reg_ref_means, reg_ref_sds + + +class AltModel(object): + """Load, store and access alternate-base Tombo model attributes and + sequence-based expected mean and standard deviation levels (median + normalization only). + + .. automethod:: __init__ + """ + def write_model(self, ref_fn): + """Write AltModel to specified file + + Args: + ref_fn (str): filename to write AltModel + """ + # Explicity use btype string names for py3 compatiblity as well as + # pickle-ability of numpy arrays for consistency. See discussion here: + # https://github.com/numpy/numpy/issues/2407 + ref_for_file = np.array( + [(kmer, pos, self.means[(kmer, pos)], self.sds[(kmer, pos)]) + for kmer, pos in self.means], + dtype=[(str('kmer'), 'S' + unicode(self.kmer_width)), + (str('pos'), 'u4'), (str('mean'), 'f8'), (str('sd'), 'f8')]) + + with h5py.File(ref_fn, 'w') as ref_fp: + ref_fp.create_dataset( + 'model', data=ref_for_file, compression="gzip") + ref_fp.attrs['central_pos'] = self.central_pos + ref_fp.attrs['model_name'] = self.name + ref_fp.attrs['alt_base'] = self.alt_base + ref_fp.attrs['motif'] = self.motif.raw_motif + ref_fp.attrs['mod_pos'] = self.motif.mod_pos + + return + + def _make_constant_sd(self): + med_sd = np.median(list(self.sds.values())) + self.sds = dict((kmer, med_sd) for kmer in self.sds) + return + + def _parse_alt_model(self): + """Parse an alternate-base Tombo model file + """ + try: + with h5py.File(self.ref_fn, 'r') as ref_fp: + ref_raw = ref_fp['model'][:] + central_pos = 
ref_fp.attrs.get('central_pos') + model_name = ref_fp.attrs.get('model_name') + + try: + model_name = model_name.decode() + except (AttributeError, TypeError): + pass + + alt_base = ref_fp.attrs.get('alt_base') + try: + alt_base = alt_base.decode() + except (AttributeError, TypeError): + pass + + raw_motif = ref_fp.attrs.get('motif') + try: + raw_motif = raw_motif.decode() + except (AttributeError, TypeError): + pass + mod_pos = ref_fp.attrs.get('mod_pos') + + except: + th.error_message_and_exit( + 'Invalid alternate-base tombo model file provided: ' + + unicode(self.ref_fn)) + + mean_ref = {} + sd_ref = {} + for kmer, pos, kmer_mean, kmer_std in ref_raw: + kmer = kmer.decode() + mean_ref[(kmer, pos)] = kmer_mean + sd_ref[(kmer, pos)] = kmer_std + + self.means = mean_ref + self.sds = sd_ref + self.central_pos = central_pos + self.alt_base = alt_base + self.name = model_name + self.motif = th.TomboMotif(raw_motif, mod_pos) + + return + + def _load_alt_model(self, alt_ref, central_pos, alt_base, name, motif): + mean_ref = {} + sd_ref = {} + for kmer, pos, kmer_mean, kmer_std in alt_ref: + # reference may or may not be stored as a numpy array + try: + kmer = kmer.decode() + except AttributeError: + pass + mean_ref[(kmer, pos)] = kmer_mean + sd_ref[(kmer, pos)] = kmer_std + + self.means = mean_ref + self.sds = sd_ref + self.central_pos = central_pos + self.alt_base = alt_base + self.name = name + if motif is None: + self.motif = th.TomboMotif(self.alt_base, 1) + else: + assert (isinstance(motif, th.TomboMotif) and + motif.mod_pos is not None) + self.motif = motif + + return + + def _add_invvar(self): + self.inv_var = {} + for kmer_pos, stdev in self.sds.items(): + self.inv_var[kmer_pos] = 1 / (stdev * stdev) + + return + + def __init__( + self, ref_fn=None, kmer_ref=None, central_pos=None, alt_base=None, + name=None, motif=None, minimal_startup=True): + """Initialize a Tombo k-mer model object + + Args: + ref_fn (str): tombo model filename + kmer_ref (list): containing 
3-tuples 1) k-mer 2) expected level 3) + level SD + central_pos (int): base within k-mer to assign signal (only + applicable when `kmer_ref` is provided) + alt_base (int): "swap" base within k-mer (only applicable when + `kmer_ref` is provided) + name (str): model name (only applicable when `kmer_ref` is provided) + motif (:class:`tombo.tombo_helper.TomboMotif`): model motif + (defaults to alt_base motif; only applicable when `kmer_ref` is + provided) + minimal_startup (bool): don't compute inverse variances (default + True) + + Note: + + ``kmer_ref``, ``central_pos``, ``alt_base``, ``name`` and ``motif`` + are ignored if ``ref_fn`` is provided. + """ + if ref_fn is not None: + self.ref_fn = th.resolve_path(ref_fn) + self._parse_alt_model() + elif kmer_ref is not None: + assert central_pos is not None and alt_base is not None, ( + 'central_pos and alt_base must be provided if AltModel is ' + + 'loaded with a kmer_ref') + self._load_alt_model(kmer_ref, central_pos, alt_base, name, motif) + else: + th.error_message_and_exit( + 'Must provide initialization method for AltModel.') + + self.kmer_width = len(next(kmer for kmer, pos in self.means)) + + self.inv_var = None + if not minimal_startup: + self._add_invvar() + + def get_exp_level(self, kmer, pos): + try: + return self.means[(kmer, pos)] + except KeyError: + return np.NAN + + def get_exp_sd(self, kmer, pos): + try: + return self.sds[(kmer, pos)] + except KeyError: + return np.NAN + + def get_exp_levels_from_kmers(self, seq_kmers, rev_strand=False): + """Look up expected alternate-base signal levels across a central base. 
+ Alternative base to test should be last base in the first k-mer and + first base in the last k-mer + + Args: + seq_kmers (list): list of k-mers the same length as the k-mer + + Returns: + Expected signal level references + + 1) ref_means (`np.array::np.float64`) expected signal levels + 2) ref_sds (`np.array::np.float64`) expected signal level sds + """ + pos_range = (range(self.kmer_width) if rev_strand else + range(self.kmer_width - 1, -1, -1)) + try: + ref_means = np.array([ + self.get_exp_level(kmer, pos) for kmer, pos in zip( + seq_kmers, pos_range)]) + ref_sds = np.array([ + self.get_exp_sd(kmer, pos) for kmer, pos in zip( + seq_kmers, pos_range)]) + except KeyError: + th.error_message_and_exit( + 'Invalid sequence encountered from genome sequence.') + + return ref_means, ref_sds + ############################ ##### Model Estimation ##### @@ -818,14 +1129,14 @@ def reverse_sequence_copy(self): def check_valid_alt_models(alt_refs, std_ref): """Parse several alternative tombo model files """ - for alt_name, alt_ref in alt_refs.items(): + for alt_ref in alt_refs.values(): if (std_ref.central_pos != alt_ref.central_pos or std_ref.kmer_width != alt_ref.kmer_width): th.warning_message( 'Standard and ' + alt_ref.ref_fn + ' alternative base ' + 'models must be estimated using the same k-mer positions.') continue - if not alt_ref.is_alt_model: + if not isinstance(alt_ref, AltModel): th.warning_message( 'Alternative model ' + alt_ref.ref_fn + ' appears to be a ' + 'standard model and will not be processed.') @@ -843,7 +1154,8 @@ def _print_alt_models(): for alt_mod in alt_mods: has_mod = [alt_mod,] for seq_samp in alt_seq_samps[1:]: - has_mod.append(' X' if (seq_samp, alt_mod) in alt_model_types else '') + has_mod.append(' X' if (seq_samp, alt_mod) in alt_model_types + else '') sys.stderr.write(row_format.format(*has_mod)) return @@ -864,7 +1176,7 @@ def load_default_alt_ref(alt_name, seq_samp_type): seq_samp_type.name + ' does not exists.') return None - return 
TomboModel(ref_fn=alt_model_fn, seq_samp_type=seq_samp_type) + return AltModel(ref_fn=alt_model_fn) def load_alt_refs(alt_model_fns, alt_names, reads_index, std_ref, seq_samp_type=None): @@ -872,11 +1184,11 @@ def load_alt_refs(alt_model_fns, alt_names, reads_index, std_ref, if alt_model_fns is not None: # load alternative models from filenames for alt_model_fn in alt_model_fns: - alt_ref = TomboModel(alt_model_fn) + alt_ref = AltModel(alt_model_fn) if alt_ref.name in alt_refs: th.warning_message( - alt_ref.name + ' alternative model found in more than one ' + - 'model file. Ignoring: ' + alt_model_fn) + alt_ref.name + ' alternative model found in more than ' + + 'one model file. Ignoring: ' + alt_model_fn) continue alt_refs[alt_ref.name] = alt_ref else: @@ -903,7 +1215,7 @@ def load_valid_models( std_ref = TomboModel(ref_fn=tb_model_fn, reads_index=reads_index) if alt_model_fn is not None: - alt_ref = TomboModel(ref_fn=alt_model_fn) + alt_ref = AltModel(ref_fn=alt_model_fn) elif plot_default_alt is not None: seq_samp_type = std_ref.seq_samp_type if seq_samp_type is None: @@ -920,90 +1232,15 @@ def load_valid_models( return std_ref, alt_ref -def get_ref_from_seq(seq, std_ref, rev_strand=False, alt_ref=None): - """Compute expected signal levels for a sequence from a reference model - - Args: - - seq (str): genomic seqeunce to be converted to expected signal levels - std_ref (:class:`tombo.tombo_stats.TomboModel`): expected signal level model - rev_strand (bool): flip sequence (after extracting k-mers for expected level model lookup) - alt_ref (:class:`tombo.tombo_stats.TomboModel`): an alternative expected signal level model - - Note: - - Returned expected signal levels will be trimmed compared to the passed sequence based on the `std_ref.kmer_width` and `std_ref.central_pos`. 
- - Returns: - Expected signal level references - - 1) ref_means (`np.array::np.float64`) expected signal levels - 2) ref_sds (`np.array::np.float64`) expected signal level sds - 3) alt_means (`np.array::np.float64`) alternate expected signal levels - 4) alt_sds (`np.array::np.float64`) alternate expected signal level sds - """ - seq_kmers = [seq[i:i + std_ref.kmer_width] - for i in range(len(seq) - std_ref.kmer_width + 1)] - # get stat lookups from seq on native strand then flip if rev_strand - if rev_strand: - seq_kmers = seq_kmers[::-1] - - try: - ref_means = np.array([std_ref.means[kmer] for kmer in seq_kmers]) - ref_sds = np.array([std_ref.sds[kmer] for kmer in seq_kmers]) - except KeyError: - th.error_message_and_exit( - 'Invalid sequence encountered from genome sequence.') - if alt_ref is None: - alt_means, alt_sds = None, None - else: - alt_means = np.array([alt_ref.means[kmer] for kmer in seq_kmers]) - alt_sds = np.array([alt_ref.sds[kmer] for kmer in seq_kmers]) - - return ref_means, ref_sds, alt_means, alt_sds - -def get_ref_from_seq_with_gaps(reg_seq, std_ref, rev_strand): - # loop over regions without valid sequence (non-ACGT) - reg_ref_means, reg_ref_sds = ( - np.empty(len(reg_seq) - std_ref.kmer_width + 1), - np.empty(len(reg_seq) - std_ref.kmer_width + 1)) - reg_ref_means[:] = np.NAN - reg_ref_sds[:] = np.NAN - prev_ibr_end = 0 - for inv_base_run_m in th.INVALID_BASE_RUNS.finditer(reg_seq): - ibr_start, ibr_end = inv_base_run_m.start(), inv_base_run_m.end() - # if valid region is too short continue - if ibr_start - prev_ibr_end < std_ref.kmer_width: - prev_ibr_end = ibr_end - continue - subreg_ref_means, subreg_ref_sds, _, _ = get_ref_from_seq( - reg_seq[prev_ibr_end:ibr_start], std_ref) - reg_ref_means[prev_ibr_end: - ibr_start - std_ref.kmer_width + 1] = subreg_ref_means - reg_ref_sds[prev_ibr_end: - ibr_start - std_ref.kmer_width + 1] = subreg_ref_sds - prev_ibr_end = ibr_end - - # if there is valid sequence at the end of a region include it here 
- if prev_ibr_end <= len(reg_seq) - std_ref.kmer_width: - subreg_ref_means, subreg_ref_sds, _, _ = get_ref_from_seq( - reg_seq[prev_ibr_end:], std_ref) - reg_ref_means[prev_ibr_end:] = subreg_ref_means - reg_ref_sds[prev_ibr_end:] = subreg_ref_sds - - if rev_strand: - reg_ref_means = reg_ref_means[::-1] - reg_ref_sds = reg_ref_sds[::-1] - - return reg_ref_means, reg_ref_sds - def calc_med_sd(vals): - """Helper function to compute median and standard deviation from a numpy array + """Helper function to compute median and standard deviation from a + numpy array """ return np.median(vals), np.std(vals) -def get_region_kmer_levels(reg_data, cov_thresh, upstrm_bases, dnstrm_bases, - cs_cov_thresh, est_mean, region_size): +def get_region_kmer_levels( + reg_data, cov_thresh, upstrm_bases, dnstrm_bases, cs_cov_thresh, + est_mean, region_size, motif=None, valid_poss=None): """Compute mean or median and standard deviation for each k-mer """ if cs_cov_thresh is not None: @@ -1042,11 +1279,20 @@ def get_region_kmer_levels(reg_data, cov_thresh, upstrm_bases, dnstrm_bases, cov_intervals = cov_intervals.reshape(-1,2) kmer_width = upstrm_bases + dnstrm_bases + 1 - reg_kmer_levels = dict( - (''.join(kmer),[]) - for kmer in product(DNA_BASES, repeat=kmer_width)) + if motif is None: + reg_kmer_levels = dict( + (''.join(kmer), []) + for kmer in product(DNA_BASES, repeat=kmer_width)) + else: + reg_kmer_levels = dict( + ((''.join(kmer), i_offset - 1), []) + for kmer in product(DNA_BASES, repeat=kmer_width) + for i_offset in motif.find_mod_poss(''.join(kmer))) + # upstream and downstream changes the sequence selection # depending on the strand + # TODO this will miss motif hits at region boundaries when the motif is + # longer than either end of the k-me relative to the central position bb, ab = (upstrm_bases, dnstrm_bases) if reg_data.strand == '+' else \ (dnstrm_bases, upstrm_bases) for cov_start, cov_end in cov_intervals: @@ -1054,20 +1300,58 @@ def get_region_kmer_levels(reg_data, 
cov_thresh, upstrm_bases, dnstrm_bases, int_seq = reg_data.copy().update( start=reg_data.start + cov_start - bb, end=reg_data.start + cov_end + ab).add_seq().seq - if reg_data.strand == '-': - int_seq = th.comp_seq(int_seq) int_len = cov_end - cov_start - for pos in range(int_len): + + # get valid positions to include in extracted levels + # for standard extraction fill with None relative position + if valid_poss is None and motif is None: + int_poss = zip(range(int_len), repeat(None)) + # for motif/position extraction record relative modified position + else: + # get modified base position relative to coverage interval + if valid_poss is not None: + if (reg_data.chrm, reg_data.strand) not in valid_poss: continue + reg_mod_poss = (valid_poss[(reg_data.chrm, reg_data.strand)] - + reg_data.start - cov_start) + reg_mod_poss = reg_mod_poss[ + np.logical_and(np.greater_equal(reg_mod_poss, 0), + np.less(reg_mod_poss, int_len))] + else: + if reg_data.strand == '+': + reg_mod_poss = [ + m.start() + motif.mod_pos - 1 - bb + for m in motif.motif_pat.finditer(int_seq) + if 0 <= m.start() + motif.mod_pos - 1 - bb < int_len] + else: + reg_mod_poss = [ + m.start() + motif.motif_len - motif.mod_pos - bb + for m in motif.rev_comp_pat.finditer(int_seq) + if 0 <= m.start() + motif.motif_len - motif.mod_pos - bb + < int_len] + + # record relative mod positions within k-mers + int_poss = [ + (mod_pos - i_offset + bb, i_offset if reg_data.strand == '+' + else kmer_width - i_offset - 1) + for mod_pos in reg_mod_poss for i_offset in range(kmer_width) + if 0 <= mod_pos - i_offset + bb < int_len] + + for pos, offset in int_poss: pos_kmer = int_seq[pos:pos + kmer_width] if reg_data.strand == '-': - pos_kmer = pos_kmer[::-1] + pos_kmer = th.rev_comp(pos_kmer) + #print(pos_kmer, offset, + # pos + reg_data.start + cov_start, reg_data.strand, + # reg_data.start + cov_start - bb, + # reg_data.start + cov_end + ab) + kmer_key = pos_kmer if offset is None else (pos_kmer, offset) try: if est_mean: 
- reg_kmer_levels[pos_kmer].append(c_mean_std( - base_events[reg_data.start + pos + cov_start])) + reg_kmer_levels[kmer_key].append(c_mean_std( + base_events[pos + reg_data.start + cov_start])) else: - reg_kmer_levels[pos_kmer].append(calc_med_sd( - base_events[reg_data.start + pos + cov_start])) + reg_kmer_levels[kmer_key].append(calc_med_sd( + base_events[pos + reg_data.start + cov_start])) except KeyError: continue @@ -1075,14 +1359,15 @@ def get_region_kmer_levels(reg_data, cov_thresh, upstrm_bases, dnstrm_bases, def _est_kmer_model_worker( region_q, kmer_level_q, progress_q, reads_index, cov_thresh, - upstrm_bases, dnstrm_bases, cs_cov_thresh, est_mean, region_size): - while not region_q.empty(): + upstrm_bases, dnstrm_bases, cs_cov_thresh, est_mean, region_size, + motif, valid_poss): + while True: try: chrm, strand, reg_start = region_q.get(block=False) except queue.Empty: # sometimes throws false empty error with get(block=False) - if not region_q.empty(): - continue + sleep(0.1) + if not region_q.empty(): continue break reg_data = th.intervalData( @@ -1094,7 +1379,7 @@ def _est_kmer_model_worker( reg_kmer_levels = get_region_kmer_levels( reg_data, cov_thresh, upstrm_bases, dnstrm_bases, - cs_cov_thresh, est_mean, region_size) + cs_cov_thresh, est_mean, region_size, motif, valid_poss) if reg_kmer_levels is not None: kmer_level_q.put(reg_kmer_levels) progress_q.put(1) @@ -1111,27 +1396,21 @@ def _est_kmer_model_worker(*args): def extract_kmer_levels( reads_index, region_size, cov_thresh, upstrm_bases, dnstrm_bases, - cs_cov_thresh, est_mean=False, num_processes=1): - chrm_sizes = th.get_chrm_sizes(reads_index) - + cs_cov_thresh, est_mean=False, num_processes=1, + motif=None, valid_poss=None): region_q = Queue() kmer_level_q = Queue() progress_q = Queue() num_regions = 0 - for chrm, chrm_len in chrm_sizes.items(): - plus_covered = (chrm, '+') in reads_index - minus_covered = (chrm, '-') in reads_index - for reg_start in range(0, chrm_len, region_size): - if 
plus_covered: - region_q.put((chrm, '+', reg_start)) - num_regions +=1 - if minus_covered: - region_q.put((chrm, '-', reg_start)) - num_regions +=1 + for chrm, strand, reg_start in reads_index.iter_cov_regs( + cov_thresh, region_size): + region_q.put((chrm, strand, reg_start)) + num_regions += 1 est_args = ( region_q, kmer_level_q, progress_q, reads_index, cov_thresh, - upstrm_bases, dnstrm_bases, cs_cov_thresh, est_mean, region_size) + upstrm_bases, dnstrm_bases, cs_cov_thresh, est_mean, region_size, + motif, valid_poss) est_ps = [] for p_id in range(num_processes): p = Process(target=_est_kmer_model_worker, args=est_args) @@ -1153,6 +1432,8 @@ def extract_kmer_levels( except queue.Empty: sleep(1) continue + + # clear rest of queue while not kmer_level_q.empty(): reg_kmer_levels = kmer_level_q.get(block=False) all_reg_kmer_levels.append(reg_kmer_levels) @@ -1194,7 +1475,8 @@ def tabulate_kmer_levels(all_reg_kmer_levels, min_kmer_obs): min_obs = min( sum(len(reg_levs[''.join(kmer)]) for reg_levs in all_reg_kmer_levels) - for kmer in product(DNA_BASES, repeat=kmer_width)) + for kmer in product(DNA_BASES, repeat=kmer_width) + if motif is None or motif.matches_seq(''.join(kmer))) th.error_message_and_exit( 'K-mers represeneted in fewer observations than ' + 'requested in the provided reads. 
Consider a shorter ' + @@ -1225,9 +1507,12 @@ def load_resquiggle_parameters( """Load parameters for re-squiggle algorithm Args: - seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): sequencing sample type - sig_aln_params (tuple): signal alignment parameters (optional; default: load seq_samp_type defaults) - seg_params (tuple): segmentation parameters (optional; default: load seq_samp_type defaults) + seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): sequencing + sample type + sig_aln_params (tuple): signal alignment parameters (optional; default: + load seq_samp_type defaults) + seg_params (tuple): segmentation parameters (optional; default: load + seq_samp_type defaults) use_save_bandwidth (bool): load larger "save" bandwidth Returns: @@ -1252,19 +1537,20 @@ def load_resquiggle_parameters( bandwidth = save_bandwidth if seg_params is None: - (running_stat_width, min_obs_per_base, + (running_stat_width, min_obs_per_base, raw_min_obs_per_base, mean_obs_per_event) = SEG_PARAMS_TABLE[seq_samp_type.name] else: - (running_stat_width, min_obs_per_base, mean_obs_per_event) = seg_params + (running_stat_width, min_obs_per_base, raw_min_obs_per_base, + mean_obs_per_event) = seg_params z_shift, stay_pen = get_dynamic_prog_params(match_evalue) rsqgl_params = th.resquiggleParams( match_evalue, skip_pen, bandwidth, max_half_z_score, - running_stat_width, min_obs_per_base, mean_obs_per_event, - z_shift, stay_pen, seq_samp_type.name == RNA_SAMP_TYPE, - band_bound_thresh, start_bw, start_save_bw, - start_n_bases) + running_stat_width, min_obs_per_base, raw_min_obs_per_base, + mean_obs_per_event, z_shift, stay_pen, + seq_samp_type.name == RNA_SAMP_TYPE, band_bound_thresh, + start_bw, start_save_bw, start_n_bases) return rsqgl_params @@ -1277,7 +1563,8 @@ def compute_num_events( signal_len (int): length of raw signal seq_len (int): length of sequence mean_obs_per_base (int): mean raw observations per genome base - min_event_to_seq_ratio (float): minimum event to 
sequence ratio (optional) + min_event_to_seq_ratio (float): minimum event to sequence ratio + (optional) Returns: Number of events to find for this read @@ -1324,7 +1611,8 @@ def get_event_scale_values(all_raw_signal, r_len): rsqgl_params.running_stat_width, num_events) if COLLAPSE_RNA_STALLS: valid_cpts = remove_stall_cpts( - identify_stalls(all_raw_signal, DEFAULT_STALL_PARAMS), valid_cpts) + identify_stalls(all_raw_signal, DEFAULT_STALL_PARAMS), + valid_cpts) scale_values = get_scale_values_from_events( all_raw_signal, valid_cpts, OUTLIER_THRESH, num_events=RNA_SCALE_NUM_EVENTS, @@ -1367,7 +1655,7 @@ def get_read_corr_factors(r_data): norm_signal = norm_signal[rsrtr:rsrtr + event_starts[-1]] r_seq = b''.join(r_seq).decode() - r_ref_means = get_ref_from_seq(r_seq, init_ref)[0] + r_ref_means, _ = init_ref.get_exp_levels_from_seq(r_seq) (_, _, shift_corr_factor, scale_corr_factor) = calc_kmer_fitted_shift_scale( @@ -1426,7 +1714,7 @@ def center_model_to_median_norm(*args): def estimate_kmer_model( fast5s_dirs, corr_grp, bc_subgrps, - kmer_ref_fn, cov_thresh, upstrm_bases, dnstrm_bases, min_kmer_obs, + cov_thresh, upstrm_bases, dnstrm_bases, min_kmer_obs, kmer_specific_sd, cs_cov_thresh, est_mean, region_size, num_processes): """Estimate a standard tombo k-mer model """ @@ -1437,18 +1725,18 @@ def estimate_kmer_model( all_kmer_mean_sds = tabulate_kmer_levels(all_reg_kmer_levels, min_kmer_obs) - # adjust model to match median normalization best via Theil-Sen optimizer fit - # this will increase the accuracy of median normalized re-squiggle results - # and should reduce the need for (or number of) iterative re-squiggle runs + # adjust model to match median normalization best via Theil-Sen optimizer + # fit this will increase the accuracy of median normalized re-squiggle + # results and should reduce the need for (or number of) iterative + # re-squiggle runs init_ref = TomboModel(kmer_ref=all_kmer_mean_sds, central_pos=upstrm_bases) centered_ref = 
center_model_to_median_norm(reads_index, init_ref) if not kmer_specific_sd: centered_ref._make_constant_sd() - centered_ref.write_model(kmer_ref_fn) - return + return centered_ref ######################################## @@ -1461,13 +1749,13 @@ def _parse_base_levels_worker( proc_kmer_levels = dict( (''.join(kmer), []) for kmer in product(DNA_BASES, repeat=kmer_width)) - while not reads_q.empty(): + while True: try: r_fn, corr_slot = reads_q.get(block=False) except queue.Empty: - # sometimes throws false empty error with get(block=False) - if not reads_q.empty(): - continue + # sometimes throws false empty exception with get(block=False) + sleep(0.01) + if not reads_q.empty(): continue break with h5py.File(r_fn, 'r') as fast5_data: @@ -1762,7 +2050,6 @@ def get_peak_frac(kmer_std_dens, kmer_alt_dens): alt_ref = [] for kmer, std_level in std_ref.means.items(): if kmer.count(alt_base) == 0: - alt_ref.append((kmer, std_level, model_sd)) continue # assuming random incorporation the prortion of standard base # observations at this k-mer is the standard fraction raised @@ -1773,9 +2060,12 @@ def get_peak_frac(kmer_std_dens, kmer_alt_dens): std_dens[kmer] * kmer_std_frac) diff_dens[diff_dens < 0] = 0 alt_level = np.average(save_x, weights=diff_dens) - alt_ref.append((kmer, alt_level, model_sd)) + # add alt mean for each alt base position within the kmer + for m in re.finditer(alt_base, kmer): + alt_ref.append((kmer, m.start(), alt_level, model_sd)) - alt_ref = TomboModel(kmer_ref=alt_ref, central_pos=std_ref.central_pos) + alt_ref = AltModel( + kmer_ref=alt_ref, central_pos=std_ref.central_pos, alt_base=alt_base) return alt_ref @@ -1814,26 +2104,114 @@ def estimate_alt_model(*args): filename='est_alt_model.prof') return None +def tabulate_mod_kmer_levels(all_reg_kmer_levels, min_kmer_obs, motif): + if VERBOSE: th.status_message('Tabulating k-mer model statistics.') + all_kmer_mean_sds = [] + if _DEBUG_EST_STD: + kmer_dens = [] + save_x = 
np.linspace(KERNEL_DENSITY_RANGE[0], KERNEL_DENSITY_RANGE[1], + _DEBUG_EST_NUM_KMER_SAVE) + kmer_width = len(next(iter(all_reg_kmer_levels[0].keys()))[0]) + for kmer, offset in [(kmer, offset - 1) + for kmer in product(DNA_BASES, repeat=kmer_width) + for offset in motif.find_mod_poss(''.join(kmer))]: + kmer = ''.join(kmer) + try: + kmer_levels = np.concatenate([ + reg_kmer_levels[(kmer, offset)] + for reg_kmer_levels in all_reg_kmer_levels + if len(reg_kmer_levels[(kmer, offset)]) > 0]) + except ValueError: + th.error_message_and_exit( + 'At least one modified k-mer is not covered at any poitions ' + + 'by --minimum-test-reads.\n\t\tConsider fitting to a smaller ' + + 'k-mer via the --upstream-bases and --downstream-bases, ' + + 'or lowering --minimum-test-reads.\n\t\tNote that this may ' + + 'result in a lower quality model.') + if kmer_levels.shape[0] < min_kmer_obs: + min_obs = min( + sum(len(reg_levs[(''.join(kmer), i_offset - 1)]) + for reg_levs in all_reg_kmer_levels) + for kmer in product(DNA_BASES, repeat=kmer_width) + for i_offset in motif.find_mod_poss(''.join(kmer))) + th.error_message_and_exit( + 'K-mers represeneted in fewer observations than ' + + 'requested in the provided reads. 
Consider a shorter ' + + 'k-mer or providing more reads.\n\t' + unicode(min_obs) + + ' observations found in least common kmer.') + all_kmer_mean_sds.append((kmer, offset, np.median(kmer_levels[:,0]), + np.median(kmer_levels[:,1]))) + if _DEBUG_EST_STD: + kmer_kde = stats.gaussian_kde( + kmer_levels[:,0], + bw_method=_DEBUG_EST_BW / kmer_levels[:,0].std(ddof=1)) + with np.errstate(under='ignore'): + kmer_dens.append((kmer, offset, kmer_kde.evaluate(save_x))) + + if _DEBUG_EST_STD: + with io.open('debug_est_motif_alt_ref.density.txt', 'wt') as fp: + fp.write('Kmer\tOffset\tSignal\tDensity\n') + fp.write('\n'.join('\t'.join(map(str, (kmer, offset, x, y))) + for kmer, offset, dens_i in kmer_dens + for x, y in zip(save_x, dens_i)) + '\n') + return all_kmer_mean_sds + +def estimate_motif_alt_model( + fast5s_dirs, corr_grp, bc_subgrps, motif_desc, + upstrm_bases, dnstrm_bases, valid_locs_fn, min_kmer_obs, + cov_thresh, cs_cov_thresh, region_size, num_processes): + """Estimate a motif-centered alternate-base tombo k-mer model + """ + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + try: + raw_motif, mod_pos = motif_desc.split(":") + except: + th.error_message_and_exit('Invalid motif decription format.') + motif = th.TomboMotif(raw_motif, int(mod_pos)) + valid_poss = None if valid_locs_fn is None else th.parse_locs_file( + valid_locs_fn) + + all_reg_kmer_levels = extract_kmer_levels( + reads_index, region_size, cov_thresh, upstrm_bases, dnstrm_bases, + cs_cov_thresh, False, num_processes, motif, valid_poss) + + # process motif kmers with relative position stored + all_mod_kmer_mean_sds = tabulate_mod_kmer_levels( + all_reg_kmer_levels, min_kmer_obs, motif) + + alt_ref = AltModel( + kmer_ref=all_mod_kmer_mean_sds, central_pos=upstrm_bases, + alt_base=motif.mod_base, motif=motif) + + alt_ref._make_constant_sd() + + return alt_ref + +if _PROFILE_MOTIF_ALT_EST: + _est_motif_alt_wrapper = estimate_motif_alt_model + def estimate_motif_alt_model(*args): + import 
cProfile + cProfile.runctx('_est_motif_alt_wrapper(*args)', globals(), locals(), + filename='est_motif_alt_model.prof') + return None + #################################### ##### Core Statistical Testing ##### #################################### def p_value_to_z_score(pvalue): - """ - Helper function to convert p-value to z-score + """Helper function to convert p-value to z-score """ return -stats.norm.ppf(pvalue) def z_score_to_p_value(zscore): - """ - Helper function to convert z-score to p-value + """Helper function to convert z-score to p-value """ return stats.norm.cdf(zscore) def correct_multiple_testing(pvals): - """ - Use FDR Benjamini-Hochberg multiple testing correction + """Use FDR Benjamini-Hochberg multiple testing correction """ pvals = np.asarray(pvals) @@ -1854,8 +2232,7 @@ def correct_multiple_testing(pvals): return pvals_corrected[sortrevind] def calc_vectorized_fm_pvals(split_pvals, filter_nan=True): - """ - Compute Fisher's Method p-values in a vectorized fashion + """Compute Fisher's Method p-values in a vectorized fashion """ if filter_nan: chi_stats = [np.sum(np.log(base_pvals[~np.isnan(base_pvals)])) * -2 @@ -1872,8 +2249,7 @@ def calc_vectorized_fm_pvals(split_pvals, filter_nan=True): return f_pvals def calc_window_fishers_method(pvals, lag): - """ - Compute Fisher's Method over a moving window across a set of p-values + """Compute Fisher's Method over a moving window across a set of p-values """ assert lag > 0, 'Invalid p-value window provided.' width = (lag * 2) + 1 @@ -1893,9 +2269,24 @@ def calc_window_fishers_method(pvals, lag): return f_pvals -def calc_window_z_transform(r_means, ref_means, ref_sds, lag): +def calc_window_means(stats, lag): + """Compute mean over a moving window across a set of statistics """ - Compute Stouffer's Z-transformation across a read + assert lag > 0, 'Invalid window provided.' 
+ width = (lag * 2) + 1 + if stats.shape[-1] < width: + raise th.TomboError( + "Statistics vector too short for window mean compuation.") + m_stats = np.empty(stats.shape) + m_stats[:] = np.NAN + m_stats[...,lag:-lag] = np.mean(np.lib.stride_tricks.as_strided( + stats, shape=stats.shape[:-1] + (stats.shape[-1] - width + 1, width), + strides=stats.strides + (stats.strides[-1],)), -1) + + return m_stats + +def calc_window_z_transform(r_means, ref_means, ref_sds, lag): + """Compute Stouffer's Z-transformation across a read """ z_scores = np.abs(r_means - ref_means) / ref_sds width = (lag * 2) + 1 @@ -1910,8 +2301,7 @@ def calc_window_z_transform(r_means, ref_means, ref_sds, lag): return window_z_trans def calc_mann_whitney_z_score(samp1, samp2): - """ - Compute Mann-Whitney z-scores comparing two samples + """Compute Mann-Whitney z-scores comparing two samples """ s1_len = samp1.shape[0] s2_len = samp2.shape[0] @@ -1944,12 +2334,11 @@ def get_read_seg_score(r_means, r_ref_means, r_ref_sds): Returns: Mean half z-score for observed versus expected signal levels """ - return np.mean([ - np.abs((b_m - b_ref_m) / b_ref_s) - for b_m, b_ref_m, b_ref_s in zip(r_means, r_ref_means, r_ref_sds)]) + return np.mean(np.abs((r_means - r_ref_means) / r_ref_sds)) def score_valid_bases(read_tb, event_means, r_ref_means, r_ref_sds): - """Compute expected to observed signal matching score for bases not deleted in dynamic programming + """Compute expected to observed signal matching score for bases not deleted + in dynamic programming Args: read_tb (`np.array::np.int32`): event changepoints @@ -1958,7 +2347,8 @@ def score_valid_bases(read_tb, event_means, r_ref_means, r_ref_sds): r_ref_sds (`np.array::np.float64`): expected base signal level sds Returns: - Mean half z-score for observed versus expected signal levels (for valid bases) + Mean half z-score for observed versus expected signal levels (for valid + bases) """ valid_bases = np.where(np.diff(read_tb) != 0)[0] if 
valid_bases.shape[0] == 0: @@ -1971,8 +2361,7 @@ def score_valid_bases(read_tb, event_means, r_ref_means, r_ref_sds): return get_read_seg_score(base_means, valid_ref_means, valid_ref_sds) def get_dynamic_prog_params(match_evalue): - """ - Compute dynamic programming shift parameters from an expected match + """Compute dynamic programming shift parameters from an expected match expected value """ z_shift = HALF_NORM_EXPECTED_VAL + match_evalue @@ -1988,10 +2377,13 @@ def compute_auc(tp_rate, fp_rate): return np.sum(tp_rate[:-1] * (fp_rate[1:] - fp_rate[:-1])) def compute_mean_avg_precison(tp_rate, precision): - return np.mean(np.cumsum((tp_rate[1:] - tp_rate[:-1]) * precision[1:])) + return np.sum(np.diff(np.concatenate([[0,], tp_rate, [1,]]),) * + np.concatenate([[0,], precision, [1,]])[:-1]) def compute_accuracy_rates(stat_has_mod, num_plot_points=ROC_PLOT_POINTS): - """Given a list or numpy array of true/false values, function returns num_plot_point evenly spaced values along the true positive, false positive and precision arrays + """Given a list or numpy array of true/false values, function returns + num_plot_point evenly spaced values along the true positive, false positive + and precision arrays """ tp_cumsum = np.cumsum(stat_has_mod) tp_rate = tp_cumsum / tp_cumsum[-1] @@ -2011,7 +2403,7 @@ def compute_accuracy_rates(stat_has_mod, num_plot_points=ROC_PLOT_POINTS): return tp_rate, fp_rate, precision def _compute_motif_stats( - stats, motif_descs, genome_index, pos_stat_name='damp_frac', + stats, motif_descs, genome_index, stats_per_block=None, total_stats_limit=None): all_motif_stats = dict( (mod_name, []) for mod_name in list(zip(*motif_descs))[1]) @@ -2052,7 +2444,7 @@ def _compute_motif_stats( for motif, mod_name in motif_descs: if r_pos_seq[before_bases] != motif.mod_base: continue all_motif_stats[mod_name].append(( - r_pos_stat[pos_stat_name], + r_pos_stat[stats._stat_slot], bool(motif.motif_pat.match( r_pos_seq[before_bases - motif.mod_pos + 1:])))) @@ 
-2062,8 +2454,88 @@ def _compute_motif_stats( return all_motif_stats +def _compute_ground_truth_stats(stats, ground_truth_locs): + all_stats = [] + mod_locs, unmod_locs, mod_name = ground_truth_locs + for chrm, strand, start, end, block_stats in stats: + try: + cs_mod_locs = mod_locs[(chrm, strand)] + except KeyError: + cs_mod_locs = np.array([]) + try: + cs_unmod_locs = unmod_locs[(chrm, strand)] + except KeyError: + cs_unmod_locs = np.array([]) + block_mod_locs = cs_mod_locs[ + np.logical_and(np.greater_equal(cs_mod_locs, start), + np.less(cs_mod_locs, end))] + block_unmod_locs = cs_unmod_locs[ + np.logical_and(np.greater_equal(cs_unmod_locs, start), + np.less(cs_unmod_locs, end))] + block_valid_locs = block_stats[ + np.isin(block_stats['pos'], np.concatenate([ + block_mod_locs, block_unmod_locs]))] + all_stats.extend(zip( + block_valid_locs[stats._stat_slot], + np.isin(block_valid_locs['pos'], block_mod_locs))) + + return {mod_name:all_stats} + +def _compute_ctrl_motif_stats( + stats, ctrl_stats, motif_descs, genome_index, + stats_per_block=None, total_stats_limit=None): + all_motif_stats = dict( + (mod_name, []) for mod_name in list(zip(*motif_descs))[1]) + before_bases = max(( + motif.mod_pos for motif in list(zip(*motif_descs))[0])) - 1 + after_bases = max((motif.motif_len - motif.mod_pos + for motif in list(zip(*motif_descs))[0])) + total_num_stats = 0 + for chrm, strand, start, end, block_stats in stats: + if strand == '+': + seq_start = max(start - before_bases, 0) + seq_end = end + after_bases + else: + seq_start = max(start - after_bases, 0) + seq_end = end + before_bases + + reg_seq = genome_index.get_seq( + chrm, seq_start, seq_end, error_end=False) + + ctrl_block_stats = ctrl_stats.get_reg_stats(chrm, strand, start, end) + for motif, mod_name in motif_descs: + if strand == '+': + mod_poss = np.array([ + m.start() + motif.mod_pos - 1 + for m in motif.motif_pat.finditer(reg_seq)]) + seq_start + else: + mod_poss = np.array([ + m.start() + motif.motif_len 
- motif.mod_pos + for m in motif.rev_comp_pat.finditer(reg_seq)]) + seq_start + + # TODO possibly use stats_per_block here, but motif stats are + # less common, so probably not as useful here + for r_pos_stat in block_stats[ + np.isin(block_stats['pos'], mod_poss)]: + all_motif_stats[mod_name].append(( + r_pos_stat[stats._stat_slot], True)) + total_num_stats += 1 + if ctrl_block_stats is not None: + for r_pos_stat in ctrl_block_stats[ + np.isin(ctrl_block_stats['pos'], mod_poss)]: + all_motif_stats[mod_name].append(( + r_pos_stat[stats._stat_slot], False)) + total_num_stats += 1 + + if (total_stats_limit is not None and + total_num_stats >= total_stats_limit): + break + + return all_motif_stats + def calc_damp_fraction(cov_damp_counts, fracs, valid_cov): - """Compute dampened fraction of un-modified reads using provided modified and un-modified pseudo-counts from cov_damp_counts + """Compute dampened fraction of un-modified reads using provided modified + and un-modified pseudo-counts from cov_damp_counts See https://nanoporetech.github.io/tombo/text_output.html?highlight=dampened#text-output-browser-files for more details """ @@ -2078,10 +2550,9 @@ def calc_damp_fraction(cov_damp_counts, fracs, valid_cov): return damp_fracs -# TODO write BaseStats class since many operations are quite similar for -# TomboStats and PerReadStats -class TomboStats(object): - """Parse and retrieve relevant information from a standard (per-genomic base) Tombo statistics file. +class ModelStats(object): + """Parse and retrieve relevant information from a standard (per-genomic + base) Tombo statistics file. .. 
automethod:: __init__ """ @@ -2110,8 +2581,12 @@ def _parse_stats(self): self.most_signif_stats = most_signif_grp[MOST_SIGNIF_H5_NAME][:] self.most_signif_chrm_map = dict( (v,k) for k,v in most_signif_grp['chrm_ids'].attrs.items()) - self.cov_damp_counts = dict(self._fp[ - COV_DAMP_COUNTS_H5_NAME].attrs.items()) + # LevelStats doesn't have damp counts + try: + self.cov_damp_counts = dict(self._fp[ + COV_DAMP_COUNTS_H5_NAME].attrs.items()) + except: + self.cov_damp_counts = None return @@ -2131,7 +2606,8 @@ def _create_new_stats_file(self): # save coverage damp counts and threshold attributes self._fp.attrs[COV_THRESH_H5_NAME] = self.cov_thresh - self.cov_damp_counts_grp = self._fp.create_group(COV_DAMP_COUNTS_H5_NAME) + self.cov_damp_counts_grp = self._fp.create_group( + COV_DAMP_COUNTS_H5_NAME) self.cov_damp_counts_grp.attrs[ 'unmod'] = self.cov_damp_counts['unmod'] self.cov_damp_counts_grp.attrs[ @@ -2161,28 +2637,37 @@ def _create_new_stats_file(self): def __init__(self, stats_fn, stat_type=None, region_size=None, cov_damp_counts=None, cov_thresh=None, num_most_signif=None, most_signif_num_batches=MOST_SIGNIF_NUM_BATCHES_DEFAULT): - """Parse or open for writing a standard (per-genomic base) Tombo statistics file. + """Parse or open for writing a standard (per-genomic base) Tombo + statistics file. 
Example:: stats = tombo_stats.TomboStats('path/to/stats.file') - for chrm, strand, pos, frac, damp_frac, valid_cov in stats.iter_most_signif_sites(): + for chrm, strand, pos, frac, damp_frac, valid_cov in \ + stats.iter_most_signif_sites(): # do stuff Args: stats_fn (str): filename for previously saved tombo stats - stat_type (str): type of statistic (model_compare, de_novo, or sample_compare); only applicable for new file writing - region_size (int): size of chunked storage blocks; only applicable for new file writing - cov_damp_counts (tuple): pseudo-counts for modified and un-modified reads to compute ``damp_frac`` - cov_thresh (int): only sites with coverage greater than or equal to this value will be stored - num_most_signif (int): number of most significant sites to be stored for faster access - most_signif_num_batches (int): number of region batches to store before re-computing the most significant array (default: 10) + stat_type (str): type of statistic (model_compare, de_novo, or + sample_compare); only applicable for new file writing + region_size (int): size of chunked storage blocks; only applicable + for new file writing + cov_damp_counts (tuple): pseudo-counts for modified and un-modified + reads to compute ``damp_frac`` + cov_thresh (int): only sites with coverage greater than or equal to + this value will be stored + num_most_signif (int): number of most significant sites to be stored + for faster access + most_signif_num_batches (int): number of region batches to store + before re-computing the most significant array (default: 10) Warning: - If all arguments are provided the current file's contents will be deleted. + If all arguments are provided the current file's contents will be + deleted. - Intended to open a fresh ``TomboStats`` file for writing. + Intended to open a fresh ``ModelStats`` file for writing. 
""" self.stats_fn = stats_fn @@ -2210,12 +2695,25 @@ def __init__(self, stats_fn, stat_type=None, region_size=None, # open file for writing self._create_new_stats_file() + if self.stat_type not in PER_READ_STATS: + if self.stat_type in LEVEL_STATS_TXTS: + raise th.TomboError( + 'This appears to be a group-comparison stats file. Open ' + + 'with tombo_stats.LevelStats.') + raise th.TomboError( + 'This file is not a valid ModelStats file. `stat_type` ' + + 'listed as "' + self.stat_type + '".') + self.is_model_stats = True + self._stat_slot = str('damp_frac') + self._stat_text = 'Est. Frac. Alternate: {0:.2g}' + self._stat_transform = lambda pos_stat: 1 - pos_stat[self._stat_slot] + return def _update_most_signif(self): tmp_most_signif = np.concatenate( [self.running_most_signif_sites,] + self.queued_stat_batches) - tmp_most_signif.sort(kind='mergesort', order=str('damp_frac')) + tmp_most_signif.sort(kind='mergesort', order=self._stat_slot) self.running_most_signif_sites = tmp_most_signif[:self.num_most_signif] self.queued_stat_batches = [] return @@ -2278,10 +2776,10 @@ def _close_write(self): if len(self.queued_stat_batches) >= 1: self._update_most_signif() # trim the array if necessary - if np.isnan(self.running_most_signif_sites['damp_frac'][-1]): + if np.isnan(self.running_most_signif_sites[self._stat_slot][-1]): # not as many signif sites were stored as requested so trim array first_nan = np.where(np.isnan( - self.running_most_signif_sites['damp_frac']))[0][0] + self.running_most_signif_sites[self._stat_slot]))[0][0] self.running_most_signif_sites = self.running_most_signif_sites[ :first_nan,] # add dataset to file @@ -2296,7 +2794,8 @@ def _close_write(self): return def close(self): - """Close open HDF5 file and write most significant sites if open for writing + """Close open HDF5 file and write most significant sites if open for + writing """ if self.open_for_writing: self._close_write() @@ -2308,15 +2807,20 @@ def close(self): def _get_chrm_name(self, 
pos_stat): return self.most_signif_chrm_map[pos_stat['chrm']] - def iter_stat_seqs(self, genome_index, before_bases, after_bases, - include_pos=True): - """Iterate through most significant genomic sites returning the genomic sequence surrounding each position. + def iter_stat_seqs( + self, genome_index, before_bases, after_bases, include_pos=True): + """Iterate through most significant genomic sites returning the genomic + sequence surrounding each position. Args: - genome_index (:class:`tombo.tombo_helper.Fasta`): genome index object - before_bases (int): number of sequence bases before positions to include - after_bases (int): number of sequence bases after positions to include - include_pos (bool): yeild (pos_seq, chrm, strand, start, end) for each site (default: True) + genome_index (:class:`tombo.tombo_helper.Fasta`): genome index + object + before_bases (int): number of sequence bases before positions to + include + after_bases (int): number of sequence bases after positions to + include + include_pos (bool): yeild (pos_seq, chrm, strand, start, end) for + each site (default: True) """ for pos_stat in self.most_signif_stats: chrm, strand, pos = (self._get_chrm_name(pos_stat), @@ -2338,25 +2842,31 @@ def iter_stat_seqs(self, genome_index, before_bases, after_bases, return def iter_most_signif_sites(self): - """Iterate through statistics table yeilding (chrm, strand, pos, frac, damp_frac). + """Iterate through statistics table yeilding (chrm, strand, pos, stat). 
""" for pos_stat in self.most_signif_stats: yield ( self._get_chrm_name(pos_stat), pos_stat['strand'].decode(), - pos_stat['pos'], pos_stat['frac'], pos_stat['damp_frac'], - pos_stat['valid_cov']) + pos_stat['pos'], + self._stat_transform(pos_stat[self._stat_slot])) return - def get_most_signif_regions(self, num_bases, num_regions, unique_pos=True, - prepend_loc_to_text=False): - """Select regions centered on locations with the largest fraction of modified bases + def get_most_signif_regions( + self, num_bases, num_regions, unique_pos=True, + prepend_loc_to_text=False): + """Select regions centered on locations with the largest fraction of + modified bases Args: num_bases (int): number of bases to output num_regions (int): number of regions to output - unique_pos (bool): get only unique positions (optional; default True) intervals may overlap, but identified significant position is outside other intervals - prepend_loc_to_text (bool): pre-prend most significant location to the region text (can be off for interval near start/end of sequence records) + unique_pos (bool): get only unique positions (optional; default + True) intervals may overlap, but identified significant + position is outside other intervals + prepend_loc_to_text (bool): pre-prend most significant location to + the region text (can be off for interval near start/end of + sequence records) Returns: A list of :class:`tombo.tombo_helper.intervalData` objects @@ -2371,23 +2881,25 @@ def get_most_signif_regions(self, num_bases, num_regions, unique_pos=True, pos_stat['pos'] not in used_intervals[(chrm, strand)]): used_intervals[(chrm, strand)].update( range(int_start, int_start + num_bases)) - int_text = 'Est. Frac. 
Alternate: {0:.2g}'.format( - 1 - pos_stat[str('damp_frac')]) + int_text = self._stat_text.format( + self._stat_transform(pos_stat)) if prepend_loc_to_text: int_text = '{0}:{1:d}:{2}'.format( chrm, pos_stat['pos'] + 1, strand) + " " + int_text selected_regs.append(th.intervalData( chrm=chrm, start=int_start, end=int_start + num_bases, - strand=strand, reg_id='{:03d}'.format(i), reg_text=int_text)) + strand=strand, reg_id='{:03d}'.format(i), + reg_text=int_text)) if len(selected_regs) >= num_regions: break if len(selected_regs) == 0: th.error_message_and_exit( - 'No locations identified. Most likely an empty statistics file.') + 'No locations identified. Most likely an empty ' + + 'statistics file.') if len(selected_regs) < num_regions: th.warning_message( - 'Fewer unique significant locations more than [--num-bases]/2 ' + - 'apart were identified. Continuing with ' + + 'Fewer unique significant locations more than ' + + '[--num-bases]/2 apart were identified. Continuing with ' + str(len(selected_regs)) + ' unique locations. 
Must raise ' + '--num-most-significant-stored in order to see more most ' + 'significant stats.') @@ -2397,28 +2909,86 @@ def get_most_signif_regions(self, num_bases, num_regions, unique_pos=True, def compute_motif_stats( self, motif_descs, genome_index, stats_per_block=None, total_stats_limit=None): - """Compute lists of statistic values and whether this site represents a match to the provided motifs + """Compute lists of statistic values and whether this site represents a + match to the provided motifs Args: - motif_descs (list; see :class:`tombo.tombo_helper.parse_motif_descs`): containing tuples with :class:`tombo.tombo_helper.TomboMotif` and motif/modification names + motif_descs (list; see + :class:`tombo.tombo_helper.parse_motif_descs`): containing + tuples with :class:`tombo.tombo_helper.TomboMotif` and + motif/modification names genome_index (:class:`tombo.tombo_helper.Fasta`): genome index - stats_per_block (int): statistics to include in calculations per-block (`--multiprocess-region-size`) - total_stats_limit (int): maximum total statistics to include in computation (Default: include all stats) + stats_per_block (int): statistics to include in calculations + per-block (`--multiprocess-region-size`) + total_stats_limit (int): maximum total statistics to include in + computation (Default: include all stats) Returns: - Dictionary with (key) motif/modification name and (value) list of tuples containing statistic value and boolean motif match + Dictionary with (key) motif/modification name and (value) list of + tuples containing statistic value and boolean motif match """ return _compute_motif_stats( - self, motif_descs, genome_index, 'damp_frac', - stats_per_block=stats_per_block, total_stats_limit=total_stats_limit) + self, motif_descs, genome_index, stats_per_block=stats_per_block, + total_stats_limit=total_stats_limit) + + def compute_ground_truth_stats(self, ground_truth_locs): + """Compute lists of statistic values and ground truth modification + 
status (boolean) + + Args: + ground_truth_locs: list containing tuples of 1) modified locations + (from :class:`tombo.tombo_helper.parse_locs_file`), 2) + unmodified locations and mod names. + + Returns: + Dictionary with (key) motif/modification name and (value) list of + tuples containing statistic value and boolean motif match + """ + return _compute_ground_truth_stats(self, ground_truth_locs) + + def compute_ctrl_motif_stats( + self, ctrl_stats, motif_descs, genome_index, + stats_per_block=None, total_stats_limit=None): + """Compute lists of statistic values and whether this site represents a + match to the provided motifs + + Args: + ctrl_stats (:class:`tombo.tombo_stats.ModelStats`): control + statistics + motif_descs (list; see + :class:`tombo.tombo_helper.parse_motif_descs`): containing + tuples with :class:`tombo.tombo_helper.TomboMotif` and + motif/modification names + genome_index (:class:`tombo.tombo_helper.Fasta`): genome index + stats_per_block (int): statistics to include in calculations + per-block (`--multiprocess-region-size`) + total_stats_limit (int): maximum total statistics to include in + computation (Default: include all stats) + + Returns: + Dictionary with (key) motif/modification name and (value) list of + tuples containing statistic value and boolean native or control + sample stats + """ + return _compute_ctrl_motif_stats( + self, ctrl_stats, motif_descs, genome_index, + stats_per_block=stats_per_block, + total_stats_limit=total_stats_limit) def __iter__(self): - """Iterator over all statistics blocks, yeilding chrm, strand, start, end, block_stats + """Iterator over all statistics blocks, yeilding chrm, strand, start, + end, block_stats """ self.iter_all_cs = iter(sorted(self.blocks_index)) - self.iter_curr_cs = next(self.iter_all_cs) - self.iter_curr_cs_blocks = iter( - self.blocks_index[self.iter_curr_cs].items()) + try: + self.iter_curr_cs = next(self.iter_all_cs) + except StopIteration: + self.iter_curr_cs = None + 
self.iter_curr_cs_blocks = iter([]) + else: + self.iter_curr_cs_blocks = iter( + self.blocks_index[self.iter_curr_cs].items()) + return self def __next__(self): @@ -2439,15 +3009,14 @@ def __next__(self): # for python2 compatibility def next(self): - """Return next statistics block from file including (chrm, strand, block start, block end and statistics table ``numpy structured array``) + """Return next statistics block from file including (chrm, strand, + block start, block end and statistics table ``numpy structured array``) """ return self.__next__() - def get_pos_frac(self, chrm, strand, pos, missing_value=None): + def get_pos_stat(self, chrm, strand, pos, missing_value=None): """Extract statistic value from the requested genomic position. """ - # TODO: Add a get_reg_fracs and only get the reg values - # once. Just need to handle edge of batch cases try: pos_block_start = np.floor_divide( pos, self.region_size) * self.region_size @@ -2458,12 +3027,204 @@ def get_pos_frac(self, chrm, strand, pos, missing_value=None): block_data = self.stat_blocks[block_name]['block_stats'][:] pos_index = np.where(block_data['pos'] == pos)[0] if len(pos_index) != 1: raise KeyError - pos_frac = 1 - block_data['damp_frac'][pos_index[0]] + pos_stat = self._stat_transform(block_data[pos_index[0]]) except KeyError: - pos_frac = missing_value + pos_stat = missing_value + + return pos_stat + + def get_reg_stats(self, chrm, strand, start, end): + if (chrm, strand) not in self.blocks_index: return + reg_stats = [] + for reg_start, block_name in sorted( + self.blocks_index[(chrm, strand)].items()): + # if this is an overlapping block + if reg_start < end and reg_start + self.region_size > start: + # extract stats overlapping this region + reg_stats_block = self.stat_blocks[block_name]['block_stats'][:] + reg_stats.append(reg_stats_block[ + np.logical_and( + np.greater_equal(reg_stats_block['pos'], start), + np.less(reg_stats_block['pos'], end))]) + + if len(reg_stats) == 0: + return + elif 
len(reg_stats) == 1: + return reg_stats[0] + return np.vstack(reg_stats) + + +class LevelStats(ModelStats): + def _create_new_stats_file(self): + # try to remove file for overwriting old results + try: + os.remove(self.stats_fn) + except: + pass + # open file for writing + self._fp = h5py.File(self.stats_fn, 'w') + + # save attributes to file and open stats blocks group + self._fp.attrs['stat_type'] = self.stat_type + self._fp.attrs['block_size'] = self.region_size + self.stat_blocks = self._fp.create_group(STAT_BLOCKS_H5_NAME) + + # save coverage damp counts and threshold attributes + self._fp.attrs[COV_THRESH_H5_NAME] = self.cov_thresh + + # storage for most significant stats + self.most_signif_sites = self._fp.create_group(MOST_SIGNIF_H5_NAME) + self.running_most_signif_sites = np.empty( + shape=(self.num_most_signif,), + dtype=[(str('stat'), 'f8'), (str('pos'), 'u4'), + (str('cov'), 'u4'), (str('control_cov'), 'u4'), + (str('chrm'), 'u4'), (str('strand'), 'S1')]) + self.running_most_signif_sites[:] = np.NAN + # store a queue of completed stat batches to be concatenated and stored + # as a group to avoid too many array copy and sorting ops + self.queued_stat_batches = [] + # store chromosomes names in dict for storing most signif array + self.curr_chrm_id = 0 + self.chrm_names = {} + self.chrm_id_grp = self.most_signif_sites.create_group('chrm_ids') + + self.is_empty = True + + return + + def __init__(self, stats_fn, stat_type=None, region_size=None, + cov_thresh=None, num_most_signif=None, + most_signif_num_batches=MOST_SIGNIF_NUM_BATCHES_DEFAULT): + """Parse or open for writing a standard (per-genomic base) Tombo + statistics file. 
+ + Example:: + + stats = tombo_stats.TomboStats('path/to/stats.file') + for chrm, strand, pos, nl_pval in \ + stats.iter_most_signif_sites(): + # do stuff + + Args: + stats_fn (str): filename for previously saved tombo stats + stat_type (str): type of statistic (ks, ttest, utest); only + applicable for new file writing + region_size (int): size of chunked storage blocks; only applicable + for new file writing + cov_thresh (int): only sites with coverage greater than or equal to + this value will be stored + num_most_signif (int): number of most significant sites to be stored + for faster access + most_signif_num_batches (int): number of region batches to store + before re-computing the most significant array (default: 10) + + Warning: + + If all arguments are provided the current file's contents will be + deleted. + + Intended to open a fresh ``ModelStats`` file for writing. + """ + self.stats_fn = stats_fn + + if any(arg is None for arg in ( + stat_type, region_size, cov_thresh, num_most_signif)): + self.open_for_writing = False + # open file for reading + try: + self._parse_stats() + except: + raise th.TomboError( + 'Invalid statistics file provided. Try running ' + + 'tombo/scripts/convert_stats.py if this stats file ' + + 'was created before Tombo v1.3.1') + else: + self.open_for_writing = True + # set class attributes + self.stat_type = stat_type + self.region_size = region_size + self.curr_block_num = 0 + self.cov_thresh = cov_thresh + self.num_most_signif = num_most_signif + self.most_signif_num_batches = most_signif_num_batches + # open file for writing + self._create_new_stats_file() + + if self.stat_type not in LEVEL_STATS_TXTS: + if self.stat_type in PER_READ_STATS: + raise th.TomboError( + 'This appears to be a model-based comparison stats ' + + 'file. Open with tombo_stats.ModelStats.') + raise th.TomboError( + 'This file is not a valid LevelStats file. 
`stat_type` ' + + 'listed as "' + self.stat_type + '".') + + # set values to access statistics consistently + self.is_model_stats = False + self._stat_slot = str('stat') + if self.stat_type in (KS_TEST_TXT, U_TEST_TXT, T_TEST_TXT): + self._stat_text = '-log10(p-value): {0:.2g}' + self._stat_transform = lambda pos_stat: -np.log10( + pos_stat[self._stat_slot]) + elif self.stat_type == KS_STAT_TEST_TXT: + self._stat_text = 'D Statistic: {0:.2g}' + self._stat_transform = lambda pos_stat: ( + 1 - pos_stat[self._stat_slot]) + elif self.stat_type == U_STAT_TEST_TXT: + self._stat_text = 'Common Language Marginal Effect: {0:.2g}' + self._stat_transform = lambda pos_stat: -pos_stat[self._stat_slot] + elif self.stat_type == T_STAT_TEST_TXT: + self._stat_text = "Cohen's D: {0:.2g}" + self._stat_transform = lambda pos_stat: -pos_stat[self._stat_slot] + else: + raise th.TomboError('Unknown statistic type.') + + return + + def _write_stat_block(self, grp_stats): + """Write region group statistics block to file. + """ + try: + block_data = self.stat_blocks.create_group( + 'Block_' + unicode(self.curr_block_num)) + self.curr_block_num += 1 + except: + th.warning_message('Statistics file not opened for writing.') + return + + block_data.attrs['chrm'] = grp_stats.chrm + block_data.attrs['strand'] = grp_stats.strand + block_data.attrs['start'] = grp_stats.start + + grp_stats_arr = np.array( + [pos_stats for pos_stats in zip( + grp_stats.reg_stats, grp_stats.reg_poss, + grp_stats.reg_cov, grp_stats.ctrl_cov) + if not np.isnan(pos_stats[0])], + dtype=[(str('stat'), 'f8'), (str('pos'), 'u4'), + (str('cov'), 'u4'), (str('control_cov'), 'u4')]) + block_data.create_dataset( + 'block_stats', data=grp_stats_arr, compression="gzip") + + self._add_to_most_signif( + grp_stats_arr, grp_stats.chrm, grp_stats.strand) + + self.is_empty = False - return pos_frac + return + +def TomboStats(stat_fn): + """Load per-reference location Tombo statistics. 
Safe method to load statistics for read-only purposes. + + Returns: + Either :class:`tombo.tombo_stats.ModelStats` or :class:`tombo.tombo_stats.LevelStats` depending on the type of Tombo statistics file provided. + """ + try: + stats = ModelStats(stat_fn) + except th.TomboError: + stats = LevelStats(stat_fn) + return stats class PerReadStats(object): """Store and accses per-read modified base testing statistics @@ -2509,7 +3270,8 @@ def __init__(self, per_read_stats_fn, stat_type=None, region_size=None): Examples:: - per_read_stats = tombo_stats.PerReadStats('path/to/sample.tombo.per_read_stats') + per_read_stats = tombo_stats.PerReadStats(\ + 'path/to/sample.tombo.per_read_stats') int_data = tombo_helper.intervalData( chrm='chr20', start=10000, end=10100, strand='+') reg_per_read_stats = per_read_stats.get_region_per_read_stats( @@ -2517,13 +3279,17 @@ def __init__(self, per_read_stats_fn, stat_type=None, region_size=None): Args: - per_read_stats_fn (str): filename containing (or to write) per-read Tombo statistics - stat_type (str): type of statistic (model_compare, de_novo, or sample_compare); only applicable for new file writing - region_size (int): size of chunked storage blocks; only applicable for new file writing + per_read_stats_fn (str): filename containing (or to write) per-read + Tombo statistics + stat_type (str): type of statistic (model_compare, de_novo, or + sample_compare); only applicable for new file writing + region_size (int): size of chunked storage blocks; only applicable + for new file writing Warning: - If ``stat_type`` and ``region_size`` are provided the current file's contents will be deleted. + If ``stat_type`` and ``region_size`` are provided the current + file's contents will be deleted. Intended to open a fresh ``PerReadStats`` file for writing. 
""" @@ -2534,7 +3300,8 @@ def __init__(self, per_read_stats_fn, stat_type=None, region_size=None): self._parse_per_read_stats() except: th.error_message_and_exit( - 'Non-existent or invalid per-read statistics file provided.') + 'Non-existent or invalid per-read statistics file ' + + 'provided.') else: # set class attributes self.stat_type = stat_type @@ -2544,6 +3311,11 @@ def __init__(self, per_read_stats_fn, stat_type=None, region_size=None): self.are_pvals = self.stat_type != ALT_MODEL_TXT + self._stat_slot = str('stat') + self._stat_text = '-log10(p-value): {0:.2g}' + self._stat_transform = lambda pos_stat: -np.log10( + pos_stat[self._stat_slot]) + return def _write_per_read_block( @@ -2566,9 +3338,14 @@ def _write_per_read_block( 'block_stats', data=per_read_block, compression="gzip") # add lookup dict for read_id slot stored in table to save space and # avoid memory leak due to vlen slots in h5py datasets - read_id_grp = block_data.create_group('read_ids') - for read_id, read_id_val in read_id_lookup.items(): - read_id_grp.attrs[read_id] = read_id_val + dt = h5py.special_dtype(vlen=str) + read_ids = np.array(list(read_id_lookup.keys()), dtype=dt) + read_ids_ds = block_data.create_dataset( + 'read_ids', read_ids.shape, dtype=dt, compression="gzip") + read_ids_ds[...] 
= read_ids + block_data.create_dataset( + 'read_id_vals', data=np.array(list(read_id_lookup.values())), + compression="gzip") self._fp.flush() @@ -2579,11 +3356,14 @@ def get_region_per_read_stats(self, interval_data, num_reads=None): Args: - interval_data (:class:`tombo.tombo_helper.intervalData`): genomic interval - num_reads (int): randomly select this many reads (default: inlcude all reads) + interval_data (:class:`tombo.tombo_helper.intervalData`): + genomic interval + num_reads (int): randomly select this many reads (default: inlcude + all reads) Returns: - `np.array` structured array containing ``pos``, ``stat`` and ``read_id`` for per-read stats over requested interval + `np.array` structured array containing ``pos``, ``stat`` and + ``read_id`` for per-read stats over requested interval """ try: cs_blocks = self.blocks_index[( @@ -2598,11 +3378,19 @@ def get_region_per_read_stats(self, interval_data, num_reads=None): # extract stats from FAST5 block_stats = self.per_read_blocks[block_name]['block_stats'][:] reg_poss = block_stats['pos'] - reg_read_stats = block_stats['stat'] + reg_read_stats = block_stats[self._stat_slot] # extract and convert read_ids back into strings - block_read_id_lookup = dict([ - (read_id_val, read_id) for read_id, read_id_val in - self.per_read_blocks[block_name]['read_ids'].attrs.items()]) + if 'read_id_vals' in self.per_read_blocks[block_name]: + block_read_id_lookup = dict([ + (read_id_val, read_id) for read_id, read_id_val in + zip(self.per_read_blocks[block_name]['read_ids'].value, + self.per_read_blocks[block_name]['read_id_vals'].value)]) + else: + # read_ids previously stored (inefficiently) as attributes + # so parse read_ids attributes for backwards compatibility + block_read_id_lookup = dict([ + (read_id_val, read_id) for read_id, read_id_val in + self.per_read_blocks[block_name]['read_ids'].attrs.items()]) reg_read_ids = [ block_read_id_lookup[r_id] for r_id in block_stats['read_id']] int_block_stats.append(np.array( @@ 
-2634,20 +3422,71 @@ def get_region_per_read_stats(self, interval_data, num_reads=None): def compute_motif_stats( self, motif_descs, genome_index, stats_per_block=None, total_stats_limit=None): - """Compute lists of statistic values and whether this site represents a match to the provided motifs + """Compute lists of statistic values and whether this site represents a + match to the provided motifs Args: - motif_descs (list; see :class:`tombo.tombo_helper.parse_motif_descs`): containing tuples with :class:`tombo.tombo_helper.TomboMotif` and motif/modification names + motif_descs (list; see + :class:`tombo.tombo_helper.parse_motif_descs`): containing + tuples with :class:`tombo.tombo_helper.TomboMotif` and + motif/modification names genome_index (:class:`tombo.tombo_helper.Fasta`): genome index - stats_per_block (int): statistics to include in calculations per-block (`--multiprocess-region-size`) - total_stats_limit (int): maximum total statistics to include in computation (Default: include all stats) + stats_per_block (int): statistics to include in calculations + per-block (`--multiprocess-region-size`) + total_stats_limit (int): maximum total statistics to include in + computation (Default: include all stats) Returns: - Dictionary with (key) motif/modification name and (value) list of tuples containing statistic value and boolean motif match + Dictionary with (key) motif/modification name and (value) list of + tuples containing statistic value and boolean motif match """ return _compute_motif_stats( - self, motif_descs, genome_index, 'stat', - stats_per_block=stats_per_block, total_stats_limit=total_stats_limit) + self, motif_descs, genome_index, stats_per_block=stats_per_block, + total_stats_limit=total_stats_limit) + + def compute_ground_truth_stats(self, ground_truth_locs): + """Compute lists of statistic values and ground truth modification + status (boolean) + + Args: + ground_truth_locs: list containing tuples of 1) modified locations + (from 
:class:`tombo.tombo_helper.parse_locs_file`), 2) + unmodified locations and mod names. + + Returns: + Dictionary with (key) motif/modification name and (value) list of + tuples containing statistic value and boolean motif match + """ + return _compute_ground_truth_stats(self, ground_truth_locs) + + def compute_ctrl_motif_stats( + self, pr_ctrl_stats, motif_descs, genome_index, + stats_per_block=None, total_stats_limit=None): + """Compute lists of statistic values and whether this site represents a + match to the provided motifs + + Args: + pr_ctrl_stats (:class:`tombo.tombo_stats.PerReadStats`): control + per-read statistics + motif_descs (list; see + :class:`tombo.tombo_helper.parse_motif_descs`): containing + tuples with :class:`tombo.tombo_helper.TomboMotif` and + motif/modification names + genome_index (:class:`tombo.tombo_helper.Fasta`): genome index + stats_per_block (int): statistics to include in calculations + per-block (`--multiprocess-region-size`) + total_stats_limit (int): maximum total statistics to include in + computation (Default: include all stats) + + Returns: + Dictionary with (key) motif/modification name and (value) list of + tuples containing statistic value and boolean native or control + sample stats + """ + return _compute_ctrl_motif_stats( + self, pr_ctrl_stats, motif_descs, genome_index, + stats_per_block=stats_per_block, + total_stats_limit=total_stats_limit) def __iter__(self): """ @@ -2678,7 +3517,9 @@ def __next__(self): # for python2 compatibility def next(self): - """Return next per-read statistics block from file including (chrm, strand, block start, block end and per-read statistics table ``numpy structured array``) + """Return next per-read statistics block from file including (chrm, + strand, block start, block end and per-read statistics table + ``numpy structured array``) """ return self.__next__() @@ -2688,6 +3529,27 @@ def close(self): self._fp.close() return + def get_reg_stats(self, chrm, strand, start, end): + if (chrm, 
strand) not in self.blocks_index: return + reg_stats = [] + for reg_start, block_name in sorted( + self.blocks_index[(chrm, strand)].items()): + # if this is an overlapping block + if reg_start < end and reg_start + self.region_size > start: + # extract stats overlapping this region + reg_stats_block = self.per_read_blocks[ + block_name]['block_stats'][:] + reg_stats.append(reg_stats_block[ + np.logical_and( + np.greater_equal(reg_stats_block['pos'], start), + np.less(reg_stats_block['pos'], end))]) + + if len(reg_stats) == 0: + return + elif len(reg_stats) == 1: + return reg_stats[0] + return np.vstack(reg_stats) + ################################ ##### Base-by-base Testing ##### @@ -2707,8 +3569,8 @@ def compute_posterior_samp_dists( if ctrl_reg_data.strand == '-': reg_seq = th.rev_comp(reg_seq) - reg_ref_means, reg_ref_sds = get_ref_from_seq_with_gaps( - reg_seq, std_ref, ctrl_reg_data.strand == '-') + reg_ref_means, reg_ref_sds = std_ref.get_exp_levels_from_seq_with_gaps( + reg_seq, ctrl_reg_data.strand == '-') # compute vectorized weighted means for new mean and sd estimates post_ref_means = (( @@ -2749,55 +3611,65 @@ def compute_mean_diff_factor(event_means, ref_mean): return post_ref_means, post_ref_sds def get_reads_ref( - ctrl_reg_data, min_test_reads, fm_offset, std_ref=None, + reg_data, min_test_reads, fm_offset, std_ref=None, prior_weights=None, est_mean=False): """Get mean and standard deviation of levels from a sample across the genome """ + central_func = np.mean if est_mean else np.median + reg_size = reg_data.end - reg_data.start + (fm_offset * 2) + level_means, level_sds = np.empty(reg_size), np.empty(reg_size) + level_means[:] = np.NAN + level_sds[:] = np.NAN + # expand region to include fm_offset - ctrl_base_events = ctrl_reg_data.copy().update( - start=ctrl_reg_data.start - fm_offset, - end=ctrl_reg_data.end + fm_offset).get_base_levels() - # means over all nan values raises warnings so suppress those here - with warnings.catch_warnings(): - 
warnings.simplefilter("ignore", category=RuntimeWarning) - if est_mean: - ctrl_means = np.apply_along_axis(np.nanmean, 1, ctrl_base_events) - else: - ctrl_means = np.apply_along_axis(np.nanmedian, 1, ctrl_base_events) - ctrl_sds = np.apply_along_axis( - lambda x: max(np.nanstd(x), MIN_POSITION_SD), 1, - ctrl_base_events) - ctrl_cov = np.apply_along_axis( - lambda x: sum(~np.isnan(x)), 1, ctrl_base_events) - # set means and sds with cov below min_test_reads to NAN - ctrl_means[ctrl_cov < min_test_reads] = np.NAN - ctrl_sds[ctrl_cov < min_test_reads] = np.NAN + bases_levels = reg_data.copy().update( + start=reg_data.start - fm_offset, + end=reg_data.end + fm_offset).get_base_levels() + valid_indices = np.logical_not(np.isnan(bases_levels)) + cov = valid_indices.sum(axis=1) + cov_regs = np.where(np.diff(np.concatenate([ + [False,], np.greater_equal(cov, min_test_reads), [False,]])))[0] + if len(cov_regs) == 0: + return level_means, level_sds, {} + + for cov_start, cov_end in zip(cov_regs[:-1:2], cov_regs[1::2]): + level_means[cov_start:cov_end] = np.array([ + central_func(b_levels[valid_indices[cov_start + i]]) + for i, b_levels in enumerate(bases_levels[cov_start:cov_end])]) + level_sds[cov_start:cov_end] = np.array([ + np.std(b_levels[valid_indices[cov_start + i]]) + for i, b_levels in enumerate(bases_levels[cov_start:cov_end])]) if std_ref is not None: if prior_weights is None: prior_weights = (MEAN_PRIOR_CONST, SD_PRIOR_CONST) - ctrl_means, ctrl_sds = compute_posterior_samp_dists( - ctrl_means, ctrl_sds, ctrl_cov, ctrl_reg_data, std_ref, + level_means, level_sds = compute_posterior_samp_dists( + level_means, level_sds, cov, reg_data, std_ref, prior_weights, min_test_reads, fm_offset) - # convert coverate to a dict for later lookup - ctrl_cov = dict(zip(range(ctrl_reg_data.start - fm_offset, - ctrl_reg_data.end + fm_offset), ctrl_cov)) + # convert coverage to a dict for later lookup + cov = dict(zip(range(reg_data.start - fm_offset, + reg_data.end + fm_offset), 
cov)) - return ctrl_means, ctrl_sds, ctrl_cov + return level_means, level_sds, cov def compute_sample_compare_read_stats( r_data, ctrl_means, ctrl_sds, fm_offset=FM_OFFSET_DEFAULT, reg_data=None): - """Compute signficance statistics using comparison of two sequenceing samples method for a single read within a specified genomic region. + """Compute signficance statistics using comparison of two sequenceing + samples method for a single read within a specified genomic region. Args: r_data (:class:`tombo.tombo_helper.readData`): read data - ctrl_means (`np.array::np.float64`): mean level values from control set of reads - ctrl_sds (`np.array::np.float64`): level SD values from control set of reads - fm_offset (int): Fisher's Method offset for computing locally combined p-values (optional; default: 1) - reg_data (:class:`tombo.tombo_helper.intervalData`): region to test (default: test whole read) + ctrl_means (`np.array::np.float64`): mean level values from control set + of reads + ctrl_sds (`np.array::np.float64`): level SD values from control set of + reads + fm_offset (int): Fisher's Method offset for computing locally combined + p-values (optional; default: 1) + reg_data (:class:`tombo.tombo_helper.intervalData`): region to test + (default: test whole read) Returns: Read testing results, positions tested and the read_id @@ -2843,10 +3715,10 @@ def comp_clip_and_flip(): def get_read_comp_z_score(r_means, read_start, read_end): r_z_scores = np.abs( - r_means - ctrl_means[read_start-reg_start+fm_offset: - read_end-reg_start+fm_offset]) / ctrl_sds[ - read_start-reg_start+fm_offset: - read_end-reg_start+fm_offset] + r_means - ctrl_means[read_start - reg_start + fm_offset: + read_end - reg_start + fm_offset]) / ctrl_sds[ + read_start - reg_start + fm_offset: + read_end - reg_start + fm_offset] return r_z_scores @@ -2875,21 +3747,22 @@ def get_pvals(r_z_scores): r_poss += read_start - return r_pvals, r_poss, read_id + return {SAMP_COMP_TXT:r_pvals}, 
{SAMP_COMP_TXT:r_poss}, read_id def compute_de_novo_read_stats( - r_data, std_ref, fm_offset=FM_OFFSET_DEFAULT, reg_data=None, - gnm_begin_lag=None, gnm_end_lag=None): - """Compute signficance statistics using de novo comparison to a canonical model method for a single read within a specified genomic region. + r_data, std_ref, fm_offset=FM_OFFSET_DEFAULT, reg_data=None): + """Compute signficance statistics using de novo comparison to a canonical + model method for a single read within a specified genomic region. Args: r_data (:class:`tombo.tombo_helper.readData`): read data - std_ref (:class:`tombo.tombo_stats.TomboModel`): canonical expected signal level model - fm_offset (int): Fisher's Method offset for computing locally combined p-values (optional; default: 1) - reg_data (:class:`tombo.tombo_helper.intervalData`): region to test (default: test whole read) - gnm_begin_lag (int): upstream genomic overhang required for k-mer lookup (optional; default compute from read strand and `std_ref`) - gnm_end_lag (int): downstream genomic overhang required for k-mer lookup (optional; default compute from read strand and `std_ref`) + std_ref (:class:`tombo.tombo_stats.TomboModel`): canonical expected + signal level model + fm_offset (int): Fisher's Method offset for computing locally combined + p-values (optional; default: 1) + reg_data (:class:`tombo.tombo_helper.intervalData`): region to test + (default: test whole read) Returns: Read testing results, positions tested and the read_id @@ -2901,12 +3774,11 @@ def compute_de_novo_read_stats( reg_start = reg_data.start if reg_data is not None else r_data.start reg_size = (reg_data.end - reg_data.start if reg_data is not None else r_data.end - r_data.start) - if gnm_begin_lag is None or gnm_end_lag is None: - dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 - gnm_begin_lag = (std_ref.central_pos if r_data.strand == '+' else - dnstrm_bases) - gnm_end_lag = (dnstrm_bases if r_data.strand == '+' else - 
std_ref.central_pos) + dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 + gnm_begin_lag = (std_ref.central_pos if r_data.strand == '+' else + dnstrm_bases) + gnm_end_lag = (dnstrm_bases if r_data.strand == '+' else + std_ref.central_pos) def de_novo_clip_and_flip(): with h5py.File(r_data.fn, 'r') as fast5_data: @@ -2949,8 +3821,8 @@ def de_novo_clip_and_flip(): raise th.TomboError( 'Read does not contain information in this region.') - r_ref_means, r_ref_sds, _, _ = get_ref_from_seq( - r_seq, std_ref, r_data.strand == '-') + r_ref_means, r_ref_sds = std_ref.get_exp_levels_from_seq( + r_seq, r_data.strand == '-') if r_data.strand == '-': # reverse means to match genomic order @@ -2979,11 +3851,12 @@ def de_novo_clip_and_flip(): r_poss = np.arange(read_start, read_end) - return r_pvals, r_poss, read_id + return {DE_NOVO_TXT:r_pvals}, {DE_NOVO_TXT:r_poss}, read_id -def calc_llh_ratio(reg_means, reg_ref_means, reg_ref_vars, - reg_alt_means, reg_alt_vars): - """Compute log likelihood ratio. This is about 10X slower than the cython version in tombo._c_helper, but has been kept for debugging purposes. +def calc_llh_ratio( + reg_means, reg_ref_means, reg_ref_vars, reg_alt_means, reg_alt_vars): + """Compute log likelihood ratio. This is about 10X slower than the cython + version in tombo._c_helper, but has been kept for debugging purposes. """ # compute log likelihood ratio # positive value means standard base fits data better @@ -2993,19 +3866,107 @@ def calc_llh_ratio(reg_means, reg_ref_means, reg_ref_vars, np.sum(np.square(reg_means - reg_ref_means) / reg_ref_vars) + np.sum(np.log(reg_ref_vars))) +def trim_seq_and_means( + seq, means, r_start, reg_start, reg_end, strand, + kmer_width, central_pos, max_motif_bb, max_motif_ab): + """Return trimmed arrays to the genomic region specified. 
+ + Args: + seq: read genomic sequence (read-centric for rev strand) + means: read base level means + r_start: genomic start position + reg_start: genomic region start + reg_end: genomic region end + strand: read mapped strand + kmer_width: standard and alt k-mer width + central_pos: central position within standard k-mer + max_motif_bb: maximum (over alt models) bases required upstream for + motif search + max_motif_ab: maximum (over alt models) bases required downstream for + motif search + + Arrays are: + 1) read centric k-mers (for expected level lookup) + 2) read-centric k-mer model-able means + 3) genome-centric alt test-able start + 4) motif search seq + """ + r_end = r_start + means.shape[0] + # save un-clipped motif search seq + motif_search_seq = seq + + # clip read if it extends outside the current genomic region, so + # stats are only computed within this region + num_start_clip, num_end_clip = 0, 0 + if r_start + kmer_width - 1 < reg_start: + if strand == '+': + num_start_clip = reg_start - (r_start + kmer_width - 1) + else: + num_end_clip = reg_start - (r_start + kmer_width - 1) + r_start = reg_start - (kmer_width - 1) + if r_end - kmer_width + 1 > reg_end: + if strand == '+': + num_end_clip = r_end - kmer_width + 1 - reg_end + else: + num_start_clip = r_end - kmer_width + 1 - reg_end + + # clip sequence to that required for expected level lookups + seq = seq[num_start_clip:] + if num_end_clip > 0: + seq = seq[:-num_end_clip] + + # clip extra bits from means to test-able means + means = means[num_start_clip + central_pos:] + means = means[:-(num_end_clip + kmer_width - central_pos - 1)] + + # if this read does not cover enough of this region for stat + # alternate stat computation raise an error to be handled below + if means.shape[0] < kmer_width: + raise th.TomboError('Read sequence too short in this region.') + + kmers = th.get_seq_kmers(seq, kmer_width) + if len(kmers) != means.shape[0]: + raise th.TomboError('Mismatching k-mer and mean levels.') 
+ + # shift r_start to first alt test-able position + r_start += kmer_width - 1 + + # clip and/or pad sequence for motif searching + if num_start_clip + kmer_width - 1 - max_motif_bb >= 0: + motif_search_seq = motif_search_seq[ + num_start_clip + kmer_width - 1 - max_motif_bb:] + else: + motif_search_seq = 'N' * -( + num_start_clip + kmer_width - 1 - max_motif_bb) + motif_search_seq + if num_end_clip + kmer_width - 1 - max_motif_ab >= 0: + motif_search_seq = motif_search_seq[ + :-(num_end_clip + kmer_width - 1 - max_motif_ab)] + else: + motif_search_seq = motif_search_seq + 'N' * -( + num_end_clip + kmer_width - 1 - max_motif_ab) + if (len(motif_search_seq) - max_motif_bb - max_motif_bb != + means.shape[0] - ((kmer_width - 1) * 2)): + th.TomboError('Motif search sequence not correct length.') + + return kmers, means, r_start, motif_search_seq + def compute_alt_model_read_stats( - r_data, std_ref, alt_ref, use_standard_llhr=False, reg_data=None, - gnm_begin_lag=None, gnm_end_lag=None): - """Compute signficance statistics using comparison of read signal to canonical and alternative models method for a single read within a specified genomic region. + r_data, std_ref, alt_refs, use_standard_llhr=False, reg_data=None): + """Compute signficance statistics using comparison of read signal to + canonical and alternative models method for a single read within a + specified genomic region. 
Args: r_data (:class:`tombo.tombo_helper.readData`): read data - std_ref (:class:`tombo.tombo_stats.TomboModel`): canonical expected signal level model - alt_ref (:class:`tombo.tombo_stats.TomboModel`): alternative expected signal level model - use_standard_llhr (bool): compute standard likelihood ratio; for details see https://nanoporetech.github.io/tombo/modified_base_detection.html#alternative-model-method (optional; default: False) - reg_data (:class:`tombo.tombo_helper.intervalData`): region to test (default: test whole read) - gnm_begin_lag (int): upstream genomic overhang required for k-mer lookup (optional; default compute from read strand and `std_ref`) - gnm_end_lag (int): downstream genomic overhang required for k-mer lookup (optional; default compute from read strand and `std_ref`) + std_ref (:class:`tombo.tombo_stats.TomboModel`): canonical expected + signal level model + alt_refs (list of :class:`tombo.tombo_stats.AltModel`): alternative + expected signal level models + use_standard_llhr (bool): compute standard likelihood ratio; for details + see https://nanoporetech.github.io/tombo/modified_base_detection.html#alternative-model-method + (optional; default: False) + reg_data (:class:`tombo.tombo_helper.intervalData`): region to test + (default: test whole read) Returns: Read testing results, positions tested and the read_id @@ -3015,17 +3976,21 @@ def compute_alt_model_read_stats( 3) read_id (str): read identifier """ reg_start = reg_data.start if reg_data is not None else r_data.start - reg_size = (reg_data.end - reg_data.start if reg_data is not None - else r_data.end - r_data.start) - if gnm_begin_lag is None or gnm_end_lag is None: - dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 - gnm_begin_lag = (std_ref.central_pos if r_data.strand == '+' else - dnstrm_bases) - gnm_end_lag = (dnstrm_bases if r_data.strand == '+' else - std_ref.central_pos) - - std_ref.kmer_width = gnm_begin_lag + gnm_end_lag + 1 + reg_end = reg_data.end if 
reg_data is not None else r_data.end + + # number of sequence positions to keep in order to look up both motif hits + # and expected levels + max_motif_bb = max([ + alt_ref.motif.mod_pos - 1 for _, alt_ref in alt_refs]) + max_motif_ab = max([ + alt_ref.motif.motif_len - alt_ref.motif.mod_pos + for _, alt_ref in alt_refs]) + def alt_clip_and_flip(): + """Get read information in order to test requested region of this read + + base means, seq, k-mers, genomic start and read id + """ with h5py.File(r_data.fn, 'r') as fast5_data: r_means, r_seq = th.get_multiple_slots_read_centric( fast5_data, ['norm_mean', 'base'], r_data.corr_group) @@ -3036,92 +4001,70 @@ def alt_clip_and_flip(): 'Read does not contain valid re-squiggled data.') r_seq = b''.join(r_seq).decode() - read_start = r_data.start - # clip read if it extends outside the current genomic region, so - # stats are only computed within this region - if read_start + std_ref.kmer_width - 1 < reg_start: - num_start_clip = reg_start - (read_start + std_ref.kmer_width - 1) - read_start = reg_start - (std_ref.kmer_width - 1) - if r_data.strand == '+': - r_means = r_means[num_start_clip:] - r_seq = r_seq[num_start_clip:] - else: - r_means = r_means[:-num_start_clip] - r_seq = r_seq[:-num_start_clip] - if r_data.end - (std_ref.kmer_width - 1) > reg_start + reg_size: - num_end_clip = (r_data.end - (std_ref.kmer_width - 1)) - ( - reg_start + reg_size) - if r_data.strand == '+': - r_means = r_means[:-num_end_clip] - r_seq = r_seq[:-num_end_clip] - else: - r_means = r_means[num_end_clip:] - r_seq = r_seq[num_end_clip:] - - # if this read does not cover enough of this region for stat - # computation raise an error to be handled below - if len(r_seq) < std_ref.kmer_width: - raise th.TomboError( - 'Read does not contain information in this region.') - - r_ref_means, r_ref_sds, r_alt_means, r_alt_sds = get_ref_from_seq( - r_seq, std_ref, r_data.strand == '-', alt_ref) - - if r_data.strand == '-': - # reverse means and seq to 
match genomic order - r_means = r_means[::-1] - r_seq = r_seq[::-1] - # clip means to individual tested positions - r_means = r_means[gnm_begin_lag:-gnm_end_lag] - # trim seq to positions with valid llh ratio test results - # this is shorter than the means and model - r_seq = r_seq[(std_ref.kmer_width - 1):-(std_ref.kmer_width - 1)] - read_start += std_ref.kmer_width - 1 + r_kmers, r_means, r_start, motif_search_seq = trim_seq_and_means( + r_seq, r_means, r_data.start, reg_start, reg_end, r_data.strand, + std_ref.kmer_width, std_ref.central_pos, max_motif_bb, max_motif_ab) - return (r_means, r_seq, r_ref_means, r_ref_sds, read_start, - r_alt_means, r_alt_sds, read_id) + return r_kmers, r_means, motif_search_seq, r_start, read_id - (r_means, r_seq, r_ref_means, r_ref_sds, read_start, - r_alt_means, r_alt_sds, read_id) = alt_clip_and_flip() + r_kmers, r_means, motif_search_seq, r_start, read_id = alt_clip_and_flip() + testable_len = r_means.shape[0] - std_ref.kmer_width + 1 + r_ref_means, r_ref_sds = std_ref.get_exp_levels_from_kmers(r_kmers) r_ref_vars = np.square(r_ref_sds) - r_alt_vars = np.square(r_alt_sds) - - alt_base_poss = [] - log_lh_ratios = [] - # note search space is clipped since all k-mers covering the position - # of interest must be valid - for alt_base_pos in re.finditer(alt_ref.alt_base, r_seq): - alt_pos = alt_base_pos.start() - alt_base_poss.append(alt_pos + read_start) - pos_args = [r_means[alt_pos:alt_pos + std_ref.kmer_width], - r_ref_means[alt_pos:alt_pos + std_ref.kmer_width], - r_alt_means[alt_pos:alt_pos + std_ref.kmer_width]] - if CONST_SD_MODEL: - const_var = r_ref_vars[alt_pos] - if use_standard_llhr: - pos_lh_ratio = c_calc_llh_ratio_const_var( - *(pos_args + const_var)) + + all_poss = {} + all_llhrs = {} + for alt_name, alt_ref in alt_refs: + alt_base_poss = [] + log_lh_ratios = [] + # note search space is clipped since all k-mers covering the position + # of interest must be valid + alt_i_motif_search_seq = motif_search_seq[ + 
max_motif_bb - (alt_ref.motif.mod_pos - 1):] + if max_motif_ab - (alt_ref.motif.motif_len - alt_ref.motif.mod_pos) > 0: + alt_i_motif_search_seq = alt_i_motif_search_seq[ + :-(max_motif_ab - ( + alt_ref.motif.motif_len - alt_ref.motif.mod_pos))] + for alt_m in alt_ref.motif.motif_pat.finditer(alt_i_motif_search_seq): + alt_pos = alt_m.start() + if r_data.strand == '+': + alt_base_poss.append(r_start + alt_pos) else: - pos_lh_ratio = c_calc_scaled_llh_ratio_const_var( - *(pos_args + [const_var, OCLLHR_SCALE, - OCLLHR_HEIGHT, OCLLHR_POWER])) - else: - if use_standard_llhr: - pos_lh_ratio = c_calc_llh_ratio( - *(pos_args + [ - r_ref_vars[alt_pos:alt_pos + std_ref.kmer_width], - r_alt_vars[alt_pos:alt_pos + std_ref.kmer_width]])) + alt_base_poss.append(r_start + testable_len - alt_pos - 1) + r_pos_alt_means, r_pos_alt_sds = alt_ref.get_exp_levels_from_kmers( + r_kmers[alt_pos:alt_pos + alt_ref.kmer_width]) + pos_args = [r_means[alt_pos:alt_pos + std_ref.kmer_width], + r_ref_means[alt_pos:alt_pos + std_ref.kmer_width], + r_pos_alt_means] + if CONST_SD_MODEL: + const_var = r_ref_vars[alt_pos] + if use_standard_llhr: + pos_lh_ratio = c_calc_llh_ratio_const_var( + *(pos_args + const_var)) + else: + pos_lh_ratio = c_calc_scaled_llh_ratio_const_var( + *(pos_args + [const_var, OCLLHR_SCALE, + OCLLHR_HEIGHT, OCLLHR_POWER])) else: - raise th.TomboError( - 'Variable SD scaled likelihood ratio not implemented.') - log_lh_ratios.append(pos_lh_ratio) + if use_standard_llhr: + pos_lh_ratio = c_calc_llh_ratio( + *(pos_args + [ + r_ref_vars[alt_pos:alt_pos + std_ref.kmer_width], + np.square(r_pos_alt_sds)])) + else: + raise th.TomboError( + 'Variable SD scaled likelihood ratio not implemented.') + log_lh_ratios.append(pos_lh_ratio) - return np.array(log_lh_ratios), np.array(alt_base_poss), read_id + all_llhrs[alt_name] = np.array(log_lh_ratios) + all_poss[alt_name] = np.array(alt_base_poss) + + return all_llhrs, all_poss, read_id def apply_per_read_thresh( reg_base_stats, 
single_read_thresh, lower_thresh, stat_type, - reg_poss, ctrl_cov=None): + stat_locs, ctrl_cov=None): reg_cov = np.array([base_stats.shape[0] for base_stats in reg_base_stats]) if lower_thresh is not None: @@ -3145,10 +4088,10 @@ def apply_per_read_thresh( if stat_type == SAMP_COMP_TXT: ctrl_cov = [ctrl_cov[pos] if pos in ctrl_cov else 0 - for pos in reg_poss] + for pos in stat_locs] else: # convert to list since python2 repeat objects can't be pickled - ctrl_cov = list(repeat(0, reg_poss.shape[0])) + ctrl_cov = list(repeat(0, stat_locs.shape[0])) reg_frac_std_base = np.array([ np.greater_equal( @@ -3158,27 +4101,80 @@ def apply_per_read_thresh( return reg_frac_std_base, reg_cov, ctrl_cov, valid_cov -def compute_reg_stats( - reg_data, fm_offset, min_test_reads, - single_read_thresh, lower_thresh, ctrl_reg_data, std_ref, - alt_ref, use_standard_llhr, per_read_q, stat_type, prior_weights): - if stat_type == SAMP_COMP_TXT: - ctrl_means, ctrl_sds, ctrl_cov = get_reads_ref( - ctrl_reg_data, min_test_reads, fm_offset, std_ref, prior_weights) - else: +def collate_reg_stats( + stats, stat_locs, read_ids, per_read_q, reg_data, single_read_thresh, + lower_thresh, stat_type, stat_name, ctrl_cov): + stats = np.concatenate(stats) + stat_locs = np.concatenate(stat_locs) + # remove nans possibly introduced by fisher's method calculcations + valid_poss = ~np.isnan(stats) + stat_locs = stat_locs[valid_poss] + stats = stats[valid_poss] + assert stat_locs.shape[0] == stats.shape[0], '\t'.join(map(str, ( + stat_locs.shape[0], stats.shape[0]))) + + if per_read_q is not None: + valid_read_ids = [ + rep_r_id for rep_r_id, is_valid in zip( + [rep_r_id for r_id, r_len in read_ids + for rep_r_id in repeat(r_id, r_len)], valid_poss) if is_valid] + read_id_lookup = dict(( + (read_id, read_id_val) + for read_id_val, read_id in enumerate(set(valid_read_ids)))) + conv_read_ids = np.array([ + read_id_lookup[r_id] for r_id in valid_read_ids]) + assert conv_read_ids.shape[0] == stat_locs.shape[0] + 
per_read_block = np.array( + list(zip(stat_locs, stats, conv_read_ids)), + dtype=[(str('pos'), 'u4'), (str('stat'), 'f8'), + (str('read_id'), 'u4')]) + per_read_q.put(( + stat_name, (per_read_block, read_id_lookup, reg_data.chrm, + reg_data.strand, reg_data.start))) + + # get order of all bases from position array + as_stat_locs = np.argsort(stat_locs) + # sort all positions from all reads + stat_locs = stat_locs[as_stat_locs] + # get unique tested genomic positions across all reads + us_stat_locs = np.unique(stat_locs) + + if stat_locs.shape[0] == 0: + raise th.TomboError('No valid positions in this region.') + + # then sort the stats array by genomic position and + # split into stats by genomic base position + reg_base_stats = np.split( + stats[as_stat_locs], + np.where(np.concatenate([[0,], np.diff(stat_locs)]) > 0)[0]) + + reg_frac_std_base, reg_cov, ctrl_cov, valid_cov = apply_per_read_thresh( + reg_base_stats, single_read_thresh, lower_thresh, + stat_type, stat_locs, ctrl_cov) + + return th.regionStats( + reg_frac_std_base, us_stat_locs, reg_data.chrm, reg_data.strand, + reg_data.start, reg_cov, ctrl_cov, valid_cov) + +def compute_reg_stats( + reg_data, fm_offset, min_test_reads, + single_read_thresh, lower_thresh, ctrl_reg_data, std_ref, + alt_refs, use_standard_llhr, per_read_q, stat_type, prior_weights): + if stat_type == SAMP_COMP_TXT: + ctrl_means, ctrl_sds, ctrl_cov = get_reads_ref( + ctrl_reg_data, min_test_reads, fm_offset, std_ref, prior_weights) + else: # TODO get region sequence and expected levels/sds here # instead of for each read # after that add per-read stat computation to API ctrl_cov = None - # compute begin and end lag wrt the genome from upstream and downstream - # which are wrt to the read - dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 - gnm_begin_lag = ( - std_ref.central_pos if reg_data.strand == '+' else dnstrm_bases) - gnm_end_lag = ( - dnstrm_bases if reg_data.strand == '+' else std_ref.central_pos) - - 
reg_read_stats, reg_poss, reg_ids = [], [], [] + + # store multiple alt references lists (default to stat_type for de novo or + # sample comp testing) + stat_names = [stat_type,] if stat_type != ALT_MODEL_TXT else list( + zip(*alt_refs))[0] + reg_read_stats, stat_locs, reg_ids = [ + dict((stat_name, []) for stat_name in stat_names) for _ in range(3)] for r_data in reg_data.reads: try: if stat_type == SAMP_COMP_TXT: @@ -3186,89 +4182,207 @@ def compute_reg_stats( r_data, ctrl_means, ctrl_sds, fm_offset, reg_data) elif stat_type == DE_NOVO_TXT: r_stats, r_poss, read_id = compute_de_novo_read_stats( - r_data, std_ref, fm_offset, reg_data, - gnm_begin_lag, gnm_end_lag) + r_data, std_ref, fm_offset, reg_data) else: r_stats, r_poss, read_id = compute_alt_model_read_stats( - r_data, std_ref, alt_ref, use_standard_llhr, - reg_data, gnm_begin_lag, gnm_end_lag) + r_data, std_ref, alt_refs, use_standard_llhr, reg_data) except th.TomboError: continue if r_stats is None: continue - reg_read_stats.append(r_stats) - reg_poss.append(r_poss) - reg_ids.append(read_id) + for stat_name, stat_r_stats in r_stats.items(): + reg_read_stats[stat_name].append(stat_r_stats) + reg_ids[stat_name].append((read_id, stat_r_stats.shape[0])) + stat_locs[stat_name].append(r_poss[stat_name]) + + if sum(len(stat_reg_read_stats) + for stat_reg_read_stats in reg_read_stats.values()) == 0: + raise th.TomboError('Reads contains no statistics in this region.') + + reg_stats = [ + (stat_name, collate_reg_stats( + stat_name_stats, stat_locs[stat_name], reg_ids[stat_name], + per_read_q, reg_data, single_read_thresh, lower_thresh, stat_type, + stat_name, ctrl_cov)) + for stat_name, stat_name_stats in reg_read_stats.items()] - if len(reg_read_stats) == 0: - raise th.TomboError('Read contains no statistics in this region.') + return reg_stats - if per_read_q is not None: - # compile read_ids vector for per-read output - reg_ids = [(r_id, r_poss.shape[0]) - for r_id, r_poss, in zip(reg_ids, reg_poss)] - 
reg_read_stats = np.concatenate(reg_read_stats) - reg_poss = np.concatenate(reg_poss) - # remove nans possibly introduced by fisher's method calculcations - valid_poss = ~np.isnan(reg_read_stats) - reg_poss = reg_poss[valid_poss] - reg_read_stats = reg_read_stats[valid_poss] - assert reg_poss.shape[0] == reg_read_stats.shape[0], '\t'.join(map(str, ( - reg_poss.shape[0], reg_read_stats.shape[0]))) +################################### +########## Level Testing ########## +################################### - if per_read_q is not None: - valid_reg_ids = [ - rep_r_id for rep_r_id, is_valid in zip( - [rep_r_id for r_id, r_len in reg_ids - for rep_r_id in repeat(r_id, r_len)], valid_poss) if is_valid] - read_id_lookup = dict(( - (read_id, read_id_val) - for read_id_val, read_id in enumerate(set(valid_reg_ids)))) - conv_reg_ids = np.array([ - read_id_lookup[r_id] for r_id in valid_reg_ids]) - assert conv_reg_ids.shape[0] == reg_poss.shape[0] - per_read_block = np.array( - list(zip(reg_poss, reg_read_stats, conv_reg_ids)), - dtype=[(str('pos'), 'u4'), (str('stat'), 'f8'), - (str('read_id'), 'u4')]) - per_read_q.put(( - per_read_block, read_id_lookup, reg_data.chrm, - reg_data.strand, reg_data.start)) +def compute_ks_tests(samp_base_levels, ctrl_base_levels, return_stat): + def compute_pos_ks_test(pos_samp_levels, pos_ctrl_levels): + """Compute effect size statistic or p-value of two-sample + Kolmogorov-Smirnov test - # get order of all bases from position array - as_reg_poss = np.argsort(reg_poss) - # sort all positions from all reads - reg_poss = reg_poss[as_reg_poss] - # get unique tested genomic positions across all reads - us_reg_poss = np.unique(reg_poss) + Using definition from + https://github.com/scipy/scipy/blob/v0.14.0/scipy/stats/stats.py#L3886 + """ + samp_n, ctrl_n = pos_samp_levels.shape[0], pos_ctrl_levels.shape[0] + pos_all_levels = np.concatenate([pos_samp_levels, pos_ctrl_levels]) + samp_cdf = np.searchsorted( + pos_samp_levels, pos_all_levels, 
side='right') / samp_n + ctrl_cdf = np.searchsorted( + pos_ctrl_levels, pos_all_levels, side='right') / ctrl_n + d = np.max(np.absolute(samp_cdf - ctrl_cdf)) + if return_stat: + # subtract 1 so most significant are smallest values + return 1 - d + en = np.sqrt(samp_n * ctrl_n / float(samp_n + ctrl_n)) + return stats.distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * d) + + + samp_valid_indices = np.logical_not(np.isnan(samp_base_levels)) + ctrl_valid_indices = np.logical_not(np.isnan(ctrl_base_levels)) + return np.array([compute_pos_ks_test( + np.sort(pos_samp_levels[samp_valid_indices[i]]), + np.sort(pos_ctrl_levels[ctrl_valid_indices[i]])) + for i, (pos_samp_levels, pos_ctrl_levels) in enumerate(zip( + samp_base_levels, ctrl_base_levels))]) + +def compute_u_tests(samp_base_levels, ctrl_base_levels, return_stat): + def compute_pos_u_test(pos_samp_levels, pos_ctrl_levels): + """Compute effect size statistic or p-value of two-sample u-test + """ + samp_n, ctrl_n = pos_samp_levels.shape[0], pos_ctrl_levels.shape[0] + tot_comps = samp_n * ctrl_n + pos_all_levels = np.concatenate([pos_samp_levels, pos_ctrl_levels]) + + ranks = np.empty(samp_n + ctrl_n, int) + ranks[pos_all_levels.argsort()] = np.arange(1, samp_n + ctrl_n + 1) + samp_ranks_sum = ranks[:samp_n].sum() + #ctrl_ranks_sum = ranks[samp_n:].sum() + + u1 = samp_ranks_sum - (samp_n * (samp_n + 1)) / 2 + #u2 = ctrl_ranks_sum - (ctrl_n * (ctrl_n + 1)) / 2 + u2 = tot_comps - u1 + u = min(u1, u2) + + mu = tot_comps / 2 + if return_stat: + # this will be negative (flip sign for stat transform) + return (u - mu) / mu + + rhou = np.sqrt(tot_comps * (tot_comps + 1) / 12) + z = (u - mu) / rhou + return stats.norm.cdf(z) * 2.0 + + + samp_valid_indices = np.logical_not(np.isnan(samp_base_levels)) + ctrl_valid_indices = np.logical_not(np.isnan(ctrl_base_levels)) + return np.array([compute_pos_u_test( + np.sort(pos_samp_levels[samp_valid_indices[i]]), + np.sort(pos_ctrl_levels[ctrl_valid_indices[i]])) + for i, 
(pos_samp_levels, pos_ctrl_levels) in enumerate(zip( + samp_base_levels, ctrl_base_levels))]) + +def compute_t_tests(samp_base_levels, ctrl_base_levels, return_stat): + def compute_pos_t_test(pos_samp_levels, pos_ctrl_levels): + """Compute effect size statistic or p-value of two-sample t-test + """ + samp_n, ctrl_n = pos_samp_levels.shape[0], pos_ctrl_levels.shape[0] + tot_comps = samp_n * ctrl_n + + samp_mean, samp_sd = c_mean_std(pos_samp_levels) + ctrl_mean, ctrl_sd = c_mean_std(pos_ctrl_levels) + + if return_stat: + # this will be negative (flip sign for stat transform) + return -np.abs(samp_mean - ctrl_mean) / np.sqrt( + ((samp_sd ** 2) + (ctrl_sd ** 2)) / 2) + + sp = np.sqrt((((samp_n - 1) * (samp_sd ** 2)) + + (ctrl_n - 1) * (ctrl_sd ** 2)) / + (samp_n + ctrl_n - 2)) + t = -np.abs(samp_mean - ctrl_mean) / ( + sp * np.sqrt((1 / samp_n) + (1 / ctrl_n))) + + # t dist with samp_n + ctrl_n - 2 d.o.f. + return stats.t.cdf(t, samp_n + ctrl_n - 2) * 2.0 + + + samp_valid_indices = np.logical_not(np.isnan(samp_base_levels)) + ctrl_valid_indices = np.logical_not(np.isnan(ctrl_base_levels)) + return np.array([compute_pos_t_test( + np.sort(pos_samp_levels[samp_valid_indices[i]]), + np.sort(pos_ctrl_levels[ctrl_valid_indices[i]])) + for i, (pos_samp_levels, pos_ctrl_levels) in enumerate(zip( + samp_base_levels, ctrl_base_levels))]) + +def compute_group_reg_stats( + reg_data, ctrl_reg_data, fm_offset, min_test_reads, stat_type): + samp_base_levels = reg_data.copy().update( + start=reg_data.start - fm_offset, + end=reg_data.end + fm_offset).get_base_levels() + ctrl_base_levels = ctrl_reg_data.copy().update( + start=ctrl_reg_data.start - fm_offset, + end=ctrl_reg_data.end + fm_offset).get_base_levels() - if reg_poss.shape[0] == 0: - raise th.TomboError('No valid positions in this region.') + # get regions with coverage greater than min_test_reads + samp_cov = np.logical_not(np.isnan(samp_base_levels)).sum(axis=1) + ctrl_cov = 
np.logical_not(np.isnan(ctrl_base_levels)).sum(axis=1) + cov_regs = np.where(np.diff(np.concatenate([[False,], np.logical_and( + np.greater_equal(samp_cov, min_test_reads), + np.greater_equal(ctrl_cov, min_test_reads)), [False,]])))[0] + if len(cov_regs) == 0: + return [] - # then sort the stats array by genomic position and - # split into stats by genomic base position - reg_base_stats = np.split( - reg_read_stats[as_reg_poss], - np.where(np.concatenate([[0,], np.diff(reg_poss)]) > 0)[0]) + reg_stats, reg_poss, reg_cov, reg_ctrl_cov = [], [], [], [] + for cov_start, cov_end in zip(cov_regs[:-1:2], cov_regs[1::2]): + if cov_end - cov_start < (fm_offset * 2) + 1: continue + if stat_type in (KS_TEST_TXT, KS_STAT_TEST_TXT): + cov_reg_stats = compute_ks_tests( + samp_base_levels[cov_start:cov_end], + ctrl_base_levels[cov_start:cov_end], + stat_type == KS_STAT_TEST_TXT) + elif stat_type in (U_TEST_TXT, U_STAT_TEST_TXT): + cov_reg_stats = compute_u_tests( + samp_base_levels[cov_start:cov_end], + ctrl_base_levels[cov_start:cov_end], + stat_type == U_STAT_TEST_TXT) + elif stat_type in (T_TEST_TXT, T_STAT_TEST_TXT): + cov_reg_stats = compute_t_tests( + samp_base_levels[cov_start:cov_end], + ctrl_base_levels[cov_start:cov_end], + stat_type == T_STAT_TEST_TXT) + else: + raise NotImplementedError('Unrecognized test type.') + if fm_offset > 0: + if stat_type in (KS_TEST_TXT, U_TEST_TXT, T_TEST_TXT): + cov_reg_stats = calc_window_fishers_method( + cov_reg_stats, fm_offset) + else: + cov_reg_stats = calc_window_means(cov_reg_stats, fm_offset) + reg_stats.append(cov_reg_stats) + reg_poss.append(np.arange(reg_data.start - fm_offset + cov_start, + reg_data.start - fm_offset + cov_end)) + reg_cov.append(samp_cov[cov_start:cov_end]) + reg_ctrl_cov.append(ctrl_cov[cov_start:cov_end]) - (reg_frac_std_base, reg_cov, ctrl_cov, valid_cov) = apply_per_read_thresh( - reg_base_stats, single_read_thresh, lower_thresh, - stat_type, reg_poss, ctrl_cov) + return [(stat_type, th.groupStats( + 
np.concatenate(reg_stats), np.concatenate(reg_poss), + reg_data.chrm, reg_data.strand, reg_data.start, + np.concatenate(reg_cov), np.concatenate(reg_ctrl_cov))),] - return reg_frac_std_base, us_reg_poss, reg_cov, ctrl_cov, valid_cov + +############################################## +########## Testing Multi-processing ########## +############################################## def _test_signif_worker( region_q, stats_q, progress_q, per_read_q, reads_index, fm_offset, min_test_reads, single_read_thresh, lower_thresh, ctrl_reads_index, - std_ref, alt_ref, use_standard_llhr, stat_type, prior_weights): + std_ref, alt_refs, use_standard_llhr, stat_type, prior_weights): ctrl_reg_data = None - while not region_q.empty(): + while True: try: reg_data = region_q.get(block=False) except queue.Empty: # sometimes throws false empty error with get(block=False) - if not region_q.empty(): - continue + sleep(0.01) + if not region_q.empty(): continue break if ctrl_reads_index is not None: @@ -3279,16 +4393,20 @@ def _test_signif_worker( continue try: - (reg_frac_std_base, reg_poss, - reg_cov, ctrl_cov, valid_cov) = compute_reg_stats( - reg_data, fm_offset, min_test_reads, single_read_thresh, - lower_thresh, ctrl_reg_data, std_ref, alt_ref, - use_standard_llhr, per_read_q, stat_type, prior_weights) - stats_q.put(th.regionStats( - reg_frac_std_base, reg_poss, reg_data.chrm, reg_data.strand, - reg_data.start, reg_cov, ctrl_cov, valid_cov)) + if stat_type in (ALT_MODEL_TXT, DE_NOVO_TXT, SAMP_COMP_TXT): + stat_type_reg_stats = compute_reg_stats( + reg_data, fm_offset, min_test_reads, single_read_thresh, + lower_thresh, ctrl_reg_data, std_ref, alt_refs, + use_standard_llhr, per_read_q, stat_type, prior_weights) + else: + stat_type_reg_stats = compute_group_reg_stats( + reg_data, ctrl_reg_data, fm_offset, min_test_reads, + stat_type) except th.TomboError: - pass + progress_q.put(1) + continue + for stat_type_i_reg_stats in stat_type_reg_stats: + stats_q.put(stat_type_i_reg_stats) 
progress_q.put(1) return @@ -3301,14 +4419,226 @@ def _test_signif_worker(*args): filename='test_signif.prof') return +def _get_stats_queue( + stats_q, stats_conn, min_test_reads, stats_file_bn, + stat_type, alt_names, reg_size, cov_damp_counts, num_most_signif): + # multiple files with alt_names + all_stats = {} + if stat_type == ALT_MODEL_TXT: + for alt_name in alt_names: + all_stats[alt_name] = ModelStats( + stats_file_bn + '.' + alt_name + '.tombo.stats', + stat_type=stat_type, region_size=reg_size, + cov_damp_counts=cov_damp_counts, cov_thresh=min_test_reads, + num_most_signif=num_most_signif) + elif stat_type in (DE_NOVO_TXT, SAMP_COMP_TXT): + all_stats[stat_type] = ModelStats( + stats_file_bn + '.tombo.stats', + stat_type=stat_type, region_size=reg_size, + cov_damp_counts=cov_damp_counts, cov_thresh=min_test_reads, + num_most_signif=num_most_signif) + else: + all_stats[stat_type] = LevelStats( + stats_file_bn + '.tombo.stats', + stat_type=stat_type, region_size=reg_size, + cov_thresh=min_test_reads, num_most_signif=num_most_signif) + + while True: + try: + stat_name, reg_stats = stats_q.get(block=False) + all_stats[stat_name]._write_stat_block(reg_stats) + except queue.Empty: + # wait for main process to send indicator that all regions + # have been processed + if stats_conn.poll(): + sleep(0.1) + break + sleep(0.1) + continue + + # Clear leftover values from queues + while not stats_q.empty(): + stat_name, reg_stats = stats_q.get(block=False) + all_stats[stat_name]._write_stat_block(reg_stats) + + for stat_name_all_stats in all_stats.values(): + stat_name_all_stats.close() + stats_conn.send(True) + + return + +if _PROFILE_SIGNIF_STATS_OUT: + _get_stats_queue_wrapper = _get_stats_queue + def _get_stats_queue(*args): + import cProfile + cProfile.runctx('_get_stats_queue_wrapper(*args)', globals(), locals(), + filename='test_signif_stats_out.prof') + return + +def _get_per_read_queue( + per_read_q, per_read_conn, per_read_bn, stat_type, alt_names, + 
region_size): + per_read_stats = {} + if stat_type == ALT_MODEL_TXT: + for alt_name in alt_names: + per_read_stats[alt_name] = PerReadStats( + per_read_bn + '.' + alt_name + '.tombo.per_read_stats', + stat_type, region_size) + else: + per_read_stats[stat_type] = PerReadStats( + per_read_bn + '.tombo.per_read_stats', + stat_type, region_size) + + while True: + try: + stat_name, per_read_block = per_read_q.get(block=False) + per_read_stats[stat_name]._write_per_read_block(*per_read_block) + del per_read_block + except queue.Empty: + if per_read_conn.poll(): + sleep(0.1) + break + sleep(0.1) + continue + + # Clear leftover values from queues + while not per_read_q.empty(): + stat_name, per_read_block = per_read_q.get(block=False) + per_read_stats[stat_name]._write_per_read_block(*per_read_block) + del per_read_block + for stat_name_per_read_stats in per_read_stats.values(): + stat_name_per_read_stats.close() + + # indicate that the process has closed + per_read_conn.send(True) + + return + +if _PROFILE_SIGNIF_PER_READ: + _get_per_read_queue_wrapper = _get_per_read_queue + def _get_per_read_queue(*args): + import cProfile + cProfile.runctx( + '_get_per_read_queue_wrapper(*args)', globals(), locals(), + filename='test_signif_per_read.prof') + return + +def _get_progress_queue(progress_q, prog_conn, num_regions): + th.status_message( + 'Performing modified base detection across genomic regions.') + bar = tqdm(total=num_regions, smoothing=0) + + tot_num_rec_proc = 0 + while True: + try: + iter_val = progress_q.get(block=False) + tot_num_rec_proc += iter_val + bar.update(iter_val) + except queue.Empty: + if prog_conn.poll(): + break + sleep(0.1) + continue + + bar.close() + prog_conn.send(tot_num_rec_proc) + + return + +def test_significance( + reads_index, stat_type, stats_file_bn, region_size, num_processes, + min_test_reads, num_most_signif, per_read_bn=None, + single_read_thresh=None, lower_thresh=None, cov_damp_counts=None, + fm_offset=None, ctrl_reads_index=None, 
std_ref=None, alt_refs=None, + use_standard_llhr=False, prior_weights=None): + """Test for significant shifted signal in mutliprocessed batches + """ + region_q = Queue() + stats_q = Queue(STAT_BLOCKS_QUEUE_LIMIT) + progress_q = Queue() + per_read_q = Queue(STAT_BLOCKS_QUEUE_LIMIT) \ + if per_read_bn else None + # split chromosomes into separate regions to process independently + num_regions = 0 + # TODO add min coverage when adding ctrl coverage (instead of adding + # as is done currently) + for chrm, strand, reg_start in reads_index.iter_cov_regs( + 1, region_size, ctrl_reads_index): + region_q.put(th.intervalData( + chrm=chrm, start=reg_start, end=reg_start + region_size, + strand=strand)) + num_regions += 1 + # wait for queue items to register in queue and avoid queue appearing empty + sleep(0.1) + + test_args = ( + region_q, stats_q, progress_q, per_read_q, reads_index, fm_offset, + min_test_reads, single_read_thresh, lower_thresh, ctrl_reads_index, + std_ref, alt_refs, use_standard_llhr, stat_type, prior_weights) + test_ps = [] + for p_id in range(num_processes): + p = Process(target=_test_signif_worker, args=test_args) + p.start() + test_ps.append(p) + + # start queue getter processes + if VERBOSE: + main_prog_conn, prog_conn = Pipe() + prog_p = Process(target=_get_progress_queue, + args=(progress_q, prog_conn, num_regions)) + prog_p.daemon = True + prog_p.start() + + # main region stats queue getter + alt_names = None if alt_refs is None else list(zip(*alt_refs))[0] + main_stats_conn, stats_conn = Pipe() + stats_p = Process(target=_get_stats_queue, args=( + stats_q, stats_conn, min_test_reads, stats_file_bn, + stat_type, alt_names, region_size, cov_damp_counts, + num_most_signif)) + stats_p.daemon = True + stats_p.start() + + # per-read stats queue getter + if per_read_bn is not None: + main_per_read_conn, per_read_conn = Pipe() + per_read_p = Process( + target=_get_per_read_queue, + args=(per_read_q, per_read_conn, per_read_bn, stat_type, + alt_names, 
region_size)) + per_read_p.daemon = True + per_read_p.start() + + # wait for test processes to finish + for test_p in test_ps: + test_p.join() + + # in a very unlikely case the progress queue could die while the + # main process remains active and thus we would have a deadlock here + if VERBOSE and prog_p.is_alive(): + # send signal to getter queue to finish and return results + main_prog_conn.send(True) + # returns total number of processed reads if that is needed + main_prog_conn.recv() + + if per_read_bn is not None: + main_per_read_conn.send(True) + main_per_read_conn.recv() + + main_stats_conn.send(True) + main_stats_conn.recv() + + return + ################################################ ########## Aggregate Multi-processing ########## ################################################ -def _write_stats(stats_q, stats_fn, stat_type, region_size, cov_damp_counts, - min_test_reads, num_most_signif, num_blocks, num_processes): - all_stats = TomboStats( +def _write_stats( + stats_q, stats_fn, stat_type, region_size, cov_damp_counts, + min_test_reads, num_most_signif, num_blocks, num_processes): + all_stats = ModelStats( stats_fn, stat_type=stat_type, region_size=region_size, cov_damp_counts=cov_damp_counts, cov_thresh=min_test_reads, num_most_signif=num_most_signif) @@ -3421,199 +4751,96 @@ def aggregate_per_read_stats( return -############################################## -########## Testing Multi-processing ########## -############################################## - -def _get_stats_queue(stats_q, stats_conn, min_test_reads, stats_file_bn, - alt_name, stat_type, reg_size, cov_damp_counts, - num_most_signif): - stats_fn = stats_file_bn + '.tombo.stats' if alt_name is None else \ - stats_file_bn + '.' 
+ alt_name + '.tombo.stats' - all_stats = TomboStats( - stats_fn, stat_type=stat_type, region_size=reg_size, - cov_damp_counts=cov_damp_counts, cov_thresh=min_test_reads, - num_most_signif=num_most_signif) - while True: - try: - reg_stats = stats_q.get(block=False) - all_stats._write_stat_block(reg_stats) - except queue.Empty: - # wait for main process to send indicator that all regions - # have been processed - if stats_conn.poll(): - sleep(0.1) - break - sleep(0.1) - continue +########################## +##### Main Functions ##### +########################## - # Clear leftover values from queues - while not stats_q.empty(): - reg_stats = stats_q.get(block=False) - all_stats._write_stat_block(reg_stats) +def _est_ref_main(args): + global VERBOSE + VERBOSE = not args.quiet + th.VERBOSE = VERBOSE - if all_stats.is_empty: + if min(args.upstream_bases, args.downstream_bases) == 0: th.error_message_and_exit( - 'No genomic positions contain --minimum-test-reads.') + 'Context upstream and downstream must be greater ' + + 'than 0 for model estimation.') - all_stats.close() - stats_conn.send(True) + std_ref = estimate_kmer_model( + args.fast5_basedirs, args.corrected_group, args.basecall_subgroups, + args.minimum_test_reads, args.upstream_bases, args.downstream_bases, + args.minimum_kmer_observations, args.kmer_specific_sd, + args.coverage_threshold, args.estimate_mean, + args.multiprocess_region_size, args.processes) + std_ref.write_model(args.tombo_model_filename) return -def _get_per_read_queue( - per_read_q, per_read_conn, per_read_fn, stat_type, region_size): - per_read_stats = PerReadStats(per_read_fn, stat_type, region_size) - - while True: - try: - per_read_block = per_read_q.get(block=False) - per_read_stats._write_per_read_block(*per_read_block) - del per_read_block - except queue.Empty: - if per_read_conn.poll(): - sleep(0.1) - break - sleep(0.1) - continue - - # Clear leftover values from queues - while not per_read_q.empty(): - per_read_block = 
per_read_q.get(block=False) - per_read_stats._write_per_read_block(*per_read_block) - del per_read_block - per_read_stats.close() +def _est_alt_ref_main(args): + global VERBOSE + VERBOSE = not args.quiet + th.VERBOSE = VERBOSE - # indicate that the process has closed - per_read_conn.send(True) + alt_ref = estimate_alt_model( + args.fast5_basedirs, args.control_fast5_basedirs, + args.corrected_group, args.basecall_subgroups, + args.tombo_model_filename, args.seq_sample_type, + args.alternate_model_base, args.alt_fraction_percentile, + args.minimum_kmer_observations, args.save_density_basename, + args.kernel_density_bandwidth, args.alternate_density_filename, + args.control_density_filename, args.processes) + # returns None when profiling method + if alt_ref is None: return + alt_ref.name = args.alternate_model_name + alt_ref.write_model(args.alternate_model_filename) return -def _get_progress_queue(progress_q, prog_conn, num_regions): - th.status_message( - 'Performing modified base detection across genomic regions.') - bar = tqdm(total=num_regions, smoothing=0) - - tot_num_rec_proc = 0 - while True: - try: - iter_val = progress_q.get(block=False) - tot_num_rec_proc += iter_val - bar.update(iter_val) - except queue.Empty: - if prog_conn.poll(): - break - sleep(0.1) - continue +def _est_motif_alt_ref_main(args): + global VERBOSE + VERBOSE = not args.quiet + th.VERBOSE = VERBOSE - bar.close() - prog_conn.send(tot_num_rec_proc) + alt_ref = estimate_motif_alt_model( + args.fast5_basedirs, args.corrected_group, args.basecall_subgroups, + args.motif_description, args.upstream_bases, args.downstream_bases, + args.valid_locations_filename, args.minimum_kmer_observations, + args.minimum_test_reads, args.coverage_threshold, + args.multiprocess_region_size, args.processes) + alt_ref.name = args.alternate_model_name + alt_ref.write_model(args.alternate_model_filename) return -def test_significance( - reads_index, stat_type, per_read_bn, stats_file_bn, - single_read_thresh, 
lower_thresh, region_size, num_processes, - min_test_reads, cov_damp_counts, num_most_signif, - fm_offset=None, ctrl_reads_index=None, std_ref=None, alt_ref=None, - use_standard_llhr=False, alt_name=None, prior_weights=None): - """Test for significant shifted signal in mutliprocessed batches - """ - region_q = Queue() - stats_q = Queue(STAT_BLOCKS_QUEUE_LIMIT) - progress_q = Queue() - per_read_q = Queue(STAT_BLOCKS_QUEUE_LIMIT) \ - if per_read_bn else None - # split chromosomes into separate regions to process independently - chrm_sizes = th.get_chrm_sizes(reads_index, ctrl_reads_index) - num_regions = 0 - for chrm, chrm_len in chrm_sizes.items(): - # only process regions covered by both samples if control - # reads are provided - plus_covered = ( - (chrm, '+') in reads_index and - (ctrl_reads_index is None or (chrm, '+') in ctrl_reads_index)) - minus_covered = ( - (chrm, '-') in reads_index and - (ctrl_reads_index is None or (chrm, '-') in ctrl_reads_index)) - for reg_start in range(0, chrm_len, region_size): - if plus_covered: - region_q.put(th.intervalData( - chrm=chrm, start=reg_start, end=reg_start + region_size, - strand='+')) - num_regions += 1 - if minus_covered: - region_q.put(th.intervalData( - chrm=chrm, start=reg_start, end=reg_start + region_size, - strand='-')) - num_regions += 1 - - test_args = ( - region_q, stats_q, progress_q, per_read_q, reads_index, fm_offset, - min_test_reads, single_read_thresh, lower_thresh, ctrl_reads_index, - std_ref, alt_ref, use_standard_llhr, stat_type, prior_weights) - test_ps = [] - for p_id in range(num_processes): - p = Process(target=_test_signif_worker, args=test_args) - p.start() - test_ps.append(p) - - # start queue getter processes - if VERBOSE: - main_prog_conn, prog_conn = Pipe() - prog_p = Process(target=_get_progress_queue, - args=(progress_q, prog_conn, num_regions)) - prog_p.daemon = True - prog_p.start() - - # main region stats queue getter - main_stats_conn, stats_conn = Pipe() - stats_p = 
Process(target=_get_stats_queue, args=( - stats_q, stats_conn, min_test_reads, stats_file_bn, alt_name, stat_type, - region_size, cov_damp_counts, num_most_signif)) - stats_p.daemon = True - stats_p.start() - - # per-read stats queue getter - if per_read_bn is not None: - if stat_type == ALT_MODEL_TXT: - per_read_fn = per_read_bn + '.' + alt_name + '.tombo.per_read_stats' - else: - per_read_fn = per_read_bn + '.tombo.per_read_stats' - main_per_read_conn, per_read_conn = Pipe() - per_read_p = Process( - target=_get_per_read_queue, - args=(per_read_q, per_read_conn, per_read_fn, stat_type, region_size)) - per_read_p.daemon = True - per_read_p.start() - - # wait for test processes to finish - for test_p in test_ps: - test_p.join() - - # in a very unlikely case the progress queue could die while the - # main process remains active and thus we would have a deadlock here - if VERBOSE and prog_p.is_alive(): - # send signal to getter queue to finish and return results - main_prog_conn.send(True) - # returns total number of processed reads if that is needed - main_prog_conn.recv() +def _estimate_scale_main(args): + global VERBOSE + VERBOSE = not args.quiet + th.VERBOSE = VERBOSE - if per_read_bn is not None: - main_per_read_conn.send(True) - main_per_read_conn.recv() + if VERBOSE: th.status_message('Getting files list.') + try: + if not os.path.isdir(args.fast5s_basedir): + th.error_message_and_exit( + 'Provided [fast5-basedir] is not a directory.') + fast5s_basedir = ( + args.fast5s_basedir if args.fast5s_basedir.endswith('/') else + args.fast5s_basedir + '/') + fast5_fns = th.get_files_list(fast5s_basedir) + except OSError: + th.error_message_and_exit( + 'Reads base directory, a sub-directory or an old (hidden) ' + + 'index file does not appear to be accessible. 
Check ' + + 'directory permissions.') + if len(fast5_fns) < 1: + th.error_message_and_exit( + 'No files identified in the specified ' + + 'directory or within immediate subdirectories.') - main_stats_conn.send(True) - main_stats_conn.recv() + th.status_message('Global scaling estimate: ' + + unicode(estimate_global_scale(fast5_fns))) return - -########################## -##### Main Functions ##### -########################## - -def _test_shifts_de_novo_main( +def _de_novo_main( args, lower_thresh, single_read_thresh, seq_samp_type, reads_index): if seq_samp_type is None: seq_samp_type = th.get_seq_sample_type(reads_index=reads_index) @@ -3628,16 +4855,15 @@ def _test_shifts_de_novo_main( if VERBOSE: th.status_message( 'Performing de novo model testing against canonical model.') test_significance( - reads_index, stat_type, args.per_read_statistics_basename, - args.statistics_file_basename, single_read_thresh, lower_thresh, - args.multiprocess_region_size, args.processes, - args.minimum_test_reads, args.coverage_dampen_counts, - args.num_most_significant_stored, + reads_index, stat_type, args.statistics_file_basename, + args.multiprocess_region_size, args.processes, args.minimum_test_reads, + args.num_most_significant_stored, args.per_read_statistics_basename, + single_read_thresh, lower_thresh, args.coverage_dampen_counts, fm_offset=args.fishers_method_context, std_ref=std_ref) return -def _test_shifts_alt_main( +def _alt_main( args, lower_thresh, single_read_thresh, seq_samp_type, reads_index): if seq_samp_type is None: seq_samp_type = th.get_seq_sample_type(reads_index=reads_index) @@ -3656,22 +4882,20 @@ def _test_shifts_alt_main( if len(alt_refs) == 0: th.error_message_and_exit('No alternative models successfully loaded.') - for alt_name, alt_ref in alt_refs.items(): - if VERBOSE: th.status_message( - 'Performing alternative model testing against ' + - alt_name + ' model.') - test_significance( - reads_index, stat_type, args.per_read_statistics_basename, - 
args.statistics_file_basename, single_read_thresh, lower_thresh, - args.multiprocess_region_size, args.processes, - args.minimum_test_reads, args.coverage_dampen_counts, - args.num_most_significant_stored, - std_ref=std_ref, alt_ref=alt_ref, alt_name=alt_name, - use_standard_llhr=args.standard_log_likelihood_ratio) + if VERBOSE: th.status_message( + 'Performing specific alternate base(s) testing.') + test_significance( + reads_index, stat_type, args.statistics_file_basename, + args.multiprocess_region_size, args.processes, + args.minimum_test_reads, args.num_most_significant_stored, + args.per_read_statistics_basename, single_read_thresh, lower_thresh, + args.coverage_dampen_counts, + std_ref=std_ref, alt_refs=list(alt_refs.items()), + use_standard_llhr=args.standard_log_likelihood_ratio) return -def _test_shifts_samp_comp_main( +def _model_samp_comp_main( args, lower_thresh, single_read_thresh, seq_samp_type, reads_index): stat_type = SAMP_COMP_TXT if single_read_thresh is None: @@ -3690,17 +4914,41 @@ def _test_shifts_samp_comp_main( reads_index=reads_index) test_significance( - reads_index, stat_type, args.per_read_statistics_basename, - args.statistics_file_basename, single_read_thresh, lower_thresh, - args.multiprocess_region_size, args.processes, - args.minimum_test_reads, args.coverage_dampen_counts, - args.num_most_significant_stored, + reads_index, stat_type, args.statistics_file_basename, + args.multiprocess_region_size, args.processes, args.minimum_test_reads, + args.num_most_significant_stored, args.per_read_statistics_basename, + single_read_thresh, lower_thresh, args.coverage_dampen_counts, fm_offset=args.fishers_method_context, ctrl_reads_index=ctrl_reads_index, std_ref=std_ref, prior_weights=args.model_prior_weights) return +def _level_samp_comp_main(args, reads_index): + if args.statistic_type == 'ks': + stat_type = KS_TEST_TXT if args.store_p_value else KS_STAT_TEST_TXT + elif args.statistic_type == 'u': + stat_type = U_TEST_TXT if 
args.store_p_value else U_STAT_TEST_TXT + elif args.statistic_type == 't': + stat_type = T_TEST_TXT if args.store_p_value else T_STAT_TEST_TXT + else: + raise th.TomboError('Invalid statistic type.') + + if VERBOSE: th.status_message( + 'Performing two-sample group comparison significance testing.') + ctrl_reads_index = th.TomboReads( + args.alternate_fast5_basedirs, args.corrected_group, + args.basecall_subgroups) + + test_significance( + reads_index, stat_type, args.statistics_file_basename, + args.multiprocess_region_size, args.processes, + args.minimum_test_reads, args.num_most_significant_stored, + fm_offset=args.fishers_method_context, + ctrl_reads_index=ctrl_reads_index) + + return + def _test_shifts_main(args): global VERBOSE VERBOSE = not args.quiet @@ -3723,19 +4971,20 @@ def _test_shifts_main(args): 'Run with --print-available-models option to see possible ' + 'values for the --alternate-bases option.') - if args.single_read_threshold is None: - lower_thresh = None - single_read_thresh = None - elif len(args.single_read_threshold) == 1: - single_read_thresh = args.single_read_threshold[0] - lower_thresh = None - else: - if len(args.single_read_threshold) > 2: - th.warning_message( - 'Only 1 or 2 values may be passed as single-read ' + - 'thresholds. Only using the first 2 options provided.') - lower_thresh = args.single_read_threshold[0] - single_read_thresh = args.single_read_threshold[1] + if 'single_read_threshold' in args: + if args.single_read_threshold is None: + lower_thresh = None + single_read_thresh = None + elif len(args.single_read_threshold) == 1: + single_read_thresh = args.single_read_threshold[0] + lower_thresh = None + else: + if len(args.single_read_threshold) > 2: + th.warning_message( + 'Only 1 or 2 values may be passed as single-read ' + + 'thresholds. 
Only using the first 2 options provided.') + lower_thresh = args.single_read_threshold[0] + single_read_thresh = args.single_read_threshold[1] try: if args.seq_sample_type is None: @@ -3752,14 +5001,16 @@ def _test_shifts_main(args): args.fast5_basedirs, args.corrected_group, args.basecall_subgroups) if args.action_command == 'de_novo': - _test_shifts_de_novo_main( + _de_novo_main( args, lower_thresh, single_read_thresh, seq_samp_type, reads_index) elif args.action_command == 'alternative_model': - _test_shifts_alt_main( + _alt_main( args, lower_thresh, single_read_thresh, seq_samp_type, reads_index) - elif args.action_command == 'sample_compare': - _test_shifts_samp_comp_main( + elif args.action_command == 'model_sample_compare': + _model_samp_comp_main( args, lower_thresh, single_read_thresh, seq_samp_type, reads_index) + elif args.action_command == 'level_sample_compare': + _level_samp_comp_main(args, reads_index) else: th.error_message_and_exit('Invalid Tombo detect_modifications command.') @@ -3784,77 +5035,8 @@ def _aggregate_per_read_main(args): aggregate_per_read_stats( args.per_read_statistics_filename, single_read_thresh, lower_thresh, args.statistics_filename, args.coverage_dampen_counts, - args.minimum_test_reads, args.num_most_significant_stored, args.processes) - - return - -def _est_ref_main(args): - global VERBOSE - VERBOSE = not args.quiet - th.VERBOSE = VERBOSE - - if min(args.upstream_bases, args.downstream_bases) == 0: - th.error_message_and_exit( - 'Context upstream and downstream must be greater ' + - 'than 0 for model estimation.') - - estimate_kmer_model( - args.fast5_basedirs, args.corrected_group, args.basecall_subgroups, - args.tombo_model_filename, args.minimum_test_reads, - args.upstream_bases, args.downstream_bases, - args.minimum_kmer_observations, args.kmer_specific_sd, - args.coverage_threshold, args.estimate_mean, - args.multiprocess_region_size, args.processes) - - return - -def _est_alt_ref_main(args): - global VERBOSE - VERBOSE 
= not args.quiet - th.VERBOSE = VERBOSE - - alt_ref = estimate_alt_model( - args.fast5_basedirs, args.control_fast5_basedirs, - args.corrected_group, args.basecall_subgroups, - args.tombo_model_filename, args.seq_sample_type, - args.alternate_model_base, args.alt_fraction_percentile, - args.minimum_kmer_observations, args.save_density_basename, - args.kernel_density_bandwidth, args.alternate_density_filename, - args.control_density_filename, args.processes) - # returns None when profiling method - if alt_ref is None: return - alt_ref.alt_name = args.alternate_model_name - alt_ref.alt_base = args.alternate_model_base - alt_ref.write_model(args.alternate_model_filename) - - return - -def _estimate_scale_main(args): - global VERBOSE - VERBOSE = not args.quiet - th.VERBOSE = VERBOSE - - if VERBOSE: th.status_message('Getting files list.') - try: - if not os.path.isdir(args.fast5s_basedir): - th.error_message_and_exit( - 'Provided [fast5-basedir] is not a directory.') - fast5s_basedir = ( - args.fast5s_basedir if args.fast5s_basedir.endswith('/') else - args.fast5s_basedir + '/') - fast5_fns = th.get_files_list(fast5s_basedir) - except OSError: - th.error_message_and_exit( - 'Reads base directory, a sub-directory or an old (hidden) ' + - 'index file does not appear to be accessible. Check ' + - 'directory permissions.') - if len(fast5_fns) < 1: - th.error_message_and_exit( - 'No files identified in the specified ' + - 'directory or within immediate subdirectories.') - - th.status_message('Global scaling estimate: ' + - unicode(estimate_global_scale(fast5_fns))) + args.minimum_test_reads, args.num_most_significant_stored, + args.processes) return