From c7181acc88cdd8e971e4cd71d01582ae244f00d9 Mon Sep 17 00:00:00 2001 From: Muhammed Hasan Celik Date: Wed, 8 Jan 2020 16:59:57 +0100 Subject: [PATCH] compare chroms, resolves #55 --- kipoiseq/utils.py | 62 +++++++++++++++++++++++++++++++++++-- setup.py | 3 +- tests/data/test.vcf.gz | Bin 573 -> 577 bytes tests/data/test.vcf.gz.tbi | Bin 106 -> 106 bytes tests/test_3_utils.py | 36 ++++++++++++++++++++- 5 files changed, 97 insertions(+), 4 deletions(-) diff --git a/kipoiseq/utils.py b/kipoiseq/utils.py index 5610c37..5d97703 100644 --- a/kipoiseq/utils.py +++ b/kipoiseq/utils.py @@ -1,5 +1,7 @@ -import numpy as np from six import string_types +import numpy as np +import pyranges +from kipoiseq.extractors import MultiSampleVCF, FastaStringExtractor # alphabets: @@ -34,6 +36,62 @@ def parse_dtype(dtype): try: return eval(dtype) except Exception as e: - raise ValueError("Unable to parse dtype: {}. \nException: {}".format(dtype, e)) + raise ValueError( + "Unable to parse dtype: {}. \nException: {}".format(dtype, e)) else: return dtype + + +def _get_chrom_annotation(source): + if type(source) == FastaStringExtractor: + return set(source.fasta.keys()) + elif type(source) == MultiSampleVCF: + return set(source.seqnames) + elif type(source) == pyranges.PyRanges: + return set(source.Chromosome) + else: + raise ValueError('source `%s` is not valid is not valid because ' + ' source type `%s` is not supported.' + % (repr(source), type(source))) + + +def compare_chrom_annotation(sources, strategy='some', core_chroms=None): + """Compares chromosome annotations from different sources. + Throws exception iif annotations are not compatible. + + # Arguments: + sources: list of different objects. vcf, fasta, pyranges are valid. + strategy: comparison strategy. `some` means some intersection excepted + or `all` all chromosomes should be same. + core_chroms: chromosomes must exist. + + # Returns: + chroms common cross files. + + # Example: + ```python + >>> sources = [ + MultiSampleVCF(...), + FastaStringExtractor(...), + pyranges, + pyranges, + MultiSampleVCF(...) + ] + >>> compare_chrom_annotation(sources, strategy='all') + ``` + """ + if not len(sources) > 1: + raise ValueError( + 'At least two item should gived as sources to compare') + + chroms = list(map(_get_chrom_annotation, sources)) + + if strategy == 'all': + assert all(chroms[0] == i for i in chroms), \ + 'chroms annotations are not all same.' + return chroms[0] + elif strategy == 'some': + chrom_intersect = set.intersection(*chroms) + assert len(chrom_intersect) > 0, \ + 'there is not intersection between chromosomes.' + return chrom_intersect diff --git a/setup.py b/setup.py index 4779d5e..59ff134 100755 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ "jedi", "pytest>=3.3.1", "pytest-xdist", # running tests in parallel + "pytest-mock", "pytest-pep8", # see https://github.com/kipoi/kipoi/issues/91 "pytest-cov", "coveralls", @@ -36,7 +37,7 @@ # "genomelake", "keras", "tensorflow", - "pybedtools" + "pybedtools", ] setup( diff --git a/tests/data/test.vcf.gz b/tests/data/test.vcf.gz index d96030e736f3c1f0c1a78351977ff7ccb8e02699..e37d8495bb1302350865c635e79b7e8cee0d7439 100644 GIT binary patch delta 559 zcmV+~0?_@v1i=J$P6IE=N@8&kdzE?Q+4-h_cc)RfsQ03c6&(*-C_Q4u zrvkjGP_T#63P2_$D-_Fxci*Joe1|vmP;ij=7gHGysG|2BoFDbU#kry!iwl%;4i%IM zoK$g&_aJhm_0qtP&UpI<-mh^u$&W%+%nQuIx^Wn2l3{-qw`Hv=MP+~`Jpj(jRC}Tl zb0!s}-SWYIvj6DBcoMZTjS(!SmH{diaXRL~Q$QH)0D#eUY2jmGioWQFVp`*ldHAwkX`r;W%1 xX`)6ZIMm5L8vBSiSHdb?PlI0TMvqoo&#iU-XnaFjRq9Ir0#<b?tiwFb&00000{{{d;LjnLG0*#YhZ<{a_hOgaUVKv$vqL?<7T8&98lq6C@ zK49Cm11G_fV<>hyMcR*_`A8JpDGN6one#s9wf!;}Y#E1bQSGVnevZPAC)>Fj49c8J zCU#y)E5#_+D`_DV+j(E7fk!M3VyAMCJb%+)rctp4*LyMx+6K3;7x^s9aL5T zGObvtSRuUowgnek9MgTtLE>Lb@gTrZY5UOULVHVbhhJmIz_Gd{q)~Z`n4p`DX;G)X3Cn_;#QbE=qA1rU* zT>z!Z4fmnpKI8>e|MVVK@O}W19u0zT7>HG+uiQY06r`%OhZbh+;rj{UoJ>#ykZs~< ziT6-<$()rI-W(XuFmYe{tcm+y)`XUGTDljs1`$qY$=kYtyc4G{Ssf^6>WDYMi{F?y zra%r2!3?C-H%J8@I$!EM$u6Wh{Q#VuLK-gCb3bVi*snuk4GD7oI9){^NL#hSp>Fok*hj>< z5Z3KF4SKC#JzCv7x6%2d@fB%3sW1Ht-@6KFFa-brABzYC000000RIL6LPG)o8vp|U L0000000000=`{Qs diff --git a/tests/data/test.vcf.gz.tbi b/tests/data/test.vcf.gz.tbi index 303734c6182ddd2dd3bd7d460b7e875accd5632c..b385012cf090c177f01988d3925f401248d8a393 100644 GIT binary patch delta 35 rcmd1GnxHQ|S0^BnEp$WGdQXAg7mv>HtY8S$XJKGC`RDkfiCzi-_+$=t delta 35 rcmd1GnxHSeRwp2m&2>Z7dQXAg7mv>HtY8S$XJKHNdPYlgqL%^y@G}g& diff --git a/tests/test_3_utils.py b/tests/test_3_utils.py index 7520b77..201bd3e 100644 --- a/tests/test_3_utils.py +++ b/tests/test_3_utils.py @@ -1,6 +1,10 @@ +from conftest import vcf_file, sample_5kb_fasta_file, example_intervals_bed import pytest import numpy as np -from kipoiseq.utils import parse_alphabet, parse_dtype +import pyranges +from kipoiseq.extractors import FastaStringExtractor, MultiSampleVCF +from kipoiseq.utils import parse_alphabet, parse_dtype, \ + compare_chrom_annotation def test_parse_alphabet(): @@ -17,3 +21,33 @@ def test_parse_type(): assert parse_dtype('float') == float assert parse_dtype(float) == float assert parse_dtype("np.float32") == np.float32 + + +def test_compare_chrom_annotation(): + sources = [ + MultiSampleVCF(vcf_file), + FastaStringExtractor(sample_5kb_fasta_file), + pyranges.read_bed(example_intervals_bed) + ] + + with pytest.raises(ValueError): + assert compare_chrom_annotation([]) + + with pytest.raises(ValueError): + assert compare_chrom_annotation([object()]) + + assert compare_chrom_annotation(sources) == {'chr1'} + assert compare_chrom_annotation(sources, strategy='all') == {'chr1'} + + with pytest.raises(AssertionError) as exception: + sources[1].fasta = {'chr1': '', 'chr2': '', 'chr3': ''} + compare_chrom_annotation(sources, strategy='all') + + assert str(exception.value) == 'chroms annotations are not all same.' + + assert compare_chrom_annotation(sources) == {'chr1'} + + with pytest.raises(AssertionError) as exception: + sources[1].fasta = {'chr2': '', 'chr3': ''} + compare_chrom_annotation(sources) + assert str(exception.value) == 'there is not intersection between chromosomes.'