Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compare chroms, resolves #55 #56

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 60 additions & 2 deletions kipoiseq/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np
from six import string_types
import numpy as np
import pyranges
from kipoiseq.extractors import MultiSampleVCF, FastaStringExtractor


# alphabets:
Expand Down Expand Up @@ -34,6 +36,62 @@ def parse_dtype(dtype):
try:
return eval(dtype)
except Exception as e:
raise ValueError("Unable to parse dtype: {}. \nException: {}".format(dtype, e))
raise ValueError(
"Unable to parse dtype: {}. \nException: {}".format(dtype, e))
else:
return dtype


def _get_chrom_annotation(source):
    """Return the set of chromosome names annotated in `source`.

    # Arguments:
        source: a `FastaStringExtractor`, `MultiSampleVCF` or
            `pyranges.PyRanges` object (instances of subclasses are
            accepted as well).

    # Returns:
        set of chromosome names, read from `source.fasta.keys()`,
        `source.seqnames` or `source.Chromosome` respectively.

    # Raises:
        ValueError: if the type of `source` is not one of the supported types.
    """
    # isinstance instead of an exact `type(...) ==` comparison, so that
    # subclasses of the supported source types are not rejected.
    if isinstance(source, FastaStringExtractor):
        return set(source.fasta.keys())
    if isinstance(source, MultiSampleVCF):
        return set(source.seqnames)
    if isinstance(source, pyranges.PyRanges):
        return set(source.Chromosome)

    raise ValueError('source `%s` is not valid because '
                     'source type `%s` is not supported.'
                     % (repr(source), type(source)))


def compare_chrom_annotation(sources, strategy='some', core_chroms=None):
    """Compares chromosome annotations from different sources.
    Throws exception iff annotations are not compatible.

    # Arguments:
        sources: list of different objects. vcf, fasta, pyranges are valid.
        strategy: comparison strategy. `some` means some intersection is
            expected, `all` means all chromosomes should be the same.
        core_chroms: optional iterable of chromosome names (or a single
            name) which must be annotated in every source.

    # Returns:
        set of chroms common across sources.

    # Raises:
        ValueError: if fewer than two sources are given, if `strategy` is
            neither `some` nor `all`, or if a source type is not supported.
        AssertionError: if the annotations are not compatible under the
            chosen strategy, or a source lacks one of `core_chroms`.

    # Example:
        ```python
          >>> sources = [
                MultiSampleVCF(...),
                FastaStringExtractor(...),
                pyranges,
                pyranges,
                MultiSampleVCF(...)
              ]
          >>> compare_chrom_annotation(sources, strategy='all')
        ```
    """
    if not len(sources) > 1:
        raise ValueError(
            'At least two item should gived as sources to compare')

    # Reject unknown strategies up front instead of silently returning None.
    if strategy not in ('all', 'some'):
        raise ValueError(
            'strategy `%s` is not valid; expected `some` or `all`.'
            % (strategy,))

    chroms = [_get_chrom_annotation(source) for source in sources]

    # The incompatibility checks below raise AssertionError explicitly
    # (rather than using `assert` statements) so that callers keep seeing
    # the same exception type while the checks survive `python -O`.
    if core_chroms is not None:
        if isinstance(core_chroms, str):
            # A single name; avoid splitting it into characters below.
            core_chroms = [core_chroms]
        required = set(core_chroms)
        for source, annotated in zip(sources, chroms):
            missing = required - annotated
            if missing:
                raise AssertionError(
                    'core chromosomes %s are missing in source `%s`.'
                    % (sorted(missing), repr(source)))

    if strategy == 'all':
        reference = chroms[0]
        if any(reference != other for other in chroms[1:]):
            raise AssertionError('chroms annotations are not all same.')
        return reference

    chrom_intersect = set.intersection(*chroms)
    if not chrom_intersect:
        raise AssertionError(
            'there is not intersection between chromosomes.')
    return chrom_intersect
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"jedi",
"pytest>=3.3.1",
"pytest-xdist", # running tests in parallel
"pytest-mock",
"pytest-pep8", # see https://github.com/kipoi/kipoi/issues/91
"pytest-cov",
"coveralls",
Expand All @@ -36,7 +37,7 @@
# "genomelake",
"keras",
"tensorflow",
"pybedtools"
"pybedtools",
]

setup(
Expand Down
Binary file modified tests/data/test.vcf.gz
Binary file not shown.
Binary file modified tests/data/test.vcf.gz.tbi
Binary file not shown.
36 changes: 35 additions & 1 deletion tests/test_3_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from conftest import vcf_file, sample_5kb_fasta_file, example_intervals_bed
import pytest
import numpy as np
from kipoiseq.utils import parse_alphabet, parse_dtype
import pyranges
from kipoiseq.extractors import FastaStringExtractor, MultiSampleVCF
from kipoiseq.utils import parse_alphabet, parse_dtype, \
compare_chrom_annotation


def test_parse_alphabet():
Expand All @@ -17,3 +21,33 @@ def test_parse_type():
assert parse_dtype('float') == float
assert parse_dtype(float) == float
assert parse_dtype("np.float32") == np.float32


def test_compare_chrom_annotation():
    """Check `compare_chrom_annotation` on a vcf, a fasta and a bed source.

    Covers the too-few-sources error, both strategies on compatible
    sources, and the two incompatibility errors (with their messages).
    """
    sources = [
        MultiSampleVCF(vcf_file),
        FastaStringExtractor(sample_5kb_fasta_file),
        pyranges.read_bed(example_intervals_bed)
    ]

    # Fewer than two sources cannot be compared.
    with pytest.raises(ValueError):
        compare_chrom_annotation([])

    with pytest.raises(ValueError):
        compare_chrom_annotation([object()])

    assert compare_chrom_annotation(sources) == {'chr1'}
    assert compare_chrom_annotation(sources, strategy='all') == {'chr1'}

    # Fixture mutation is kept outside the `pytest.raises` blocks so that
    # only the call under test can satisfy the expected exception, and the
    # message checks sit after the blocks so that they are really executed.
    sources[1].fasta = {'chr1': '', 'chr2': '', 'chr3': ''}
    with pytest.raises(AssertionError) as exception:
        compare_chrom_annotation(sources, strategy='all')
    assert str(exception.value) == 'chroms annotations are not all same.'

    # With extra chromosomes in the fasta, `some` still finds the overlap.
    assert compare_chrom_annotation(sources) == {'chr1'}

    sources[1].fasta = {'chr2': '', 'chr3': ''}
    with pytest.raises(AssertionError) as exception:
        compare_chrom_annotation(sources)
    assert str(exception.value) == \
        'there is not intersection between chromosomes.'