Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use of int8 for variant_contig results in integer overflow with fragmented reference genomes #584 #667

Merged
merged 3 commits into from
Sep 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions sgkit/io/vcf/vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
create_genotype_call_dataset,
)
from sgkit.typing import ArrayLike, DType, PathType
from sgkit.utils import max_str_len
from sgkit.utils import max_str_len, smallest_numpy_int_dtype

DEFAULT_MAX_ALT_ALLELES = (
3 # equivalent to DEFAULT_ALT_NUMBER in vcf_read.py in scikit_allel
Expand Down Expand Up @@ -384,7 +384,8 @@ def vcf_to_zarr_sequential(
else:
variants = vcf(region)

variant_contig = np.empty(chunk_length, dtype="i1")
variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names))
variant_contig = np.empty(chunk_length, dtype=variant_contig_dtype)
variant_position = np.empty(chunk_length, dtype="i4")

fields = fields or ["FORMAT/GT"] # default to GT as the only extra field
Expand Down
5 changes: 3 additions & 2 deletions sgkit/io/vcfzarr_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from ..model import DIM_SAMPLE, DIM_VARIANT, create_genotype_call_dataset
from ..typing import ArrayLike, PathType
from ..utils import encode_array, max_str_len
from ..utils import encode_array, max_str_len, smallest_numpy_int_dtype


def _ensure_2d(arr: ArrayLike) -> ArrayLike:
Expand Down Expand Up @@ -170,7 +170,8 @@ def _vcfzarr_to_dataset(
# Get the contigs from variants/CHROM
variants_chrom = da.from_zarr(vcfzarr["variants/CHROM"]).astype(str)
variant_contig, variant_contig_names = encode_array(variants_chrom.compute())
variant_contig = variant_contig.astype("i1")
variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names))
variant_contig = variant_contig.astype(variant_contig_dtype)
variant_contig_names = list(variant_contig_names)
else:
# Single contig: contig names were passed in
Expand Down
Binary file not shown.
12 changes: 12 additions & 0 deletions sgkit/tests/io/vcf/test_vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,18 @@ def test_vcf_to_zarr__contig_not_defined_in_header(shared_datadir, tmp_path):
vcf_to_zarr(path, output)


def test_vcf_to_zarr__large_number_of_contigs(shared_datadir, tmp_path):
path = path_for_test(shared_datadir, "Homo_sapiens_assembly38.headerOnly.vcf.gz")
output = tmp_path.joinpath("vcf.zarr").as_posix()

vcf_to_zarr(path, output)

ds = xr.open_zarr(output)

assert len(ds.attrs["contigs"]) == 3366
assert ds["variant_contig"].dtype == np.int16 # needs larger dtype than np.int8


def test_vcf_to_zarr__fields(shared_datadir, tmp_path):
path = path_for_test(shared_datadir, "sample.vcf.gz")
output = tmp_path.joinpath("vcf.zarr").as_posix()
Expand Down
35 changes: 35 additions & 0 deletions sgkit/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
hash_array,
max_str_len,
merge_datasets,
smallest_numpy_int_dtype,
split_array_chunks,
)

Expand Down Expand Up @@ -231,3 +232,37 @@ def test_hash_array(n_rows, n_cols):
# counts[inverse] gives the count for each column in x
# these should be the same for both ways of counting
np.testing.assert_equal(counts[inverse], expected_counts[expected_inverse])


@pytest.mark.parametrize(
"value,expected_dtype",
[
(0, np.int8),
(1, np.int8),
(-1, np.int8),
(np.iinfo(np.int8).min, np.int8),
(np.iinfo(np.int8).max, np.int8),
(np.iinfo(np.int8).min - 1, np.int16),
(np.iinfo(np.int8).max + 1, np.int16),
(np.iinfo(np.int16).min, np.int16),
(np.iinfo(np.int16).max, np.int16),
(np.iinfo(np.int16).min - 1, np.int32),
(np.iinfo(np.int16).max + 1, np.int32),
(np.iinfo(np.int32).min, np.int32),
(np.iinfo(np.int32).max, np.int32),
(np.iinfo(np.int32).min - 1, np.int64),
(np.iinfo(np.int32).max + 1, np.int64),
(np.iinfo(np.int64).min, np.int64),
(np.iinfo(np.int64).max, np.int64),
],
)
def test_smallest_numpy_int_dtype(value, expected_dtype):
assert smallest_numpy_int_dtype(value) == expected_dtype


def test_smallest_numpy_int_dtype__overflow():
with pytest.raises(OverflowError):
smallest_numpy_int_dtype(np.iinfo(np.int64).min - 1)

with pytest.raises(OverflowError):
smallest_numpy_int_dtype(np.iinfo(np.int64).max + 1)
23 changes: 23 additions & 0 deletions sgkit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,3 +336,26 @@ def hash_array(x: ArrayLike, out: ArrayLike) -> None: # pragma: no cover
out[0] = 5381
for i in range(x.shape[0]):
out[0] = out[0] * 33 + x[i]


def smallest_numpy_int_dtype(value: int) -> Optional[DType]:
"""Return the smallest NumPy signed integer dtype that can be used to store the given value.

Parameters
----------
value
An integer value to be stored.

Returns
-------
A NumPy signed integer dtype suitable for storing the given value.

Raises
------
OverflowError
If the value cannot be stored in a signed 64-bit integer dtype (``np.int64``).
"""
for dtype in (np.int8, np.int16, np.int32, np.int64):
if np.iinfo(dtype).min <= value <= np.iinfo(dtype).max:
return dtype
raise OverflowError(f"Value {value} cannot be stored in np.int64")