Skip to content

Commit

Permalink
Use of int8 for variant_contig results in integer overflow with fragm…
Browse files Browse the repository at this point in the history
…ented reference genomes sgkit-dev#584
  • Loading branch information
tomwhite authored and pentschev committed Sep 15, 2021
1 parent 61bca2b commit 8eb4b5a
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 4 deletions.
9 changes: 7 additions & 2 deletions sgkit/io/vcf/vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
create_genotype_call_dataset,
)
from sgkit.typing import ArrayLike, DType, PathType
from sgkit.utils import max_str_len
from sgkit.utils import max_str_len, smallest_numpy_int_dtype

DEFAULT_MAX_ALT_ALLELES = (
3 # equivalent to DEFAULT_ALT_NUMBER in vcf_read.py in scikit_allel
Expand Down Expand Up @@ -384,7 +384,12 @@ def vcf_to_zarr_sequential(
else:
variants = vcf(region)

variant_contig = np.empty(chunk_length, dtype="i1")
variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names))
if variant_contig_dtype is None:
raise ValueError(
f"Number of contigs ({len(variant_contig_names)}) exceeds maxmimum NumPy signed int dtype"
) # pragma: no cover
variant_contig = np.empty(chunk_length, dtype=variant_contig_dtype)
variant_position = np.empty(chunk_length, dtype="i4")

fields = fields or ["FORMAT/GT"] # default to GT as the only extra field
Expand Down
9 changes: 7 additions & 2 deletions sgkit/io/vcfzarr_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from ..model import DIM_SAMPLE, DIM_VARIANT, create_genotype_call_dataset
from ..typing import ArrayLike, PathType
from ..utils import encode_array, max_str_len
from ..utils import encode_array, max_str_len, smallest_numpy_int_dtype


def _ensure_2d(arr: ArrayLike) -> ArrayLike:
Expand Down Expand Up @@ -170,7 +170,12 @@ def _vcfzarr_to_dataset(
# Get the contigs from variants/CHROM
variants_chrom = da.from_zarr(vcfzarr["variants/CHROM"]).astype(str)
variant_contig, variant_contig_names = encode_array(variants_chrom.compute())
variant_contig = variant_contig.astype("i1")
variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names))
if variant_contig_dtype is None:
raise ValueError(
f"Number of contigs ({len(variant_contig_names)}) exceeds maxmimum NumPy signed int dtype"
) # pragma: no cover
variant_contig = variant_contig.astype(variant_contig_dtype)
variant_contig_names = list(variant_contig_names)
else:
# Single contig: contig names were passed in
Expand Down
Binary file not shown.
12 changes: 12 additions & 0 deletions sgkit/tests/io/vcf/test_vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,18 @@ def test_vcf_to_zarr__contig_not_defined_in_header(shared_datadir, tmp_path):
vcf_to_zarr(path, output)


def test_vcf_to_zarr__large_number_of_contigs(shared_datadir, tmp_path):
path = path_for_test(shared_datadir, "Homo_sapiens_assembly38.headerOnly.vcf.gz")
output = tmp_path.joinpath("vcf.zarr").as_posix()

vcf_to_zarr(path, output)

ds = xr.open_zarr(output)

assert len(ds.attrs["contigs"]) == 3366
assert ds["variant_contig"].dtype == np.int16 # needs larger dtype than np.int8


def test_vcf_to_zarr__fields(shared_datadir, tmp_path):
path = path_for_test(shared_datadir, "sample.vcf.gz")
output = tmp_path.joinpath("vcf.zarr").as_posix()
Expand Down
29 changes: 29 additions & 0 deletions sgkit/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
hash_array,
max_str_len,
merge_datasets,
smallest_numpy_int_dtype,
split_array_chunks,
)

Expand Down Expand Up @@ -231,3 +232,31 @@ def test_hash_array(n_rows, n_cols):
# counts[inverse] gives the count for each column in x
# these should be the same for both ways of counting
np.testing.assert_equal(counts[inverse], expected_counts[expected_inverse])


@pytest.mark.parametrize(
"value,expected_dtype",
[
(0, np.int8),
(1, np.int8),
(-1, np.int8),
(np.iinfo(np.int8).min, np.int8),
(np.iinfo(np.int8).max, np.int8),
(np.iinfo(np.int8).min - 1, np.int16),
(np.iinfo(np.int8).max + 1, np.int16),
(np.iinfo(np.int16).min, np.int16),
(np.iinfo(np.int16).max, np.int16),
(np.iinfo(np.int16).min - 1, np.int32),
(np.iinfo(np.int16).max + 1, np.int32),
(np.iinfo(np.int32).min, np.int32),
(np.iinfo(np.int32).max, np.int32),
(np.iinfo(np.int32).min - 1, np.int64),
(np.iinfo(np.int32).max + 1, np.int64),
(np.iinfo(np.int64).min, np.int64),
(np.iinfo(np.int64).max, np.int64),
(np.iinfo(np.int64).min - 1, None),
(np.iinfo(np.int64).max + 1, None),
],
)
def test_smallest_numpy_int_dtype(value, expected_dtype):
assert smallest_numpy_int_dtype(value) == expected_dtype
19 changes: 19 additions & 0 deletions sgkit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,3 +336,22 @@ def hash_array(x: ArrayLike, out: ArrayLike) -> None: # pragma: no cover
out[0] = 5381
for i in range(x.shape[0]):
out[0] = out[0] * 33 + x[i]


def smallest_numpy_int_dtype(value: int) -> Optional[DType]:
"""Return the smallest NumPy signed integer dtype that can be used to store the given value.
Parameters
----------
value
An integer value to be stored.
Returns
-------
A NumPy signed integer dtype suitable for storing the given value, or None if the value exceeds
the bounds of the largest dtype (``np.int64``).
"""
for dtype in (np.int8, np.int16, np.int32, np.int64):
if np.iinfo(dtype).min <= value <= np.iinfo(dtype).max:
return dtype
return None

0 comments on commit 8eb4b5a

Please sign in to comment.