Use of int8 for variant_contig results in integer overflow with fragm…

…ented reference genomes sgkit-dev#584
pentschev · Sep 15, 2021 · 8eb4b5a · 8eb4b5a
1 parent 61bca2b
commit 8eb4b5a
Show file tree

Hide file tree

Showing 6 changed files with 74 additions and 4 deletions.
diff --git a/sgkit/io/vcf/vcf_reader.py b/sgkit/io/vcf/vcf_reader.py
@@ -36,7 +36,7 @@
     create_genotype_call_dataset,
 )
 from sgkit.typing import ArrayLike, DType, PathType
-from sgkit.utils import max_str_len
+from sgkit.utils import max_str_len, smallest_numpy_int_dtype
 
 DEFAULT_MAX_ALT_ALLELES = (
     3  # equivalent to DEFAULT_ALT_NUMBER in vcf_read.py in scikit_allel
@@ -384,7 +384,12 @@ def vcf_to_zarr_sequential(
         else:
             variants = vcf(region)
 
-        variant_contig = np.empty(chunk_length, dtype="i1")
+        variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names))
+        if variant_contig_dtype is None:
+            raise ValueError(
+                f"Number of contigs ({len(variant_contig_names)}) exceeds maxmimum NumPy signed int dtype"
+            )  # pragma: no cover
+        variant_contig = np.empty(chunk_length, dtype=variant_contig_dtype)
         variant_position = np.empty(chunk_length, dtype="i4")
 
         fields = fields or ["FORMAT/GT"]  # default to GT as the only extra field

diff --git a/sgkit/io/vcfzarr_reader.py b/sgkit/io/vcfzarr_reader.py
@@ -14,7 +14,7 @@
 
 from ..model import DIM_SAMPLE, DIM_VARIANT, create_genotype_call_dataset
 from ..typing import ArrayLike, PathType
-from ..utils import encode_array, max_str_len
+from ..utils import encode_array, max_str_len, smallest_numpy_int_dtype
 
 
 def _ensure_2d(arr: ArrayLike) -> ArrayLike:
@@ -170,7 +170,12 @@ def _vcfzarr_to_dataset(
         # Get the contigs from variants/CHROM
         variants_chrom = da.from_zarr(vcfzarr["variants/CHROM"]).astype(str)
         variant_contig, variant_contig_names = encode_array(variants_chrom.compute())
-        variant_contig = variant_contig.astype("i1")
+        variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names))
+        if variant_contig_dtype is None:
+            raise ValueError(
+                f"Number of contigs ({len(variant_contig_names)}) exceeds maxmimum NumPy signed int dtype"
+            )  # pragma: no cover
+        variant_contig = variant_contig.astype(variant_contig_dtype)
         variant_contig_names = list(variant_contig_names)
     else:
         # Single contig: contig names were passed in

diff --git a/sgkit/tests/io/vcf/data/Homo_sapiens_assembly38.headerOnly.vcf.gz b/sgkit/tests/io/vcf/data/Homo_sapiens_assembly38.headerOnly.vcf.gz
diff --git a/sgkit/tests/io/vcf/test_vcf_reader.py b/sgkit/tests/io/vcf/test_vcf_reader.py
@@ -651,6 +651,18 @@ def test_vcf_to_zarr__contig_not_defined_in_header(shared_datadir, tmp_path):
         vcf_to_zarr(path, output)
 
 
+def test_vcf_to_zarr__large_number_of_contigs(shared_datadir, tmp_path):
+    path = path_for_test(shared_datadir, "Homo_sapiens_assembly38.headerOnly.vcf.gz")
+    output = tmp_path.joinpath("vcf.zarr").as_posix()
+
+    vcf_to_zarr(path, output)
+
+    ds = xr.open_zarr(output)
+
+    assert len(ds.attrs["contigs"]) == 3366
+    assert ds["variant_contig"].dtype == np.int16  # needs larger dtype than np.int8
+
+
 def test_vcf_to_zarr__fields(shared_datadir, tmp_path):
     path = path_for_test(shared_datadir, "sample.vcf.gz")
     output = tmp_path.joinpath("vcf.zarr").as_posix()

diff --git a/sgkit/tests/test_utils.py b/sgkit/tests/test_utils.py
@@ -16,6 +16,7 @@
     hash_array,
     max_str_len,
     merge_datasets,
+    smallest_numpy_int_dtype,
     split_array_chunks,
 )
 
@@ -231,3 +232,31 @@ def test_hash_array(n_rows, n_cols):
     # counts[inverse] gives the count for each column in x
     # these should be the same for both ways of counting
     np.testing.assert_equal(counts[inverse], expected_counts[expected_inverse])
+
+
+@pytest.mark.parametrize(
+    "value,expected_dtype",
+    [
+        (0, np.int8),
+        (1, np.int8),
+        (-1, np.int8),
+        (np.iinfo(np.int8).min, np.int8),
+        (np.iinfo(np.int8).max, np.int8),
+        (np.iinfo(np.int8).min - 1, np.int16),
+        (np.iinfo(np.int8).max + 1, np.int16),
+        (np.iinfo(np.int16).min, np.int16),
+        (np.iinfo(np.int16).max, np.int16),
+        (np.iinfo(np.int16).min - 1, np.int32),
+        (np.iinfo(np.int16).max + 1, np.int32),
+        (np.iinfo(np.int32).min, np.int32),
+        (np.iinfo(np.int32).max, np.int32),
+        (np.iinfo(np.int32).min - 1, np.int64),
+        (np.iinfo(np.int32).max + 1, np.int64),
+        (np.iinfo(np.int64).min, np.int64),
+        (np.iinfo(np.int64).max, np.int64),
+        (np.iinfo(np.int64).min - 1, None),
+        (np.iinfo(np.int64).max + 1, None),
+    ],
+)
+def test_smallest_numpy_int_dtype(value, expected_dtype):
+    assert smallest_numpy_int_dtype(value) == expected_dtype
diff --git a/sgkit/utils.py b/sgkit/utils.py
@@ -336,3 +336,22 @@ def hash_array(x: ArrayLike, out: ArrayLike) -> None:  # pragma: no cover
     out[0] = 5381
     for i in range(x.shape[0]):
         out[0] = out[0] * 33 + x[i]
+
+
+def smallest_numpy_int_dtype(value: int) -> Optional[DType]:
+    """Return the smallest NumPy signed integer dtype that can be used to store the given value.
+
+    Parameters
+    ----------
+    value
+        An integer value to be stored.
+
+    Returns
+    -------
+    A NumPy signed integer dtype suitable for storing the given value, or None if the value exceeds
+    the bounds of the largest dtype (``np.int64``).
+    """
+    for dtype in (np.int8, np.int16, np.int32, np.int64):
+        if np.iinfo(dtype).min <= value <= np.iinfo(dtype).max:
+            return dtype
+    return None