diff --git a/sgkit/io/vcf/vcf_reader.py b/sgkit/io/vcf/vcf_reader.py index 725032376..59b54ea16 100644 --- a/sgkit/io/vcf/vcf_reader.py +++ b/sgkit/io/vcf/vcf_reader.py @@ -36,7 +36,7 @@ create_genotype_call_dataset, ) from sgkit.typing import ArrayLike, DType, PathType -from sgkit.utils import max_str_len +from sgkit.utils import max_str_len, smallest_numpy_int_dtype DEFAULT_MAX_ALT_ALLELES = ( 3 # equivalent to DEFAULT_ALT_NUMBER in vcf_read.py in scikit_allel @@ -384,7 +384,8 @@ def vcf_to_zarr_sequential( else: variants = vcf(region) - variant_contig = np.empty(chunk_length, dtype="i1") + variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names)) + variant_contig = np.empty(chunk_length, dtype=variant_contig_dtype) variant_position = np.empty(chunk_length, dtype="i4") fields = fields or ["FORMAT/GT"] # default to GT as the only extra field diff --git a/sgkit/io/vcfzarr_reader.py b/sgkit/io/vcfzarr_reader.py index 436fa8991..4ca5cf154 100644 --- a/sgkit/io/vcfzarr_reader.py +++ b/sgkit/io/vcfzarr_reader.py @@ -14,7 +14,7 @@ from ..model import DIM_SAMPLE, DIM_VARIANT, create_genotype_call_dataset from ..typing import ArrayLike, PathType -from ..utils import encode_array, max_str_len +from ..utils import encode_array, max_str_len, smallest_numpy_int_dtype def _ensure_2d(arr: ArrayLike) -> ArrayLike: @@ -170,7 +170,8 @@ def _vcfzarr_to_dataset( # Get the contigs from variants/CHROM variants_chrom = da.from_zarr(vcfzarr["variants/CHROM"]).astype(str) variant_contig, variant_contig_names = encode_array(variants_chrom.compute()) - variant_contig = variant_contig.astype("i1") + variant_contig_dtype = smallest_numpy_int_dtype(len(variant_contig_names)) + variant_contig = variant_contig.astype(variant_contig_dtype) variant_contig_names = list(variant_contig_names) else: # Single contig: contig names were passed in diff --git a/sgkit/tests/io/vcf/data/Homo_sapiens_assembly38.headerOnly.vcf.gz b/sgkit/tests/io/vcf/data/Homo_sapiens_assembly38.headerOnly.vcf.gz new file mode 100644 index 000000000..8c3c28dae Binary files /dev/null and b/sgkit/tests/io/vcf/data/Homo_sapiens_assembly38.headerOnly.vcf.gz differ diff --git a/sgkit/tests/io/vcf/test_vcf_reader.py b/sgkit/tests/io/vcf/test_vcf_reader.py index 2ed0ea55b..497483fee 100644 --- a/sgkit/tests/io/vcf/test_vcf_reader.py +++ b/sgkit/tests/io/vcf/test_vcf_reader.py @@ -651,6 +651,18 @@ def test_vcf_to_zarr__contig_not_defined_in_header(shared_datadir, tmp_path): vcf_to_zarr(path, output) +def test_vcf_to_zarr__large_number_of_contigs(shared_datadir, tmp_path): + path = path_for_test(shared_datadir, "Homo_sapiens_assembly38.headerOnly.vcf.gz") + output = tmp_path.joinpath("vcf.zarr").as_posix() + + vcf_to_zarr(path, output) + + ds = xr.open_zarr(output) + + assert len(ds.attrs["contigs"]) == 3366 + assert ds["variant_contig"].dtype == np.int16 # needs larger dtype than np.int8 + + def test_vcf_to_zarr__fields(shared_datadir, tmp_path): path = path_for_test(shared_datadir, "sample.vcf.gz") output = tmp_path.joinpath("vcf.zarr").as_posix() diff --git a/sgkit/tests/test_utils.py b/sgkit/tests/test_utils.py index ee90832a6..fba366283 100644 --- a/sgkit/tests/test_utils.py +++ b/sgkit/tests/test_utils.py @@ -16,6 +16,7 @@ hash_array, max_str_len, merge_datasets, + smallest_numpy_int_dtype, split_array_chunks, ) @@ -231,3 +232,31 @@ def test_hash_array(n_rows, n_cols): # counts[inverse] gives the count for each column in x # these should be the same for both ways of counting np.testing.assert_equal(counts[inverse], expected_counts[expected_inverse]) + + +@pytest.mark.parametrize( + "value,expected_dtype", + [ + (0, np.int8), + (1, np.int8), + (-1, np.int8), + (np.iinfo(np.int8).min, np.int8), + (np.iinfo(np.int8).max, np.int8), + (np.iinfo(np.int8).min - 1, np.int16), + (np.iinfo(np.int8).max + 1, np.int16), + (np.iinfo(np.int16).min, np.int16), + (np.iinfo(np.int16).max, np.int16), + (np.iinfo(np.int16).min - 1, np.int32), + (np.iinfo(np.int16).max + 1, np.int32), + (np.iinfo(np.int32).min, np.int32), + (np.iinfo(np.int32).max, np.int32), + (np.iinfo(np.int32).min - 1, np.int64), + (np.iinfo(np.int32).max + 1, np.int64), + (np.iinfo(np.int64).min, np.int64), + (np.iinfo(np.int64).max, np.int64), + (np.iinfo(np.int64).min - 1, None), + (np.iinfo(np.int64).max + 1, None), + ], +) +def test_smallest_numpy_int_dtype(value, expected_dtype): + assert smallest_numpy_int_dtype(value) == expected_dtype diff --git a/sgkit/utils.py b/sgkit/utils.py index 1a40affab..5e2c7f488 100644 --- a/sgkit/utils.py +++ b/sgkit/utils.py @@ -336,3 +336,22 @@ def hash_array(x: ArrayLike, out: ArrayLike) -> None: # pragma: no cover out[0] = 5381 for i in range(x.shape[0]): out[0] = out[0] * 33 + x[i] + + +def smallest_numpy_int_dtype(value: int) -> Optional[DType]: + """Return the smallest NumPy signed integer dtype that can be used to store the given value. + + Parameters + ---------- + value + An integer value to be stored. + + Returns + ------- + A NumPy signed integer dtype suitable for storing the given value, or None if the value exceeds + the bounds of the largest dtype (``np.int64``). + """ + for dtype in (np.int8, np.int16, np.int32, np.int64): + if np.iinfo(dtype).min <= value <= np.iinfo(dtype).max: + return dtype + return None