Skip to content

Commit 439ed89

Browse files
committed
Issue a warning if the number of alt alleles exceeds the maximum specified
1 parent a7cb326 commit 439ed89

File tree

5 files changed

+92
-19
lines changed

5 files changed

+92
-19
lines changed

sgkit/io/utils.py

+5
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ def zarrs_to_dataset(
104104
ds[variable_name] = ds[variable_name].astype(f"S{max_length}")
105105
del ds.attrs[attr]
106106

107+
if "max_alt_alleles_seen" in datasets[0].attrs:
108+
ds.attrs["max_alt_alleles_seen"] = max(
109+
ds.attrs["max_alt_alleles_seen"] for ds in datasets
110+
)
111+
107112
return ds
108113

109114

sgkit/io/vcf/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
try:
44
from ..utils import zarrs_to_dataset
55
from .vcf_partition import partition_into_regions
6-
from .vcf_reader import vcf_to_zarr, vcf_to_zarrs
6+
from .vcf_reader import MaxAltAllelesExceededWarning, vcf_to_zarr, vcf_to_zarrs
77

88
__all__ = [
9+
"MaxAltAllelesExceededWarning",
910
"partition_into_regions",
1011
"vcf_to_zarr",
1112
"vcf_to_zarrs",

sgkit/io/vcf/vcf_reader.py

+20
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import functools
22
import itertools
3+
import warnings
34
from contextlib import contextmanager
45
from dataclasses import dataclass
56
from pathlib import Path
@@ -21,6 +22,7 @@
2122
import xarray as xr
2223
from cyvcf2 import VCF, Variant
2324

25+
from sgkit.io.dataset import load_dataset
2426
from sgkit.io.utils import zarrs_to_dataset
2527
from sgkit.io.vcf import partition_into_regions
2628
from sgkit.io.vcf.utils import build_url, chunks, temporary_directory, url_filename
@@ -34,6 +36,12 @@
3436
)
3537

3638

39+
class MaxAltAllelesExceededWarning(UserWarning):
40+
"""Warning when the number of alt alleles exceeds the maximum specified."""
41+
42+
pass
43+
44+
3745
@contextmanager
3846
def open_vcf(path: PathType) -> Iterator[VCF]:
3947
"""A context manager for opening a VCF file."""
@@ -256,6 +264,7 @@ def vcf_to_zarr_sequential(
256264
# Remember max lengths of variable-length strings
257265
max_variant_id_length = 0
258266
max_variant_allele_length = 0
267+
max_alt_alleles_seen = 0
259268

260269
# Iterate through variants in batches of chunk_length
261270

@@ -303,6 +312,7 @@ def vcf_to_zarr_sequential(
303312
variant_position[i] = variant.POS
304313

305314
alleles = [variant.REF] + variant.ALT
315+
max_alt_alleles_seen = max(max_alt_alleles_seen, len(variant.ALT))
306316
if len(alleles) > n_allele:
307317
alleles = alleles[:n_allele]
308318
elif len(alleles) < n_allele:
@@ -359,6 +369,7 @@ def vcf_to_zarr_sequential(
359369
if add_str_max_length_attrs:
360370
ds.attrs["max_length_variant_id"] = max_variant_id_length
361371
ds.attrs["max_length_variant_allele"] = max_variant_allele_length
372+
ds.attrs["max_alt_alleles_seen"] = max_alt_alleles_seen
362373

363374
if first_variants_chunk:
364375
# Enforce uniform chunks in the variants dimension
@@ -705,6 +716,15 @@ def vcf_to_zarr(
705716
field_defs=field_defs,
706717
)
707718

719+
# Issue a warning if max_alt_alleles caused data to be dropped
720+
ds = load_dataset(output)
721+
max_alt_alleles_seen = ds.attrs["max_alt_alleles_seen"]
722+
if max_alt_alleles_seen > max_alt_alleles:
723+
warnings.warn(
724+
f"Some alternate alleles were dropped, since actual max value {max_alt_alleles_seen} exceeded max_alt_alleles setting of {max_alt_alleles}.",
725+
MaxAltAllelesExceededWarning,
726+
)
727+
708728

709729
def count_variants(path: PathType, region: Optional[str] = None) -> int:
710730
"""Count the number of variants in a VCF file."""

sgkit/tests/io/vcf/test_vcf_reader.py

+61-18
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66
from numpy.testing import assert_allclose, assert_array_equal
77

88
from sgkit import load_dataset
9-
from sgkit.io.vcf import partition_into_regions, vcf_to_zarr
9+
from sgkit.io.vcf import (
10+
MaxAltAllelesExceededWarning,
11+
partition_into_regions,
12+
vcf_to_zarr,
13+
)
1014

1115
from .utils import path_for_test
1216

@@ -96,30 +100,35 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
96100
path = path_for_test(shared_datadir, "sample.vcf.gz", is_path)
97101
output = tmp_path.joinpath("vcf.zarr").as_posix()
98102

99-
vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1)
100-
ds = xr.open_zarr(output) # type: ignore[no-untyped-call]
103+
with pytest.warns(MaxAltAllelesExceededWarning):
104+
vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1)
105+
ds = xr.open_zarr(output) # type: ignore[no-untyped-call]
101106

102-
# extra alt alleles are silently dropped
103-
assert_array_equal(
104-
ds["variant_allele"],
105-
[
106-
["A", "C"],
107-
["A", "G"],
108-
["G", "A"],
109-
["T", "A"],
110-
["A", "G"],
111-
["T", ""],
112-
["G", "GA"],
113-
["T", ""],
114-
["AC", "A"],
115-
],
116-
)
107+
# extra alt alleles are dropped
108+
assert_array_equal(
109+
ds["variant_allele"],
110+
[
111+
["A", "C"],
112+
["A", "G"],
113+
["G", "A"],
114+
["T", "A"],
115+
["A", "G"],
116+
["T", ""],
117+
["G", "GA"],
118+
["T", ""],
119+
["AC", "A"],
120+
],
121+
)
122+
123+
# the maximum number of alt alleles actually seen is stored as an attribute
124+
assert ds.attrs["max_alt_alleles_seen"] == 3
117125

118126

119127
@pytest.mark.parametrize(
120128
"is_path",
121129
[True, False],
122130
)
131+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
123132
def test_vcf_to_zarr__large_vcf(shared_datadir, is_path, tmp_path):
124133
path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
125134
output = tmp_path.joinpath("vcf.zarr").as_posix()
@@ -157,6 +166,7 @@ def test_vcf_to_zarr__plain_vcf_with_no_index(shared_datadir, tmp_path):
157166
"is_path",
158167
[True, False],
159168
)
169+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
160170
def test_vcf_to_zarr__mutable_mapping(shared_datadir, is_path):
161171
path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
162172
output: MutableMapping[str, bytes] = {}
@@ -182,6 +192,7 @@ def test_vcf_to_zarr__mutable_mapping(shared_datadir, is_path):
182192
"is_path",
183193
[True, False],
184194
)
195+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
185196
def test_vcf_to_zarr__parallel(shared_datadir, is_path, tmp_path):
186197
path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
187198
output = tmp_path.joinpath("vcf_concat.zarr").as_posix()
@@ -208,6 +219,7 @@ def test_vcf_to_zarr__parallel(shared_datadir, is_path, tmp_path):
208219
"is_path",
209220
[False],
210221
)
222+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
211223
def test_vcf_to_zarr__parallel_temp_chunk_length(shared_datadir, is_path, tmp_path):
212224
path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
213225
output = tmp_path.joinpath("vcf_concat.zarr").as_posix()
@@ -296,6 +308,7 @@ def test_vcf_to_zarr__parallel_partitioned_by_size(shared_datadir, is_path, tmp_
296308
"is_path",
297309
[True, False],
298310
)
311+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
299312
def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path):
300313
paths = [
301314
path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
@@ -323,6 +336,7 @@ def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path):
323336
"is_path",
324337
[True, False],
325338
)
339+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
326340
def test_vcf_to_zarr__multiple_partitioned(shared_datadir, is_path, tmp_path):
327341
paths = [
328342
path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
@@ -352,6 +366,7 @@ def test_vcf_to_zarr__multiple_partitioned(shared_datadir, is_path, tmp_path):
352366
"is_path",
353367
[True, False],
354368
)
369+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
355370
def test_vcf_to_zarr__multiple_partitioned_by_size(shared_datadir, is_path, tmp_path):
356371
paths = [
357372
path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
@@ -398,6 +413,31 @@ def test_vcf_to_zarr__mutiple_partitioned_invalid_regions(
398413
vcf_to_zarr(paths, output, regions=regions, chunk_length=5_000)
399414

400415

416+
@pytest.mark.parametrize(
417+
"is_path",
418+
[True, False],
419+
)
420+
def test_vcf_to_zarr__multiple_max_alt_alleles(shared_datadir, is_path, tmp_path):
421+
paths = [
422+
path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
423+
path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path),
424+
]
425+
output = tmp_path.joinpath("vcf_concat.zarr").as_posix()
426+
427+
with pytest.warns(MaxAltAllelesExceededWarning):
428+
vcf_to_zarr(
429+
paths,
430+
output,
431+
target_part_size="40KB",
432+
chunk_length=5_000,
433+
max_alt_alleles=1,
434+
)
435+
ds = xr.open_zarr(output) # type: ignore[no-untyped-call]
436+
437+
# the maximum number of alt alleles actually seen is stored as an attribute
438+
assert ds.attrs["max_alt_alleles_seen"] == 7
439+
440+
401441
@pytest.mark.parametrize(
402442
"ploidy,mixed_ploidy,truncate_calls,regions",
403443
[
@@ -560,6 +600,7 @@ def test_vcf_to_zarr__fields(shared_datadir, tmp_path):
560600
assert ds["call_DP"].attrs["comment"] == "Read Depth"
561601

562602

603+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
563604
def test_vcf_to_zarr__parallel_with_fields(shared_datadir, tmp_path):
564605
path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz")
565606
output = tmp_path.joinpath("vcf.zarr").as_posix()
@@ -616,6 +657,7 @@ def test_vcf_to_zarr__field_defs(shared_datadir, tmp_path):
616657
assert "comment" not in ds["variant_DP"].attrs
617658

618659

660+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
619661
def test_vcf_to_zarr__field_number_A(shared_datadir, tmp_path):
620662
path = path_for_test(shared_datadir, "sample.vcf.gz")
621663
output = tmp_path.joinpath("vcf.zarr").as_posix()
@@ -649,6 +691,7 @@ def test_vcf_to_zarr__field_number_A(shared_datadir, tmp_path):
649691
)
650692

651693

694+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
652695
def test_vcf_to_zarr__field_number_R(shared_datadir, tmp_path):
653696
path = path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz")
654697
output = tmp_path.joinpath("vcf.zarr").as_posix()

sgkit/tests/io/vcf/test_vcf_roundtrip.py

+4
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def test_default_fields(shared_datadir, tmpdir):
7979
sg_vcfzarr_path = create_sg_vcfzarr(shared_datadir, tmpdir)
8080
sg_ds = sg.load_dataset(str(sg_vcfzarr_path))
8181
sg_ds = sg_ds.drop_vars("call_genotype_phased") # not included in scikit-allel
82+
del sg_ds.attrs["max_alt_alleles_seen"] # not saved by scikit-allel
8283

8384
assert_identical(allel_ds, sg_ds)
8485

@@ -107,6 +108,7 @@ def test_DP_field(shared_datadir, tmpdir):
107108
)
108109
sg_ds = sg.load_dataset(str(sg_vcfzarr_path))
109110
sg_ds = sg_ds.drop_vars("call_genotype_phased") # not included in scikit-allel
111+
del sg_ds.attrs["max_alt_alleles_seen"] # not saved by scikit-allel
110112

111113
assert_identical(allel_ds, sg_ds)
112114

@@ -120,6 +122,7 @@ def test_DP_field(shared_datadir, tmpdir):
120122
("CEUTrio.20.21.gatk3.4.g.vcf.bgz", ["calldata/PL"], ["FORMAT/PL"]),
121123
],
122124
)
125+
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
123126
def test_all_fields(
124127
shared_datadir, tmpdir, vcf_file, allel_exclude_fields, sgkit_exclude_fields
125128
):
@@ -159,6 +162,7 @@ def test_all_fields(
159162
)
160163
sg_ds = sg.load_dataset(str(sg_vcfzarr_path))
161164
sg_ds = sg_ds.drop_vars("call_genotype_phased") # not included in scikit-allel
165+
del sg_ds.attrs["max_alt_alleles_seen"] # not saved by scikit-allel
162166

163167
# scikit-allel only records contigs for which there are actual variants,
164168
# whereas sgkit records contigs from the header

0 commit comments

Comments
 (0)