6
6
from numpy .testing import assert_allclose , assert_array_equal
7
7
8
8
from sgkit import load_dataset
9
- from sgkit .io .vcf import partition_into_regions , vcf_to_zarr
9
+ from sgkit .io .vcf import (
10
+ MaxAltAllelesExceededWarning ,
11
+ partition_into_regions ,
12
+ vcf_to_zarr ,
13
+ )
10
14
11
15
from .utils import path_for_test
12
16
@@ -96,30 +100,35 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
96
100
path = path_for_test (shared_datadir , "sample.vcf.gz" , is_path )
97
101
output = tmp_path .joinpath ("vcf.zarr" ).as_posix ()
98
102
99
- vcf_to_zarr (path , output , chunk_length = 5 , chunk_width = 2 , max_alt_alleles = 1 )
100
- ds = xr .open_zarr (output ) # type: ignore[no-untyped-call]
103
+ with pytest .warns (MaxAltAllelesExceededWarning ):
104
+ vcf_to_zarr (path , output , chunk_length = 5 , chunk_width = 2 , max_alt_alleles = 1 )
105
+ ds = xr .open_zarr (output ) # type: ignore[no-untyped-call]
101
106
102
- # extra alt alleles are silently dropped
103
- assert_array_equal (
104
- ds ["variant_allele" ],
105
- [
106
- ["A" , "C" ],
107
- ["A" , "G" ],
108
- ["G" , "A" ],
109
- ["T" , "A" ],
110
- ["A" , "G" ],
111
- ["T" , "" ],
112
- ["G" , "GA" ],
113
- ["T" , "" ],
114
- ["AC" , "A" ],
115
- ],
116
- )
107
+ # extra alt alleles are dropped
108
+ assert_array_equal (
109
+ ds ["variant_allele" ],
110
+ [
111
+ ["A" , "C" ],
112
+ ["A" , "G" ],
113
+ ["G" , "A" ],
114
+ ["T" , "A" ],
115
+ ["A" , "G" ],
116
+ ["T" , "" ],
117
+ ["G" , "GA" ],
118
+ ["T" , "" ],
119
+ ["AC" , "A" ],
120
+ ],
121
+ )
122
+
123
+ # the maximum number of alt alleles actually seen is stored as an attribute
124
+ assert ds .attrs ["max_alt_alleles_seen" ] == 3
117
125
118
126
119
127
@pytest .mark .parametrize (
120
128
"is_path" ,
121
129
[True , False ],
122
130
)
131
+ @pytest .mark .filterwarnings ("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning" )
123
132
def test_vcf_to_zarr__large_vcf (shared_datadir , is_path , tmp_path ):
124
133
path = path_for_test (shared_datadir , "CEUTrio.20.21.gatk3.4.g.vcf.bgz" , is_path )
125
134
output = tmp_path .joinpath ("vcf.zarr" ).as_posix ()
@@ -157,6 +166,7 @@ def test_vcf_to_zarr__plain_vcf_with_no_index(shared_datadir, tmp_path):
157
166
"is_path" ,
158
167
[True , False ],
159
168
)
169
+ @pytest .mark .filterwarnings ("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning" )
160
170
def test_vcf_to_zarr__mutable_mapping (shared_datadir , is_path ):
161
171
path = path_for_test (shared_datadir , "CEUTrio.20.21.gatk3.4.g.vcf.bgz" , is_path )
162
172
output : MutableMapping [str , bytes ] = {}
@@ -182,6 +192,7 @@ def test_vcf_to_zarr__mutable_mapping(shared_datadir, is_path):
182
192
"is_path" ,
183
193
[True , False ],
184
194
)
195
+ @pytest .mark .filterwarnings ("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning" )
185
196
def test_vcf_to_zarr__parallel (shared_datadir , is_path , tmp_path ):
186
197
path = path_for_test (shared_datadir , "CEUTrio.20.21.gatk3.4.g.vcf.bgz" , is_path )
187
198
output = tmp_path .joinpath ("vcf_concat.zarr" ).as_posix ()
@@ -208,6 +219,7 @@ def test_vcf_to_zarr__parallel(shared_datadir, is_path, tmp_path):
208
219
"is_path" ,
209
220
[False ],
210
221
)
222
+ @pytest .mark .filterwarnings ("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning" )
211
223
def test_vcf_to_zarr__parallel_temp_chunk_length (shared_datadir , is_path , tmp_path ):
212
224
path = path_for_test (shared_datadir , "CEUTrio.20.21.gatk3.4.g.vcf.bgz" , is_path )
213
225
output = tmp_path .joinpath ("vcf_concat.zarr" ).as_posix ()
@@ -296,6 +308,7 @@ def test_vcf_to_zarr__parallel_partitioned_by_size(shared_datadir, is_path, tmp_
296
308
"is_path" ,
297
309
[True , False ],
298
310
)
311
+ @pytest .mark .filterwarnings ("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning" )
299
312
def test_vcf_to_zarr__multiple (shared_datadir , is_path , tmp_path ):
300
313
paths = [
301
314
path_for_test (shared_datadir , "CEUTrio.20.gatk3.4.g.vcf.bgz" , is_path ),
@@ -323,6 +336,7 @@ def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path):
323
336
"is_path" ,
324
337
[True , False ],
325
338
)
339
+ @pytest .mark .filterwarnings ("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning" )
326
340
def test_vcf_to_zarr__multiple_partitioned (shared_datadir , is_path , tmp_path ):
327
341
paths = [
328
342
path_for_test (shared_datadir , "CEUTrio.20.gatk3.4.g.vcf.bgz" , is_path ),
@@ -352,6 +366,7 @@ def test_vcf_to_zarr__multiple_partitioned(shared_datadir, is_path, tmp_path):
352
366
"is_path" ,
353
367
[True , False ],
354
368
)
369
+ @pytest .mark .filterwarnings ("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning" )
355
370
def test_vcf_to_zarr__multiple_partitioned_by_size (shared_datadir , is_path , tmp_path ):
356
371
paths = [
357
372
path_for_test (shared_datadir , "CEUTrio.20.gatk3.4.g.vcf.bgz" , is_path ),
@@ -398,6 +413,31 @@ def test_vcf_to_zarr__mutiple_partitioned_invalid_regions(
398
413
vcf_to_zarr (paths , output , regions = regions , chunk_length = 5_000 )
399
414
400
415
416
+ @pytest .mark .parametrize (
417
+ "is_path" ,
418
+ [True , False ],
419
+ )
420
+ def test_vcf_to_zarr__multiple_max_alt_alleles (shared_datadir , is_path , tmp_path ):
421
+ paths = [
422
+ path_for_test (shared_datadir , "CEUTrio.20.gatk3.4.g.vcf.bgz" , is_path ),
423
+ path_for_test (shared_datadir , "CEUTrio.21.gatk3.4.g.vcf.bgz" , is_path ),
424
+ ]
425
+ output = tmp_path .joinpath ("vcf_concat.zarr" ).as_posix ()
426
+
427
+ with pytest .warns (MaxAltAllelesExceededWarning ):
428
+ vcf_to_zarr (
429
+ paths ,
430
+ output ,
431
+ target_part_size = "40KB" ,
432
+ chunk_length = 5_000 ,
433
+ max_alt_alleles = 1 ,
434
+ )
435
+ ds = xr .open_zarr (output ) # type: ignore[no-untyped-call]
436
+
437
+ # the maximum number of alt alleles actually seen is stored as an attribute
438
+ assert ds .attrs ["max_alt_alleles_seen" ] == 7
439
+
440
+
401
441
@pytest .mark .parametrize (
402
442
"ploidy,mixed_ploidy,truncate_calls,regions" ,
403
443
[
@@ -560,6 +600,7 @@ def test_vcf_to_zarr__fields(shared_datadir, tmp_path):
560
600
assert ds ["call_DP" ].attrs ["comment" ] == "Read Depth"
561
601
562
602
603
+ @pytest .mark .filterwarnings ("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning" )
563
604
def test_vcf_to_zarr__parallel_with_fields (shared_datadir , tmp_path ):
564
605
path = path_for_test (shared_datadir , "CEUTrio.20.21.gatk3.4.g.vcf.bgz" )
565
606
output = tmp_path .joinpath ("vcf.zarr" ).as_posix ()
@@ -616,6 +657,7 @@ def test_vcf_to_zarr__field_defs(shared_datadir, tmp_path):
616
657
assert "comment" not in ds ["variant_DP" ].attrs
617
658
618
659
660
+ @pytest .mark .filterwarnings ("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning" )
619
661
def test_vcf_to_zarr__field_number_A (shared_datadir , tmp_path ):
620
662
path = path_for_test (shared_datadir , "sample.vcf.gz" )
621
663
output = tmp_path .joinpath ("vcf.zarr" ).as_posix ()
@@ -649,6 +691,7 @@ def test_vcf_to_zarr__field_number_A(shared_datadir, tmp_path):
649
691
)
650
692
651
693
694
+ @pytest .mark .filterwarnings ("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning" )
652
695
def test_vcf_to_zarr__field_number_R (shared_datadir , tmp_path ):
653
696
path = path_for_test (shared_datadir , "CEUTrio.21.gatk3.4.g.vcf.bgz" )
654
697
output = tmp_path .joinpath ("vcf.zarr" ).as_posix ()
0 commit comments