From 65416ca631a5be93ddd288aaff693e47ceb6bcbf Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 17 Jan 2025 16:30:36 +0000 Subject: [PATCH] Handle some awkward corner cases --- CHANGELOG.md | 2 ++ bio2zarr/vcf2zarr/vcz.py | 7 +++++-- tests/test_icf.py | 13 +++++++++++++ tests/test_vcf_examples.py | 13 +++++++++++++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bb74fac..f97f6e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ # 0.1.2 2024-XX-XX +- Reduce memory requirement for encoding genotypes with large sample sizes + - Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300) - Add chunksize options to mkschema (issue:294) diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 087b40e..742e9f6 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -382,7 +382,7 @@ def fixed_field_spec( continue array_specs.append(spec_from_field(field)) - if gt_field is not None: + if gt_field is not None and n > 0: ploidy = max(gt_field.summary.max_number - 1, 1) shape = [m, n] chunks = [variants_chunk_size, samples_chunk_size] @@ -832,6 +832,7 @@ def encode_partition(self, partition_index): self.encode_array_partition(array_spec, partition_index) if self.has_genotypes(): self.encode_genotypes_partition(partition_index) + self.encode_genotype_mask_partition(partition_index) if self.has_local_alleles(): self.encode_local_alleles_partition(partition_index) self.encode_local_allele_fields_partition(partition_index) @@ -899,8 +900,10 @@ def encode_genotypes_partition(self, partition_index): self.finalise_partition_array(partition_index, gt) self.finalise_partition_array(partition_index, gt_phased) - # Read back in the genotypes so we can compute the mask + def encode_genotype_mask_partition(self, partition_index): + partition = self.metadata.partitions[partition_index] gt_mask = self.init_partition_array(partition_index, "call_genotype_mask") + # Read back in the genotypes so we can compute the mask gt_array = zarr.open_array( store=self.wip_partition_array_path(partition_index, "call_genotype"), mode="r", diff --git a/tests/test_icf.py b/tests/test_icf.py index b59ae1d..088be7a 100644 --- a/tests/test_icf.py +++ b/tests/test_icf.py @@ -91,6 +91,19 @@ def test_INFO_NS(self, icf): assert icf["INFO/NS"].values == [None, None, 3, 3, 2, 3, 3, None, None] +class TestWithGtHeaderNoGenotypes: + data_path = "tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz" + + @pytest.fixture(scope="class") + def icf(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.exploded" + return vcf2zarr.explode(out, [self.data_path]) + + def test_gts(self, icf): + values = icf["FORMAT/GT"].values + assert values == [None] * icf.num_records + + class TestIcfWriterExample: data_path = "tests/data/vcf/sample.vcf.gz" diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index c1cb2ae..3692fe4 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -508,6 +508,19 @@ def test_ok_without_local_alleles(self, ds): nt.assert_array_equal(ds.call_genotype.values, [[[0, 0, 0]]]) +class TestWithGtHeaderNoGenotypes: + data_path = "tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz" + + @pytest.fixture(scope="class") + def ds(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" + vcf2zarr.convert([self.data_path], out, worker_processes=0) + return sg.load_dataset(out) + + def test_gts(self, ds): + assert "call_genotype" not in ds + + class Test1000G2020Example: data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"