Skip to content

Commit

Permalink
Handle some awkward corner cases
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Jan 17, 2025
1 parent f084469 commit 65416ca
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# 0.1.2 2024-XX-XX

- Reduce memory requirement for encoding genotypes with large sample sizes

- Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300)

- Add chunksize options to mkschema (issue:294)
Expand Down
7 changes: 5 additions & 2 deletions bio2zarr/vcf2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ def fixed_field_spec(
continue
array_specs.append(spec_from_field(field))

if gt_field is not None:
if gt_field is not None and n > 0:
ploidy = max(gt_field.summary.max_number - 1, 1)
shape = [m, n]
chunks = [variants_chunk_size, samples_chunk_size]
Expand Down Expand Up @@ -832,6 +832,7 @@ def encode_partition(self, partition_index):
self.encode_array_partition(array_spec, partition_index)
if self.has_genotypes():
self.encode_genotypes_partition(partition_index)
self.encode_genotype_mask_partition(partition_index)
if self.has_local_alleles():
self.encode_local_alleles_partition(partition_index)
self.encode_local_allele_fields_partition(partition_index)
Expand Down Expand Up @@ -899,8 +900,10 @@ def encode_genotypes_partition(self, partition_index):
self.finalise_partition_array(partition_index, gt)
self.finalise_partition_array(partition_index, gt_phased)

# Read back in the genotypes so we can compute the mask
def encode_genotype_mask_partition(self, partition_index):
partition = self.metadata.partitions[partition_index]
gt_mask = self.init_partition_array(partition_index, "call_genotype_mask")
# Read back in the genotypes so we can compute the mask
gt_array = zarr.open_array(
store=self.wip_partition_array_path(partition_index, "call_genotype"),
mode="r",
Expand Down
13 changes: 13 additions & 0 deletions tests/test_icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,19 @@ def test_INFO_NS(self, icf):
assert icf["INFO/NS"].values == [None, None, 3, 3, 2, 3, 3, None, None]


class TestWithGtHeaderNoGenotypes:
data_path = "tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz"

@pytest.fixture(scope="class")
def icf(self, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.exploded"
return vcf2zarr.explode(out, [self.data_path])

def test_gts(self, icf):
values = icf["FORMAT/GT"].values
assert values == [None] * icf.num_records


class TestIcfWriterExample:
data_path = "tests/data/vcf/sample.vcf.gz"

Expand Down
13 changes: 13 additions & 0 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,19 @@ def test_ok_without_local_alleles(self, ds):
nt.assert_array_equal(ds.call_genotype.values, [[[0, 0, 0]]])


class TestWithGtHeaderNoGenotypes:
data_path = "tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz"

@pytest.fixture(scope="class")
def ds(self, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.vcf.zarr"
vcf2zarr.convert([self.data_path], out, worker_processes=0)
return sg.load_dataset(out)

def test_gts(self, ds):
assert "call_genotype" not in ds


class Test1000G2020Example:
data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"

Expand Down

0 comments on commit 65416ca

Please sign in to comment.