Skip to content

Commit

Permalink
Reduce encoding memory by computing mask separately
Browse files: browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Jan 17, 2025
1 parent 9f817a2 commit f084469
Showing 1 changed file with 15 additions and 9 deletions.
24 changes: 15 additions & 9 deletions bio2zarr/vcf2zarr/vcz.py
Original file line number | Diff line number | Diff line change
@@ -881,14 +881,10 @@ def encode_array_partition(self, array_spec, partition_index):
self.finalise_partition_array(partition_index, ba)

def encode_genotypes_partition(self, partition_index):
# FIXME we should be doing these one at a time, reading back in the genotypes
# like we do for local alleles
partition = self.metadata.partitions[partition_index]
gt = self.init_partition_array(partition_index, "call_genotype")
gt_mask = self.init_partition_array(partition_index, "call_genotype_mask")
gt_phased = self.init_partition_array(partition_index, "call_genotype_phased")

partition = self.metadata.partitions[partition_index]

source_field = self.icf.fields["FORMAT/GT"]
for value in source_field.iter_values(partition.start, partition.stop):
j = gt.next_buffer_row()
@@ -899,13 +895,23 @@ def encode_genotypes_partition(self, partition_index):
icf.sanitise_value_int_1d(
gt_phased.buff, j, value[:, -1] if value is not None else None
)
# TODO check is this the correct semantics when we are padding
# with mixed ploidies?
j = gt_mask.next_buffer_row()
gt_mask.buff[j] = gt.buff[j] < 0

self.finalise_partition_array(partition_index, gt)
self.finalise_partition_array(partition_index, gt_phased)

# Read back in the genotypes so we can compute the mask
gt_mask = self.init_partition_array(partition_index, "call_genotype_mask")
gt_array = zarr.open_array(
store=self.wip_partition_array_path(partition_index, "call_genotype"),
mode="r",
)
for genotypes in core.first_dim_slice_iter(
gt_array, partition.start, partition.stop
):
# TODO check is this the correct semantics when we are padding
# with mixed ploidies?
j = gt_mask.next_buffer_row()
gt_mask.buff[j] = genotypes < 0
self.finalise_partition_array(partition_index, gt_mask)

def encode_local_alleles_partition(self, partition_index):
Expand Down

0 comments on commit f084469

Please sign in to comment.