22
22
import xarray as xr
23
23
from cyvcf2 import VCF , Variant
24
24
from numcodecs import PackBits
25
+ from typing_extensions import Literal
25
26
26
27
from sgkit import variables
27
28
from sgkit .io .dataset import load_dataset
28
29
from sgkit .io .utils import zarrs_to_dataset
29
30
from sgkit .io .vcf import partition_into_regions
30
31
from sgkit .io .vcf .utils import build_url , chunks , temporary_directory , url_filename
31
- from sgkit .io .vcfzarr_reader import vcf_number_to_dimension_and_size
32
+ from sgkit .io .vcfzarr_reader import (
33
+ concat_zarrs_optimized ,
34
+ vcf_number_to_dimension_and_size ,
35
+ )
32
36
from sgkit .model import (
33
37
DIM_PLOIDY ,
34
38
DIM_SAMPLE ,
@@ -529,6 +533,7 @@ def vcf_to_zarr_parallel(
529
533
fields : Optional [Sequence [str ]] = None ,
530
534
exclude_fields : Optional [Sequence [str ]] = None ,
531
535
field_defs : Optional [Dict [str , Dict [str , Any ]]] = None ,
536
+ concat_algorithm : Optional [Literal ["xarray_internal" ]] = None ,
532
537
) -> None :
533
538
"""Convert specified regions of one or more VCF files to zarr files, then concat, rechunk, write to zarr"""
534
539
@@ -557,11 +562,15 @@ def vcf_to_zarr_parallel(
557
562
field_defs = field_defs ,
558
563
)
559
564
560
- ds = zarrs_to_dataset (paths , chunk_length , chunk_width , tempdir_storage_options )
561
-
562
- # Ensure Dask task graph is efficient, see https://github.com/dask/dask/issues/5105
563
- with dask .config .set ({"optimization.fuse.ave-width" : dask_fuse_avg_width }):
564
- ds .to_zarr (output , mode = "w" )
565
+ concat_zarrs (
566
+ paths ,
567
+ output ,
568
+ concat_algorithm = concat_algorithm ,
569
+ chunk_length = chunk_length ,
570
+ chunk_width = chunk_width ,
571
+ storage_options = tempdir_storage_options ,
572
+ dask_fuse_avg_width = dask_fuse_avg_width ,
573
+ )
565
574
566
575
567
576
def vcf_to_zarrs (
@@ -703,6 +712,64 @@ def vcf_to_zarrs(
703
712
return parts
704
713
705
714
715
def concat_zarrs(
    urls: Sequence[str],
    output: Union[PathType, MutableMapping[str, bytes]],
    *,
    concat_algorithm: Optional[Literal["xarray_internal"]] = None,
    chunk_length: int = 10_000,
    chunk_width: int = 1_000,
    storage_options: Optional[Dict[str, str]] = None,
    dask_fuse_avg_width: int = 50,
) -> None:
    """Concatenate multiple Zarr stores into a single Zarr store.

    The Zarr stores are concatenated and rechunked to produce a single combined store.

    Parameters
    ----------
    urls
        A list of URLs to the Zarr stores to combine, typically the return value of
        :func:`vcf_to_zarrs`. The optimized algorithm inspects the first store to
        discover the variables, so at least one URL is required.
    output
        Zarr store or path to directory in file system.
    concat_algorithm
        The algorithm to use to concatenate and rechunk Zarr files. The default None means
        use the optimized version suitable for large files, whereas ``xarray_internal`` will
        use built-in Xarray APIs, which can exhibit high memory usage, see https://github.com/dask/dask/issues/6745.
    chunk_length
        Length (number of variants) of chunks in which data are stored, by default 10,000.
        This is only used when ``concat_algorithm`` is ``xarray_internal``.
    chunk_width
        Width (number of samples) to use when storing chunks in output, by default 1,000.
        This is only used when ``concat_algorithm`` is ``xarray_internal``.
    storage_options
        Any additional parameters for the storage backend (see ``fsspec.open``).
        With the optimized algorithm these are used to open the first store only;
        they are not forwarded to the concatenation routine itself.
    dask_fuse_avg_width
        Setting for Dask's ``optimization.fuse.ave-width``, see https://github.com/dask/dask/issues/5105
        This is only used when ``concat_algorithm`` is ``xarray_internal``.

    Raises
    ------
    ValueError
        If ``concat_algorithm`` is neither None nor ``xarray_internal``.
    """
    # Reject unknown values up front: without this check a misspelled algorithm
    # name would silently select the optimized path.
    if concat_algorithm not in (None, "xarray_internal"):
        raise ValueError(
            f"Unknown concat_algorithm {concat_algorithm!r}; "
            "expected None or 'xarray_internal'"
        )

    if concat_algorithm == "xarray_internal":
        ds = zarrs_to_dataset(urls, chunk_length, chunk_width, storage_options)

        # Ensure Dask task graph is efficient, see https://github.com/dask/dask/issues/5105
        with dask.config.set({"optimization.fuse.ave-width": dask_fuse_avg_width}):
            ds.to_zarr(output, mode="w")
        return

    vars_to_rechunk = []
    vars_to_copy = []
    storage_options = storage_options or {}
    # Only the first store is opened here; it is used to decide how each variable
    # is handled.
    ds = xr.open_zarr(  # type: ignore[no-untyped-call]
        fsspec.get_mapper(urls[0], **storage_options), concat_characters=False
    )
    for var, arr in ds.data_vars.items():
        # Variables whose leading dimension is "variants" go in the rechunk list;
        # everything else (including variables with no dimensions, which would
        # otherwise raise IndexError on ``dims[0]``) goes in the copy list.
        if arr.dims and arr.dims[0] == "variants":
            vars_to_rechunk.append(var)
        else:
            vars_to_copy.append(var)

    concat_zarrs_optimized(urls, output, vars_to_rechunk, vars_to_copy)
771
+
772
+
706
773
def vcf_to_zarr (
707
774
input : Union [PathType , Sequence [PathType ]],
708
775
output : Union [PathType , MutableMapping [str , bytes ]],
@@ -723,6 +790,7 @@ def vcf_to_zarr(
723
790
fields : Optional [Sequence [str ]] = None ,
724
791
exclude_fields : Optional [Sequence [str ]] = None ,
725
792
field_defs : Optional [Dict [str , Dict [str , Any ]]] = None ,
793
+ concat_algorithm : Optional [Literal ["xarray_internal" ]] = None ,
726
794
) -> None :
727
795
"""Convert VCF files to a single Zarr on-disk store.
728
796
@@ -735,8 +803,7 @@ def vcf_to_zarr(
735
803
is None.
736
804
737
805
For more control over these two steps, consider using :func:`vcf_to_zarrs` followed by
738
- :func:`zarrs_to_dataset`, then saving the dataset using Xarray's
739
- :meth:`xarray.Dataset.to_zarr` method.
806
+ :func:`concat_zarrs`.
740
807
741
808
Parameters
742
809
----------
@@ -811,6 +878,10 @@ def vcf_to_zarr(
811
878
(which is defined as Number 2 in the VCF header) as ``haplotypes``.
812
879
(Note that Number ``A`` is the number of alternate alleles, see section 1.4.2 of the
813
880
VCF spec https://samtools.github.io/hts-specs/VCFv4.3.pdf.)
881
+ concat_algorithm
882
+ The algorithm to use to concatenate and rechunk Zarr files. The default None means
883
+ use the optimized version suitable for large files, whereas ``xarray_internal`` will
884
+ use built-in Xarray APIs, which can exhibit high memory usage, see https://github.com/dask/dask/issues/6745.
814
885
"""
815
886
816
887
if temp_chunk_length is not None :
@@ -842,6 +913,7 @@ def vcf_to_zarr(
842
913
temp_chunk_length = temp_chunk_length ,
843
914
tempdir = tempdir ,
844
915
tempdir_storage_options = tempdir_storage_options ,
916
+ concat_algorithm = concat_algorithm ,
845
917
)
846
918
convert_func (
847
919
input , # type: ignore
0 commit comments