diff --git a/.circleci/config.yml b/.circleci/config.yml index 995d9aaf..329b0a22 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build: docker: - - image: cimg/python:3.7 + - image: cimg/python:3.8 steps: - checkout diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f1bba77a..5463e6f0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -29,8 +29,9 @@ jobs: name: Python runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: - python: [ 3.7, "3.10" ] + python: [ 3.8, "3.10" ] os: [ macos-latest, ubuntu-latest, windows-latest ] defaults: run: @@ -55,7 +56,7 @@ jobs: /usr/share/miniconda/envs/anaconda-client-env ~/osx-conda ~/.profile - key: ${{ runner.os }}-${{ matrix.python}}-conda-v12-${{ hashFiles('requirements/CI-tests-conda/requirements.txt') }} + key: ${{ runner.os }}-${{ matrix.python}}-conda-v13-${{ hashFiles('requirements/CI-tests-conda/requirements.txt') }} - name: Install Conda uses: conda-incubator/setup-miniconda@v2 @@ -85,6 +86,13 @@ jobs: shell: bash -l {0} #We need a login shell to get conda run: conda install --yes --file=requirements/CI-tests-conda/requirements.txt + - name: Install cyvcf2 #Fails if done via conda due to no windows support. 
+ if: steps.cache.outputs.cache-hit != 'true' && matrix.os != 'windows-latest' + run: | + source ~/.profile + conda activate anaconda-client-env + pip install cyvcf2==0.30.18 + - name: Fix OSX Cache Write #OSX Won't let the cache restore due to file perms if: steps.cache.outputs.cache-hit != 'true' && matrix.os == 'macos-latest' run: | diff --git a/.mergify.yml b/.mergify.yml index 59affc9c..fae6ff9e 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -4,11 +4,11 @@ queue_rules: - "#approved-reviews-by>=1" - "#changes-requested-reviews-by=0" - status-success=Lint - - status-success=Python (3.7, macos-latest) + - status-success=Python (3.8, macos-latest) - status-success=Python (3.10, macos-latest) - - status-success=Python (3.7, ubuntu-latest) + - status-success=Python (3.8, ubuntu-latest) - status-success=Python (3.10, ubuntu-latest) - - status-success=Python (3.7, windows-latest) + - status-success=Python (3.8, windows-latest) - status-success=Python (3.10, windows-latest) - "status-success=ci/circleci: build" @@ -21,11 +21,11 @@ pull_request_rules: - base=main - label=AUTOMERGE-REQUESTED - status-success=Lint - - status-success=Python (3.7, macos-latest) + - status-success=Python (3.8, macos-latest) - status-success=Python (3.10, macos-latest) - - status-success=Python (3.7, ubuntu-latest) + - status-success=Python (3.8, ubuntu-latest) - status-success=Python (3.10, ubuntu-latest) - - status-success=Python (3.7, windows-latest) + - status-success=Python (3.8, windows-latest) - status-success=Python (3.10, windows-latest) - "status-success=ci/circleci: build" actions: diff --git a/CHANGELOG.md b/CHANGELOG.md index e9b8fd5d..b26fb161 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ In development +**Breaking Changes** + +- Remove the `uuid` field from SampleData. SampleData equality is now purely based + on data. 
({pr}`748`, {user}`benjeffery`) + **Performance improvements** - Reduce memory usage when running `match_samples` against large cohorts diff --git a/requirements/CI-tests-complete/requirements.txt b/requirements/CI-tests-complete/requirements.txt index 5c859cf8..4d3a6673 100644 --- a/requirements/CI-tests-complete/requirements.txt +++ b/requirements/CI-tests-complete/requirements.txt @@ -7,17 +7,18 @@ h5py==3.6.0 humanize==4.1.0 lmdb==1.3.0 matplotlib==3.4.1 -meson==0.62.0 +meson==0.62.0 msprime==1.1.1 numpy==1.21.6 -pandas==1.2.5 +pandas==1.3.5 pytest==7.1.2 pytest-cov==3.0.0 pytest-xdist==2.5.0 seaborn==0.11.2 -setuptools==65.4.1 +setuptools==65.5.0 +sgkit[vcf]==0.5.0 sortedcontainers==2.4.0 tqdm==4.64.0 tskit==0.5.3 twine==4.0.1 -zarr==2.11.3 +zarr==2.10.3 diff --git a/requirements/CI-tests-conda/requirements.txt b/requirements/CI-tests-conda/requirements.txt index b92c2901..38104cdb 100644 --- a/requirements/CI-tests-conda/requirements.txt +++ b/requirements/CI-tests-conda/requirements.txt @@ -9,7 +9,8 @@ numcodecs==0.10.2 pytest==7.2.0 python-lmdb==1.3.0 seaborn==0.12.1 -sortedcontainers==2.4.0 +sgkit==0.5.0 +sortedcontainers==2.4.0 tqdm==4.64.1 tskit==0.5.3 zarr==2.13.3 diff --git a/requirements/development.txt b/requirements/development.txt index 52523426..a1e161e0 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -2,7 +2,8 @@ attrs codecov coverage flake8 -numpy +# Held at < 1.22 for sgkit compat +numpy<1.22 six tqdm humanize @@ -29,8 +30,11 @@ setuptools>=45 setuptools_scm cyvcf2 # Needed for evaluation script. 
-pandas +# Held at < 1.4.0 for sgkit compat +pandas<1.4.0 matplotlib seaborn colorama - +sgkit[vcf] +# Held at zarr<2.11.0,>=2.10.0 for sgkit compat +zarr<2.11.0,>=2.10.0 \ No newline at end of file diff --git a/tests/test_formats.py b/tests/test_formats.py index 6cb48e9d..e9b21220 100644 --- a/tests/test_formats.py +++ b/tests/test_formats.py @@ -1077,7 +1077,6 @@ def test_copy_new_uuid(self): data.finalise() copy = data.copy() copy.finalise() - assert copy.uuid != data.uuid assert copy.data_equal(data) def test_copy_update_sites_time(self): @@ -1921,8 +1920,6 @@ def verify_data_round_trip(self, sample_data, ancestor_data, ancestors): ancestor_data.record_provenance("verify_data_round_trip") ancestor_data.finalise() - assert len(ancestor_data.uuid) > 0 - assert ancestor_data.sample_data_uuid == sample_data.uuid assert ancestor_data.sequence_length == sample_data.sequence_length assert ancestor_data.format_name == formats.AncestorData.FORMAT_NAME assert ancestor_data.format_version == formats.AncestorData.FORMAT_VERSION @@ -2195,11 +2192,9 @@ def test_bad_insert_proxy_samples(self): def test_insert_proxy_bad_sample_data(self): sample_data, _ = self.get_example_data(10, 10, 40) ancestors = tsinfer.generate_ancestors(sample_data) - # by default, sample_data must be the same sd_copy, _ = self.get_example_data(10, 10, num_ancestors=40) - with pytest.raises(ValueError): - ancestors.insert_proxy_samples(sd_copy) - # But works if we don't require same data + ancestors.insert_proxy_samples(sd_copy) + # Deprecated flag should change nothing ancestors.insert_proxy_samples(sd_copy, require_same_sample_data=False) # Unless seq lengths differ sd_copy, _ = self.get_example_data(10, sequence_length=11, num_ancestors=40) @@ -2229,8 +2224,8 @@ def test_insert_proxy_no_samples(self): sample_data, _ = self.get_example_data(10, 10, 40) ancestors = tsinfer.generate_ancestors(sample_data) ancestors_extra = ancestors.insert_proxy_samples(sample_data, sample_ids=[]) - assert ancestors != 
ancestors_extra # UUIDs should differ ... - assert ancestors.data_equal(ancestors_extra) # but data be identical + assert ancestors == ancestors_extra # Equality based on data + assert ancestors.data_equal(ancestors_extra) # data should be identical def test_insert_proxy_1_sample(self): sample_data, _ = self.get_example_data(10, 10, 40) diff --git a/tests/test_sgkit.py b/tests/test_sgkit.py new file mode 100644 index 00000000..177cf405 --- /dev/null +++ b/tests/test_sgkit.py @@ -0,0 +1,50 @@ +# +# Copyright (C) 2022 University of Oxford +# +# This file is part of tsinfer. +# +# tsinfer is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# tsinfer is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with tsinfer. If not, see . +# +""" +Tests for the data files. 
+""" +import sys + +import msprime +import numpy as np +import pytest + +import tsinfer + + +@pytest.mark.skipif(sys.platform == "win32", reason="No cyvcf2 on windows") +def test_sgkit_dataset(tmp_path): + import sgkit.io.vcf + + ts = msprime.sim_ancestry( + samples=50, + ploidy=3, + recombination_rate=0.25, + sequence_length=50, + random_seed=100, + ) + ts = msprime.sim_mutations(ts, rate=0.025, model=msprime.BinaryMutationModel()) + with open(tmp_path / "data.vcf", "w") as f: + ts.write_vcf(f) + sgkit.io.vcf.vcf_to_zarr( + tmp_path / "data.vcf", tmp_path / "data.zarr", ploidy=3, max_alt_alleles=1 + ) + samples = tsinfer.SgkitSampleData(tmp_path / "data.zarr") + inf_ts = tsinfer.infer(samples) + assert np.array_equal(ts.genotype_matrix(), inf_ts.genotype_matrix()) diff --git a/tsinfer/formats.py b/tsinfer/formats.py index 7fcc7ceb..85c3c196 100644 --- a/tsinfer/formats.py +++ b/tsinfer/formats.py @@ -28,7 +28,6 @@ import queue import sys import threading -import uuid import warnings import attr @@ -385,7 +384,6 @@ def __init__( self.data = zarr.open_group(store=store, mode="w") self.data.attrs[FORMAT_NAME_KEY] = self.FORMAT_NAME self.data.attrs[FORMAT_VERSION_KEY] = self.FORMAT_VERSION - self.data.attrs["uuid"] = str(uuid.uuid4()) chunks = self._chunk_size provenances_group = self.data.create_group("provenances") @@ -486,8 +484,7 @@ def copy(self, path=None, max_file_size=None): """ Returns a copy of this DataContainer opened in 'edit' mode. If path is specified, this must not be equal to the path of the current - data container. The new container will have a different UUID to the - current. + data container. 
""" if self._mode != self.READ_MODE: raise ValueError("Cannot copy unless in read mode.") @@ -516,8 +513,6 @@ def copy(self, path=None, max_file_size=None): store = other._new_lmdb_store(max_file_size) zarr.copy_store(self.data.store, store) other.data = zarr.group(store) - # Set a new UUID - other.data.attrs["uuid"] = str(uuid.uuid4()) other.data.attrs[FINALISED_KEY] = False other._mode = self.EDIT_MODE return other @@ -663,10 +658,6 @@ def finalised(self): ret = self.data.attrs[FINALISED_KEY] return ret - @property - def uuid(self): - return str(self.data.attrs["uuid"]) - @property def num_provenances(self): return self.provenances_timestamp.shape[0] @@ -693,7 +684,7 @@ def _format_str(self, values): def __eq__(self, other): ret = NotImplemented if isinstance(other, type(self)): - ret = self.uuid == other.uuid and self.data_equal(other) + ret = self.data_equal(other) return ret def __str__(self): @@ -703,7 +694,6 @@ def __str__(self): ("format_name", self.format_name), ("format_version", self.format_version), ("finalised", self.finalised), - ("uuid", self.uuid), ("num_provenances", self.num_provenances), ("provenances/timestamp", zarr_summary(self.provenances_timestamp)), ("provenances/record", zarr_summary(self.provenances_record)), @@ -1320,10 +1310,10 @@ def data_equal(self, other): """ Returns True if all the data attributes of this input file and the specified input file are equal. This compares every attribute except - the UUID and provenance. + the provenance. To compare two :class:`SampleData` instances for exact equality of - all data including UUIDs and provenance data, use ``s1 == s2``. + all data including provenance data, use ``s1 == s2``. :param SampleData other: The other :class:`SampleData` instance to compare with. 
@@ -2100,7 +2090,7 @@ def variants(self, sites=None, recode_ancestral=None): genos = geno_map[genos] yield Variant(site=site, alleles=alleles, genotypes=genos) - def __all_haplotypes(self, sites=None, recode_ancestral=None): + def _all_haplotypes(self, sites=None, recode_ancestral=None): # We iterate over chunks vertically here, and it's not worth complicating # the chunk iterator to handle this. if recode_ancestral is None: @@ -2156,7 +2146,7 @@ def haplotypes(self, samples=None, sites=None, recode_ancestral=None): raise ValueError("Sample index too large.") j = 0 - for index, a in self.__all_haplotypes(sites, recode_ancestral): + for index, a in self._all_haplotypes(sites, recode_ancestral): if j == len(samples): break if index == samples[j]: @@ -2225,6 +2215,257 @@ def populations(self): yield Population(j, metadata=metadata) +class SgkitSampleData(SampleData): + + FORMAT_NAME = "tsinfer-sgkit-sample-data" + FORMAT_VERSION = (0, 1) + + def __init__(self, path): + self.path = path + self.data = zarr.open(path, mode="r") + genotypes_arr = self.data["call_genotype"] + self._num_sites, self._num_individuals, self.ploidy = genotypes_arr.shape + self._num_samples = self._num_individuals * self.ploidy + + assert self.ploidy == self.data["call_genotype"].chunks[2] + + def __metadata_schema_getter(self, zarr_group): + try: + return self.data[zarr_group].attrs["metadata_schema"] + except KeyError: + return {"codec": "json"} + + @property + def format_name(self): + return self.FORMAT_NAME + + @property + def format_version(self): + return self.FORMAT_VERSION + + @property + def finalised(self): + return True + + @property + def sequence_length(self): + try: + return self.data.attrs["sequence_length"] + except KeyError: + return int(np.max(self.data["variant_position"])) + 1 + + @property + def num_sites(self): + return self._num_sites + + @property + def sites_metadata_schema(self): + return self.__metadata_schema_getter("sites") + + @property + def sites_metadata(self): + 
try: + return self.data["sites/metadata"] + except KeyError: + return zarr.array( + [{}] * self.num_sites, object_codec=numcodecs.JSON() + ) + + @property + def sites_time(self): + try: + return self.data["sites/time"] + except KeyError: + return np.full(self.data["variant_position"].shape, tskit.UNKNOWN_TIME) + + @property + def sites_position(self): + return self.data["variant_position"] + + @property + def sites_alleles(self): + return self.data["variant_allele"] + + @property + def sites_ancestral_allele(self): + try: + return self.data["variant_ancestral_allele_index"] + except KeyError: + # Maintains backwards compatibility: in previous tsinfer versions the + # ancestral allele was always the zeroth element in the alleles list + return np.zeros(self.num_sites, dtype=np.int8) + + @property + def sites_genotypes(self): + gt = self.data["call_genotype"] + # This method is only used for test/debug so we retrieve and + # reshape the entire array. + return gt[...].reshape(gt.shape[0], gt.shape[1] * gt.shape[2]) + + @property + def provenances_timestamp(self): + try: + return self.data["provenances_timestamp"] + except KeyError: + return np.array([], dtype=object) + + @property + def provenances_record(self): + try: + return self.data["provenances_record"] + except KeyError: + return np.array([], dtype=object) + + @property + def num_samples(self): + return self._num_samples + + @property + def samples_individual(self): + ret = np.zeros((self.num_samples), dtype=np.int32) + for p in range(self.ploidy): + ret[p :: self.ploidy] = np.arange(self.num_individuals) + return ret + + @property + def metadata_schema(self): + try: + return self.data.attrs["metadata_schema"] + except KeyError: + return None + + @property + def metadata(self): + try: + return self.data.attrs["metadata"] + except KeyError: + return b"" + + @property + def populations_metadata(self): + try: + return self.data["populations/metadata"] + except KeyError: + return np.array([], dtype=object) + + 
@property + def populations_metadata_schema(self): + return self.__metadata_schema_getter("populations") + + @property + def num_individuals(self): + return self._num_individuals + + @property + def individuals_time(self): + try: + return self.data["individuals/time"] + except KeyError: + return np.full(self.num_individuals, tskit.UNKNOWN_TIME) + + @property + def individuals_metadata_schema(self): + return self.__metadata_schema_getter("individuals") + + @property + def individuals_metadata(self): + try: + return self.data["individuals/metadata"] + except KeyError: + return zarr.array( + [{}] * self.num_individuals, object_codec=numcodecs.JSON() + ) + + @property + def individuals_location(self): + try: + return self.data["individuals/location"] + except KeyError: + return zarr.array([[]] * self.num_individuals, dtype=float) + + @property + def individuals_population(self): + try: + return self.data["individuals/population"] + except KeyError: + return np.full((self.num_individuals), tskit.NULL, dtype=np.int32) + + @property + def individuals_flags(self): + try: + return self.data["individuals/flags"] + except KeyError: + return np.full((self.num_individuals), 0, dtype=np.int32) + + def variants(self, sites=None, recode_ancestral=None): + """ + Returns an iterator over the :class:`Variant` objects. This is equivalent to + the :meth:`tskit.TreeSequence.variants` iterator. If recode_ancestral is + ``True``, the ``.alleles`` attribute of each variant is guaranteed to return + the alleles in an order such that the ancestral state is the first item + in the list. In this case, ``variant.alleles`` may list the alleles in a + different order from the input order as listed in ``variant.site.alleles``, + and the values in genotypes array will be recoded so that the ancestral + state will have a genotype of 0. If the ancestral state is unknown, the + original input order is kept. + + :param array sites: A numpy array of ascending site ids for which to return + data. 
If None (default) return all sites. + :param bool recode_ancestral: If True, recode genotypes at sites where the + ancestral state is known such that the ancestral state is coded as 0, + as described above. Otherwise return genotypes in the input allele encoding. + Default: ``None`` treated as ``False``. + :return: An iterator over the variants in the sample data file. + :rtype: iter(:class:`Variant`) + """ + if recode_ancestral is None: + recode_ancestral = False + all_genotypes = chunk_iterator(self.data["call_genotype"], indexes=sites) + assert MISSING_DATA < 0 # required for geno_map to remap MISSING_DATA + for genos, site in zip(all_genotypes, self.sites(ids=sites)): + # We have an extra ploidy dimension when coming from sgkit + genos = genos.reshape(self.num_samples) + aa = site.ancestral_allele + alleles = site.alleles + if aa != MISSING_DATA and aa > 0 and recode_ancestral: + # Need to recode this site + alleles = site.reorder_alleles() + # re-map the genotypes + geno_map = np.arange(len(alleles) - MISSING_DATA, dtype=genos.dtype) + geno_map[MISSING_DATA] = MISSING_DATA + geno_map[aa] = 0 + geno_map[0:aa] += 1 + genos = geno_map[genos] + yield Variant(site=site, alleles=alleles, genotypes=genos) + + def _all_haplotypes(self, sites=None, recode_ancestral=None): + # We iterate over chunks vertically here, and it's not worth complicating + # the chunk iterator to handle this. 
+ if recode_ancestral is None: + recode_ancestral = False + aa_index = self.sites_ancestral_allele[:] + # If ancestral allele is missing, keep the order unchanged (aa_index of zero) + aa_index[aa_index == MISSING_DATA] = 0 + gt = self.data["call_genotype"] + chunk_size = gt.chunks[1] + for j in range(self.num_individuals): + if j % chunk_size == 0: + chunk = gt[:, j : j + chunk_size, :] + indiv_gt = chunk[:, j % chunk_size, :] + for k in range(self.ploidy): + a = indiv_gt[:, k].T + if recode_ancestral: + # Remap the genotypes at all sites, depending on the aa_index + a = np.where( + a == aa_index, + 0, + np.where( + np.logical_and(a != MISSING_DATA, a < aa_index), a + 1, a + ), + ) + yield (j * self.ploidy) + k, a if sites is None else a[sites] + + @attr.s(order=False, eq=False) class Ancestor: """ @@ -2291,7 +2532,6 @@ def __init__(self, sample_data, **kwargs): super().__init__(**kwargs) sample_data._check_finalised() self.sample_data = sample_data - self.data.attrs["sample_data_uuid"] = sample_data.uuid if self.sample_data.sequence_length == 0: raise ValueError("Bad samples file: sequence_length cannot be zero") self.data.attrs["sequence_length"] = self.sample_data.sequence_length @@ -2371,7 +2611,6 @@ def summary(self): def __str__(self): values = [ ("sequence_length", self.sequence_length), - ("sample_data_uuid", self.sample_data_uuid), ("num_ancestors", self.num_ancestors), ("num_sites", self.num_sites), ("sites/position", zarr_summary(self.sites_position)), @@ -2386,12 +2625,10 @@ def __str__(self): def data_equal(self, other): """ Returns True if all the data attributes of this input file and the - specified input file are equal. This compares every attribute except - the UUID. + specified input file are equal. This compares every attribute. 
""" return ( self.sequence_length == other.sequence_length - and self.sample_data_uuid == other.sample_data_uuid and self.format_name == other.format_name and self.format_version == other.format_version and self.num_ancestors == other.num_ancestors @@ -2413,10 +2650,6 @@ def sequence_length(self): """ return self.data.attrs["sequence_length"] - @property - def sample_data_uuid(self): - return self.data.attrs["sample_data_uuid"] - @property def num_ancestors(self): return self.ancestors_start.shape[0] @@ -2531,14 +2764,7 @@ def insert_proxy_samples( (i.e. breaking the infinite sites assumption), allowing them to possess derived alleles at sites where there are no pre-existing mutations in older ancestors. - :param bool require_same_sample_data: If ``True`` (default) then the - the ``sample_data`` parameter must point to the same :class:`.SampleData` - instance as that used to generate the current ancestors. If ``False``, - this requirement is not enforced, and it is the user's responsibility - to ensure that the encoding of alleles in ``sample_data`` matches the - encoding in the current :class:`AncestorData` instance (i.e. that in the - original :class:`.SampleData` instance on which the current ancestors - are based). + :param bool require_same_sample_data: **Deprecated** Has no effect. :param \\**kwargs: Further arguments passed to the constructor when creating the new :class:`AncestorData` instance which will be returned. 
@@ -2547,11 +2773,6 @@ def insert_proxy_samples( """ self._check_finalised() sample_data._check_finalised() - if require_same_sample_data: - if sample_data.uuid != self.sample_data_uuid: - raise ValueError( - "sample_data differs from that used to build the initial ancestors" - ) if self.sequence_length != sample_data.sequence_length: raise ValueError("sample_data does not have the correct sequence length") used_sites = np.isin(sample_data.sites_position[:], self.sites_position[:]) @@ -2646,8 +2867,6 @@ def insert_proxy_samples( other.clear_provenances() for timestamp, record in self.provenances(): other.add_provenance(timestamp, record) - if sample_data.uuid != self.sample_data_uuid: - pass # TODO: if sample files don't match, we need extra provenance info other.record_provenance(command="insert_proxy_samples", **kwargs) assert other.num_ancestors == self.num_ancestors + len(sample_ids) diff --git a/tsinfer/inference.py b/tsinfer/inference.py index a55edec1..26301c02 100644 --- a/tsinfer/inference.py +++ b/tsinfer/inference.py @@ -485,9 +485,7 @@ def match_ancestors( for timestamp, record in ancestor_data.provenances(): tables.provenances.add_row(timestamp=timestamp, record=json.dumps(record)) if record_provenance: - record = provenance.get_provenance_dict( - command="match_ancestors", source={"uuid": ancestor_data.uuid} - ) + record = provenance.get_provenance_dict(command="match_ancestors") tables.provenances.add_row(record=json.dumps(record)) ts = tables.tree_sequence() return ts