Merge pull request #748 from benjeffery/sgkit-sampledata

Minimum viable sgkit dataset
tskit-dev · Nov 30, 2022 · 6ca6edc
2 parents 53b1866 + 38cd717
commit 6ca6edc
Show file tree

Hide file tree

Showing 11 changed files with 350 additions and 69 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -2,7 +2,7 @@ version: 2
 jobs:
   build:
     docker:
-      - image: cimg/python:3.7
+      - image: cimg/python:3.8
     steps:
       - checkout
 

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -29,8 +29,9 @@ jobs:
     name: Python
     runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        python: [ 3.7, "3.10" ]
+        python: [ 3.8, "3.10" ]
         os:  [ macos-latest, ubuntu-latest, windows-latest ]
     defaults:
       run:
@@ -55,7 +56,7 @@ jobs:
             /usr/share/miniconda/envs/anaconda-client-env
             ~/osx-conda
             ~/.profile
-          key: ${{ runner.os }}-${{ matrix.python}}-conda-v12-${{ hashFiles('requirements/CI-tests-conda/requirements.txt') }}
+          key: ${{ runner.os }}-${{ matrix.python}}-conda-v13-${{ hashFiles('requirements/CI-tests-conda/requirements.txt') }}
 
       - name: Install Conda
         uses: conda-incubator/setup-miniconda@v2
@@ -85,6 +86,13 @@ jobs:
         shell: bash -l {0} #We need a login shell to get conda
         run: conda install --yes --file=requirements/CI-tests-conda/requirements.txt
 
+      - name: Install cyvcf2 #Fails if done via conda due to no windows support.
+        if: steps.cache.outputs.cache-hit != 'true' && matrix.os != 'windows-latest'
+        run: |
+          source ~/.profile
+          conda activate anaconda-client-env
+          pip install cyvcf2==0.30.18
+
       - name: Fix OSX Cache Write #OSX Won't let the cache restore due to file perms
         if: steps.cache.outputs.cache-hit != 'true' && matrix.os == 'macos-latest'
         run: |

diff --git a/.mergify.yml b/.mergify.yml
@@ -4,11 +4,11 @@ queue_rules:
       - "#approved-reviews-by>=1"
       - "#changes-requested-reviews-by=0"
       - status-success=Lint
-      - status-success=Python (3.7, macos-latest)
+      - status-success=Python (3.8, macos-latest)
       - status-success=Python (3.10, macos-latest)
-      - status-success=Python (3.7, ubuntu-latest)
+      - status-success=Python (3.8, ubuntu-latest)
       - status-success=Python (3.10, ubuntu-latest)
-      - status-success=Python (3.7, windows-latest)
+      - status-success=Python (3.8, windows-latest)
       - status-success=Python (3.10, windows-latest)
       - "status-success=ci/circleci: build"
 
@@ -21,11 +21,11 @@ pull_request_rules:
       - base=main
       - label=AUTOMERGE-REQUESTED
       - status-success=Lint
-      - status-success=Python (3.7, macos-latest)
+      - status-success=Python (3.8, macos-latest)
       - status-success=Python (3.10, macos-latest)
-      - status-success=Python (3.7, ubuntu-latest)
+      - status-success=Python (3.8, ubuntu-latest)
       - status-success=Python (3.10, ubuntu-latest)
-      - status-success=Python (3.7, windows-latest)
+      - status-success=Python (3.8, windows-latest)
       - status-success=Python (3.10, windows-latest)
       - "status-success=ci/circleci: build"
     actions:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,11 @@
 
 In development
 
+**Breaking Changes**
+
+- Remove the `uuid` field from SampleData. SampleData equality is now purely based
+  on data. ({pr}`748`, {user}`benjeffery`)
+
 **Performance improvements**
 
 - Reduce memory usage when running `match_samples` against large cohorts 

diff --git a/requirements/CI-tests-complete/requirements.txt b/requirements/CI-tests-complete/requirements.txt
@@ -7,17 +7,18 @@ h5py==3.6.0
 humanize==4.1.0
 lmdb==1.3.0
 matplotlib==3.4.1
-meson==0.62.0 
+meson==0.62.0
 msprime==1.1.1
 numpy==1.21.6
-pandas==1.2.5
+pandas==1.3.5
 pytest==7.1.2
 pytest-cov==3.0.0
 pytest-xdist==2.5.0
 seaborn==0.11.2
-setuptools==65.4.1
+setuptools==65.5.0
+sgkit[vcf]==0.5.0
 sortedcontainers==2.4.0
 tqdm==4.64.0
 tskit==0.5.3
 twine==4.0.1
-zarr==2.11.3
+zarr==2.10.3
diff --git a/requirements/CI-tests-conda/requirements.txt b/requirements/CI-tests-conda/requirements.txt
@@ -9,7 +9,8 @@ numcodecs==0.10.2
 pytest==7.2.0
 python-lmdb==1.3.0 
 seaborn==0.12.1
-sortedcontainers==2.4.0 
+sgkit==0.5.0
+sortedcontainers==2.4.0
 tqdm==4.64.1
 tskit==0.5.3 
 zarr==2.13.3

diff --git a/requirements/development.txt b/requirements/development.txt
@@ -2,7 +2,8 @@ attrs
 codecov
 coverage
 flake8
-numpy
+# Held at < 1.22 for sgkit compat
+numpy<1.22
 six
 tqdm
 humanize
@@ -29,8 +30,11 @@ setuptools>=45
 setuptools_scm
 cyvcf2
 # Needed for evaluation script.
-pandas
+# Held at < 1.4.0 for sgkit compat
+pandas<1.4.0
 matplotlib
 seaborn
 colorama
-
+sgkit[vcf]
+# Held at zarr<2.11.0,>=2.10.0 for sgkit compat
+zarr<2.11.0,>=2.10.0
diff --git a/tests/test_formats.py b/tests/test_formats.py
@@ -1077,7 +1077,6 @@ def test_copy_new_uuid(self):
         data.finalise()
         copy = data.copy()
         copy.finalise()
-        assert copy.uuid != data.uuid
         assert copy.data_equal(data)
 
     def test_copy_update_sites_time(self):
@@ -1921,8 +1920,6 @@ def verify_data_round_trip(self, sample_data, ancestor_data, ancestors):
         ancestor_data.record_provenance("verify_data_round_trip")
         ancestor_data.finalise()
 
-        assert len(ancestor_data.uuid) > 0
-        assert ancestor_data.sample_data_uuid == sample_data.uuid
         assert ancestor_data.sequence_length == sample_data.sequence_length
         assert ancestor_data.format_name == formats.AncestorData.FORMAT_NAME
         assert ancestor_data.format_version == formats.AncestorData.FORMAT_VERSION
@@ -2195,11 +2192,9 @@ def test_bad_insert_proxy_samples(self):
     def test_insert_proxy_bad_sample_data(self):
         sample_data, _ = self.get_example_data(10, 10, 40)
         ancestors = tsinfer.generate_ancestors(sample_data)
-        # by default, sample_data must be the same
         sd_copy, _ = self.get_example_data(10, 10, num_ancestors=40)
-        with pytest.raises(ValueError):
-            ancestors.insert_proxy_samples(sd_copy)
-        # But works if we don't require same data
+        ancestors.insert_proxy_samples(sd_copy)
+        # Deprecated flag should change nothing
         ancestors.insert_proxy_samples(sd_copy, require_same_sample_data=False)
         # Unless seq lengths differ
         sd_copy, _ = self.get_example_data(10, sequence_length=11, num_ancestors=40)
@@ -2229,8 +2224,8 @@ def test_insert_proxy_no_samples(self):
         sample_data, _ = self.get_example_data(10, 10, 40)
         ancestors = tsinfer.generate_ancestors(sample_data)
         ancestors_extra = ancestors.insert_proxy_samples(sample_data, sample_ids=[])
-        assert ancestors != ancestors_extra  # UUIDs should differ ...
-        assert ancestors.data_equal(ancestors_extra)  # but data be identical
+        assert ancestors == ancestors_extra  # Equality based on data
+        assert ancestors.data_equal(ancestors_extra)  # data should be identical
 
     def test_insert_proxy_1_sample(self):
         sample_data, _ = self.get_example_data(10, 10, 40)

diff --git a/tests/test_sgkit.py b/tests/test_sgkit.py
@@ -0,0 +1,50 @@
+#
+# Copyright (C) 2022 University of Oxford
+#
+# This file is part of tsinfer.
+#
+# tsinfer is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# tsinfer is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with tsinfer.  If not, see <http://www.gnu.org/licenses/>.
+#
+"""
+Tests for the data files.
+"""
+import sys
+
+import msprime
+import numpy as np
+import pytest
+
+import tsinfer
+
+
+@pytest.mark.skipif(sys.platform == "win32", reason="No cyvcf2 on windows")
+def test_sgkit_dataset(tmp_path):
+    import sgkit.io.vcf
+
+    ts = msprime.sim_ancestry(
+        samples=50,
+        ploidy=3,
+        recombination_rate=0.25,
+        sequence_length=50,
+        random_seed=100,
+    )
+    ts = msprime.sim_mutations(ts, rate=0.025, model=msprime.BinaryMutationModel())
+    with open(tmp_path / "data.vcf", "w") as f:
+        ts.write_vcf(f)
+    sgkit.io.vcf.vcf_to_zarr(
+        tmp_path / "data.vcf", tmp_path / "data.zarr", ploidy=3, max_alt_alleles=1
+    )
+    samples = tsinfer.SgkitSampleData(tmp_path / "data.zarr")
+    inf_ts = tsinfer.infer(samples)
+    assert np.array_equal(ts.genotype_matrix(), inf_ts.genotype_matrix())
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -2,7 +2,7 @@ version: 2
     jobs:
       build:
         docker:
-          - image: cimg/python:3.7
+          - image: cimg/python:3.8
         steps:
           - checkout
@@ Expand Down @@