diff --git a/.circleci/config.yml b/.circleci/config.yml index 995d9aaf..329b0a22 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build: docker: - - image: cimg/python:3.7 + - image: cimg/python:3.8 steps: - checkout diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f1bba77a..5463e6f0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -29,8 +29,9 @@ jobs: name: Python runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: - python: [ 3.7, "3.10" ] + python: [ 3.8, "3.10" ] os: [ macos-latest, ubuntu-latest, windows-latest ] defaults: run: @@ -55,7 +56,7 @@ jobs: /usr/share/miniconda/envs/anaconda-client-env ~/osx-conda ~/.profile - key: ${{ runner.os }}-${{ matrix.python}}-conda-v12-${{ hashFiles('requirements/CI-tests-conda/requirements.txt') }} + key: ${{ runner.os }}-${{ matrix.python}}-conda-v13-${{ hashFiles('requirements/CI-tests-conda/requirements.txt') }} - name: Install Conda uses: conda-incubator/setup-miniconda@v2 @@ -85,6 +86,13 @@ jobs: shell: bash -l {0} #We need a login shell to get conda run: conda install --yes --file=requirements/CI-tests-conda/requirements.txt + - name: Install cyvcf2 #Fails if done via conda due to no windows support. 
+ if: steps.cache.outputs.cache-hit != 'true' && matrix.os != 'windows-latest' + run: | + source ~/.profile + conda activate anaconda-client-env + pip install cyvcf2==0.30.18 + - name: Fix OSX Cache Write #OSX Won't let the cache restore due to file perms if: steps.cache.outputs.cache-hit != 'true' && matrix.os == 'macos-latest' run: | diff --git a/.mergify.yml b/.mergify.yml index 59affc9c..fae6ff9e 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -4,11 +4,11 @@ queue_rules: - "#approved-reviews-by>=1" - "#changes-requested-reviews-by=0" - status-success=Lint - - status-success=Python (3.7, macos-latest) + - status-success=Python (3.8, macos-latest) - status-success=Python (3.10, macos-latest) - - status-success=Python (3.7, ubuntu-latest) + - status-success=Python (3.8, ubuntu-latest) - status-success=Python (3.10, ubuntu-latest) - - status-success=Python (3.7, windows-latest) + - status-success=Python (3.8, windows-latest) - status-success=Python (3.10, windows-latest) - "status-success=ci/circleci: build" @@ -21,11 +21,11 @@ pull_request_rules: - base=main - label=AUTOMERGE-REQUESTED - status-success=Lint - - status-success=Python (3.7, macos-latest) + - status-success=Python (3.8, macos-latest) - status-success=Python (3.10, macos-latest) - - status-success=Python (3.7, ubuntu-latest) + - status-success=Python (3.8, ubuntu-latest) - status-success=Python (3.10, ubuntu-latest) - - status-success=Python (3.7, windows-latest) + - status-success=Python (3.8, windows-latest) - status-success=Python (3.10, windows-latest) - "status-success=ci/circleci: build" actions: diff --git a/CHANGELOG.md b/CHANGELOG.md index e9b8fd5d..b26fb161 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ In development +**Breaking Changes** + +- Remove the `uuid` field from SampleData. SampleData equality is now purely based + on data. 
({pr}`748`, {user}`benjeffery`) + **Performance improvements** - Reduce memory usage when running `match_samples` against large cohorts diff --git a/requirements/CI-tests-complete/requirements.txt b/requirements/CI-tests-complete/requirements.txt index 5c859cf8..4d3a6673 100644 --- a/requirements/CI-tests-complete/requirements.txt +++ b/requirements/CI-tests-complete/requirements.txt @@ -7,17 +7,18 @@ h5py==3.6.0 humanize==4.1.0 lmdb==1.3.0 matplotlib==3.4.1 -meson==0.62.0 +meson==0.62.0 msprime==1.1.1 numpy==1.21.6 -pandas==1.2.5 +pandas==1.3.5 pytest==7.1.2 pytest-cov==3.0.0 pytest-xdist==2.5.0 seaborn==0.11.2 -setuptools==65.4.1 +setuptools==65.5.0 +sgkit[vcf]==0.5.0 sortedcontainers==2.4.0 tqdm==4.64.0 tskit==0.5.3 twine==4.0.1 -zarr==2.11.3 +zarr==2.10.3 diff --git a/requirements/CI-tests-conda/requirements.txt b/requirements/CI-tests-conda/requirements.txt index b92c2901..38104cdb 100644 --- a/requirements/CI-tests-conda/requirements.txt +++ b/requirements/CI-tests-conda/requirements.txt @@ -9,7 +9,8 @@ numcodecs==0.10.2 pytest==7.2.0 python-lmdb==1.3.0 seaborn==0.12.1 -sortedcontainers==2.4.0 +sgkit==0.5.0 +sortedcontainers==2.4.0 tqdm==4.64.1 tskit==0.5.3 zarr==2.13.3 diff --git a/requirements/development.txt b/requirements/development.txt index 52523426..a1e161e0 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -2,7 +2,8 @@ attrs codecov coverage flake8 -numpy +# Held at < 1.22 for sgkit compat +numpy<1.22 six tqdm humanize @@ -29,8 +30,11 @@ setuptools>=45 setuptools_scm cyvcf2 # Needed for evaluation script. 
-pandas +# Held at < 1.4.0 for sgkit compat +pandas<1.4.0 matplotlib seaborn colorama - +sgkit[vcf] +# Held at zarr<2.11.0,>=2.10.0 for sgkit compat +zarr<2.11.0,>=2.10.0 \ No newline at end of file diff --git a/tests/test_formats.py b/tests/test_formats.py index 6cb48e9d..e9b21220 100644 --- a/tests/test_formats.py +++ b/tests/test_formats.py @@ -1077,7 +1077,6 @@ def test_copy_new_uuid(self): data.finalise() copy = data.copy() copy.finalise() - assert copy.uuid != data.uuid assert copy.data_equal(data) def test_copy_update_sites_time(self): @@ -1921,8 +1920,6 @@ def verify_data_round_trip(self, sample_data, ancestor_data, ancestors): ancestor_data.record_provenance("verify_data_round_trip") ancestor_data.finalise() - assert len(ancestor_data.uuid) > 0 - assert ancestor_data.sample_data_uuid == sample_data.uuid assert ancestor_data.sequence_length == sample_data.sequence_length assert ancestor_data.format_name == formats.AncestorData.FORMAT_NAME assert ancestor_data.format_version == formats.AncestorData.FORMAT_VERSION @@ -2195,11 +2192,9 @@ def test_bad_insert_proxy_samples(self): def test_insert_proxy_bad_sample_data(self): sample_data, _ = self.get_example_data(10, 10, 40) ancestors = tsinfer.generate_ancestors(sample_data) - # by default, sample_data must be the same sd_copy, _ = self.get_example_data(10, 10, num_ancestors=40) - with pytest.raises(ValueError): - ancestors.insert_proxy_samples(sd_copy) - # But works if we don't require same data + ancestors.insert_proxy_samples(sd_copy) + # Deprecated flag should change nothing ancestors.insert_proxy_samples(sd_copy, require_same_sample_data=False) # Unless seq lengths differ sd_copy, _ = self.get_example_data(10, sequence_length=11, num_ancestors=40) @@ -2229,8 +2224,8 @@ def test_insert_proxy_no_samples(self): sample_data, _ = self.get_example_data(10, 10, 40) ancestors = tsinfer.generate_ancestors(sample_data) ancestors_extra = ancestors.insert_proxy_samples(sample_data, sample_ids=[]) - assert ancestors != 
ancestors_extra # UUIDs should differ ... - assert ancestors.data_equal(ancestors_extra) # but data be identical + assert ancestors == ancestors_extra # Equality based on data + assert ancestors.data_equal(ancestors_extra) # data should be identical def test_insert_proxy_1_sample(self): sample_data, _ = self.get_example_data(10, 10, 40) diff --git a/tests/test_sgkit.py b/tests/test_sgkit.py new file mode 100644 index 00000000..177cf405 --- /dev/null +++ b/tests/test_sgkit.py @@ -0,0 +1,50 @@ +# +# Copyright (C) 2022 University of Oxford +# +# This file is part of tsinfer. +# +# tsinfer is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# tsinfer is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with tsinfer. If not, see . +# +""" +Tests for the data files. 
+""" +import sys + +import msprime +import numpy as np +import pytest + +import tsinfer + + +@pytest.mark.skipif(sys.platform == "win32", reason="No cyvcf2 on windows") +def test_sgkit_dataset(tmp_path): + import sgkit.io.vcf + + ts = msprime.sim_ancestry( + samples=50, + ploidy=3, + recombination_rate=0.25, + sequence_length=50, + random_seed=100, + ) + ts = msprime.sim_mutations(ts, rate=0.025, model=msprime.BinaryMutationModel()) + with open(tmp_path / "data.vcf", "w") as f: + ts.write_vcf(f) + sgkit.io.vcf.vcf_to_zarr( + tmp_path / "data.vcf", tmp_path / "data.zarr", ploidy=3, max_alt_alleles=1 + ) + samples = tsinfer.SgkitSampleData(tmp_path / "data.zarr") + inf_ts = tsinfer.infer(samples) + assert np.array_equal(ts.genotype_matrix(), inf_ts.genotype_matrix()) diff --git a/tsinfer/formats.py b/tsinfer/formats.py index 7fcc7ceb..85c3c196 100644 --- a/tsinfer/formats.py +++ b/tsinfer/formats.py @@ -28,7 +28,6 @@ import queue import sys import threading -import uuid import warnings import attr @@ -385,7 +384,6 @@ def __init__( self.data = zarr.open_group(store=store, mode="w") self.data.attrs[FORMAT_NAME_KEY] = self.FORMAT_NAME self.data.attrs[FORMAT_VERSION_KEY] = self.FORMAT_VERSION - self.data.attrs["uuid"] = str(uuid.uuid4()) chunks = self._chunk_size provenances_group = self.data.create_group("provenances") @@ -486,8 +484,7 @@ def copy(self, path=None, max_file_size=None): """ Returns a copy of this DataContainer opened in 'edit' mode. If path is specified, this must not be equal to the path of the current - data container. The new container will have a different UUID to the - current. + data container. 
""" if self._mode != self.READ_MODE: raise ValueError("Cannot copy unless in read mode.") @@ -516,8 +513,6 @@ def copy(self, path=None, max_file_size=None): store = other._new_lmdb_store(max_file_size) zarr.copy_store(self.data.store, store) other.data = zarr.group(store) - # Set a new UUID - other.data.attrs["uuid"] = str(uuid.uuid4()) other.data.attrs[FINALISED_KEY] = False other._mode = self.EDIT_MODE return other @@ -663,10 +658,6 @@ def finalised(self): ret = self.data.attrs[FINALISED_KEY] return ret - @property - def uuid(self): - return str(self.data.attrs["uuid"]) - @property def num_provenances(self): return self.provenances_timestamp.shape[0] @@ -693,7 +684,7 @@ def _format_str(self, values): def __eq__(self, other): ret = NotImplemented if isinstance(other, type(self)): - ret = self.uuid == other.uuid and self.data_equal(other) + ret = self.data_equal(other) return ret def __str__(self): @@ -703,7 +694,6 @@ def __str__(self): ("format_name", self.format_name), ("format_version", self.format_version), ("finalised", self.finalised), - ("uuid", self.uuid), ("num_provenances", self.num_provenances), ("provenances/timestamp", zarr_summary(self.provenances_timestamp)), ("provenances/record", zarr_summary(self.provenances_record)), @@ -1320,10 +1310,10 @@ def data_equal(self, other): """ Returns True if all the data attributes of this input file and the specified input file are equal. This compares every attribute except - the UUID and provenance. + the provenance. To compare two :class:`SampleData` instances for exact equality of - all data including UUIDs and provenance data, use ``s1 == s2``. + all data including provenance data, use ``s1 == s2``. :param SampleData other: The other :class:`SampleData` instance to compare with. 
@@ -2100,7 +2090,7 @@ def variants(self, sites=None, recode_ancestral=None): genos = geno_map[genos] yield Variant(site=site, alleles=alleles, genotypes=genos) - def __all_haplotypes(self, sites=None, recode_ancestral=None): + def _all_haplotypes(self, sites=None, recode_ancestral=None): # We iterate over chunks vertically here, and it's not worth complicating # the chunk iterator to handle this. if recode_ancestral is None: @@ -2156,7 +2146,7 @@ def haplotypes(self, samples=None, sites=None, recode_ancestral=None): raise ValueError("Sample index too large.") j = 0 - for index, a in self.__all_haplotypes(sites, recode_ancestral): + for index, a in self._all_haplotypes(sites, recode_ancestral): if j == len(samples): break if index == samples[j]: @@ -2225,6 +2215,257 @@ def populations(self): yield Population(j, metadata=metadata) +class SgkitSampleData(SampleData): + + FORMAT_NAME = "tsinfer-sgkit-sample-data" + FORMAT_VERSION = (0, 1) + + def __init__(self, path): + self.path = path + self.data = zarr.open(path, mode="r") + genotypes_arr = self.data["call_genotype"] + self._num_sites, self._num_individuals, self.ploidy = genotypes_arr.shape + self._num_samples = self._num_individuals * self.ploidy + + assert self.ploidy == self.data["call_genotype"].chunks[2] + + def __metadata_schema_getter(self, zarr_group): + try: + return self.data[zarr_group].attrs["metadata_schema"] + except KeyError: + return {"codec": "json"} + + @property + def format_name(self): + return self.FORMAT_NAME + + @property + def format_version(self): + return self.FORMAT_VERSION + + @property + def finalised(self): + return True + + @property + def sequence_length(self): + try: + return self.data.attrs["sequence_length"] + except KeyError: + return int(np.max(self.data["variant_position"])) + 1 + + @property + def num_sites(self): + return self._num_sites + + @property + def sites_metadata_schema(self): + return self.__metadata_schema_getter("sites") + + @property + def sites_metadata(self): + 
try: + return self.data["sites/metadata"] + except KeyError: + return zarr.array( + [{}] * self.num_sites, object_codec=numcodecs.JSON() + ) + + @property + def sites_time(self): + try: + return self.data["sites/time"] + except KeyError: + return np.full(self.data["variant_position"].shape, tskit.UNKNOWN_TIME) + + @property + def sites_position(self): + return self.data["variant_position"] + + @property + def sites_alleles(self): + return self.data["variant_allele"] + + @property + def sites_ancestral_allele(self): + try: + return self.data["variant_ancestral_allele_index"] + except KeyError: + # Maintains backwards compatibility: in previous tsinfer versions the + # ancestral allele was always the zeroth element in the alleles list + return np.zeros(self.num_sites, dtype=np.int8) + + @property + def sites_genotypes(self): + gt = self.data["call_genotype"] + # This method is only used for test/debug so we retrieve and + # reshape the entire array. + return gt[...].reshape(gt.shape[0], gt.shape[1] * gt.shape[2]) + + @property + def provenances_timestamp(self): + try: + return self.data["provenances_timestamp"] + except KeyError: + return np.array([], dtype=object) + + @property + def provenances_record(self): + try: + return self.data["provenances_record"] + except KeyError: + return np.array([], dtype=object) + + @property + def num_samples(self): + return self._num_samples + + @property + def samples_individual(self): + ret = np.zeros((self.num_samples), dtype=np.int32) + for p in range(self.ploidy): + ret[p :: self.ploidy] = np.arange(self.num_individuals) + return ret + + @property + def metadata_schema(self): + try: + return self.data.attrs["metadata_schema"] + except KeyError: + return None + + @property + def metadata(self): + try: + return self.data.attrs["metadata"] + except KeyError: + return b"" + + @property + def populations_metadata(self): + try: + return self.data["populations/metadata"] + except KeyError: + return np.array([], dtype=object) + + 
@property + def populations_metadata_schema(self): + return self.__metadata_schema_getter("populations") + + @property + def num_individuals(self): + return self._num_individuals + + @property + def individuals_time(self): + try: + return self.data["individuals/time"] + except KeyError: + return np.full(self.num_individuals, tskit.UNKNOWN_TIME) + + @property + def individuals_metadata_schema(self): + return self.__metadata_schema_getter("individuals") + + @property + def individuals_metadata(self): + try: + return self.data["individuals/metadata"] + except KeyError: + return zarr.array( + [{}] * self.num_individuals, object_codec=numcodecs.JSON() + ) + + @property + def individuals_location(self): + try: + return self.data["individuals/location"] + except KeyError: + return zarr.array([[]] * self.num_individuals, dtype=float) + + @property + def individuals_population(self): + try: + return self.data["individuals/population"] + except KeyError: + return np.full((self.num_individuals), tskit.NULL, dtype=np.int32) + + @property + def individuals_flags(self): + try: + return self.data["individuals/flags"] + except KeyError: + return np.full((self.num_individuals), 0, dtype=np.int32) + + def variants(self, sites=None, recode_ancestral=None): + """ + Returns an iterator over the :class:`Variant` objects. This is equivalent to + the :meth:`tskit.TreeSequence.variants` iterator. If recode_ancestral is + ``True``, the ``.alleles`` attribute of each variant is guaranteed to return + the alleles in an order such that the ancestral state is the first item + in the list. In this case, ``variant.alleles`` may list the alleles in a + different order from the input order as listed in ``variant.site.alleles``, + and the values in genotypes array will be recoded so that the ancestral + state will have a genotype of 0. If the ancestral state is unknown, the + original input order is kept. + + :param array sites: A numpy array of ascending site ids for which to return + data. 
If None (default) return all sites. + :param bool recode_ancestral: If True, recode genotypes at sites where the + ancestral state is known such that the ancestral state is coded as 0, + as described above. Otherwise return genotypes in the input allele encoding. + Default: ``None`` treated as ``False``. + :return: An iterator over the variants in the sample data file. + :rtype: iter(:class:`Variant`) + """ + if recode_ancestral is None: + recode_ancestral = False + all_genotypes = chunk_iterator(self.data["call_genotype"], indexes=sites) + assert MISSING_DATA < 0 # required for geno_map to remap MISSING_DATA + for genos, site in zip(all_genotypes, self.sites(ids=sites)): + # We have an extra ploidy dimension when coming from sgkit + genos = genos.reshape(self.num_samples) + aa = site.ancestral_allele + alleles = site.alleles + if aa != MISSING_DATA and aa > 0 and recode_ancestral: + # Need to recode this site + alleles = site.reorder_alleles() + # re-map the genotypes + geno_map = np.arange(len(alleles) - MISSING_DATA, dtype=genos.dtype) + geno_map[MISSING_DATA] = MISSING_DATA + geno_map[aa] = 0 + geno_map[0:aa] += 1 + genos = geno_map[genos] + yield Variant(site=site, alleles=alleles, genotypes=genos) + + def _all_haplotypes(self, sites=None, recode_ancestral=None): + # We iterate over chunks vertically here, and it's not worth complicating + # the chunk iterator to handle this. 
+ if recode_ancestral is None: + recode_ancestral = False + aa_index = self.sites_ancestral_allele[:] + # If ancestral allele is missing, keep the order unchanged (aa_index of zero) + aa_index[aa_index == MISSING_DATA] = 0 + gt = self.data["call_genotype"] + chunk_size = gt.chunks[1] + for j in range(self.num_individuals): + if j % chunk_size == 0: + chunk = gt[:, j : j + chunk_size, :] + indiv_gt = chunk[:, j % chunk_size, :] + for k in range(self.ploidy): + a = indiv_gt[:, k].T + if recode_ancestral: + # Remap the genotypes at all sites, depending on the aa_index + a = np.where( + a == aa_index, + 0, + np.where( + np.logical_and(a != MISSING_DATA, a < aa_index), a + 1, a + ), + ) + yield (j * self.ploidy) + k, a if sites is None else a[sites] + + @attr.s(order=False, eq=False) class Ancestor: """ @@ -2291,7 +2532,6 @@ def __init__(self, sample_data, **kwargs): super().__init__(**kwargs) sample_data._check_finalised() self.sample_data = sample_data - self.data.attrs["sample_data_uuid"] = sample_data.uuid if self.sample_data.sequence_length == 0: raise ValueError("Bad samples file: sequence_length cannot be zero") self.data.attrs["sequence_length"] = self.sample_data.sequence_length @@ -2371,7 +2611,6 @@ def summary(self): def __str__(self): values = [ ("sequence_length", self.sequence_length), - ("sample_data_uuid", self.sample_data_uuid), ("num_ancestors", self.num_ancestors), ("num_sites", self.num_sites), ("sites/position", zarr_summary(self.sites_position)), @@ -2386,12 +2625,10 @@ def __str__(self): def data_equal(self, other): """ Returns True if all the data attributes of this input file and the - specified input file are equal. This compares every attribute except - the UUID. + specified input file are equal. This compares every attribute. 
""" return ( self.sequence_length == other.sequence_length - and self.sample_data_uuid == other.sample_data_uuid and self.format_name == other.format_name and self.format_version == other.format_version and self.num_ancestors == other.num_ancestors @@ -2413,10 +2650,6 @@ def sequence_length(self): """ return self.data.attrs["sequence_length"] - @property - def sample_data_uuid(self): - return self.data.attrs["sample_data_uuid"] - @property def num_ancestors(self): return self.ancestors_start.shape[0] @@ -2531,14 +2764,7 @@ def insert_proxy_samples( (i.e. breaking the infinite sites assumption), allowing them to possess derived alleles at sites where there are no pre-existing mutations in older ancestors. - :param bool require_same_sample_data: If ``True`` (default) then the - the ``sample_data`` parameter must point to the same :class:`.SampleData` - instance as that used to generate the current ancestors. If ``False``, - this requirement is not enforced, and it is the user's responsibility - to ensure that the encoding of alleles in ``sample_data`` matches the - encoding in the current :class:`AncestorData` instance (i.e. that in the - original :class:`.SampleData` instance on which the current ancestors - are based). + :param bool require_same_sample_data: **Deprecated** Has no effect. :param \\**kwargs: Further arguments passed to the constructor when creating the new :class:`AncestorData` instance which will be returned. 
@@ -2547,11 +2773,6 @@ def insert_proxy_samples( """ self._check_finalised() sample_data._check_finalised() - if require_same_sample_data: - if sample_data.uuid != self.sample_data_uuid: - raise ValueError( - "sample_data differs from that used to build the initial ancestors" - ) if self.sequence_length != sample_data.sequence_length: raise ValueError("sample_data does not have the correct sequence length") used_sites = np.isin(sample_data.sites_position[:], self.sites_position[:]) @@ -2646,8 +2867,6 @@ def insert_proxy_samples( other.clear_provenances() for timestamp, record in self.provenances(): other.add_provenance(timestamp, record) - if sample_data.uuid != self.sample_data_uuid: - pass # TODO: if sample files don't match, we need extra provenance info other.record_provenance(command="insert_proxy_samples", **kwargs) assert other.num_ancestors == self.num_ancestors + len(sample_ids) diff --git a/tsinfer/inference.py b/tsinfer/inference.py index a55edec1..26301c02 100644 --- a/tsinfer/inference.py +++ b/tsinfer/inference.py @@ -485,9 +485,7 @@ def match_ancestors( for timestamp, record in ancestor_data.provenances(): tables.provenances.add_row(timestamp=timestamp, record=json.dumps(record)) if record_provenance: - record = provenance.get_provenance_dict( - command="match_ancestors", source={"uuid": ancestor_data.uuid} - ) + record = provenance.get_provenance_dict(command="match_ancestors") tables.provenances.add_row(record=json.dumps(record)) ts = tables.tree_sequence() return ts