Skip to content

Commit

Permalink
Allow empty ancestral alleles
Browse files Browse the repository at this point in the history
Fixes #884
  • Loading branch information
hyanwong committed Jan 19, 2024
1 parent 7db1d38 commit a6ef089
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 3 deletions.
29 changes: 28 additions & 1 deletion tests/test_sgkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,9 +615,36 @@ def test_empty_alleles_not_at_end(self, tmp_path):
)
sgkit.save_dataset(ds, path)
samples = tsinfer.SgkitSampleData(path)
with pytest.raises(ValueError, match="Empty alleles must be at the end"):
with pytest.raises(ValueError, match="empty alleles must be at the end"):
tsinfer.infer(samples)

def test_empty_ancestral_alleles(self, tmp_path):
    """
    Check how sites with empty or unmatched ancestral alleles are reported.

    Three haploid sites are written to a zarr dataset and read back through
    ``tsinfer.SgkitSampleData``, iterating with ``recode_ancestral=True``:

    * site 0: ancestral allele is the empty string, which is also the first
      entry in the site's allele list;
    * site 1: ancestral allele is "A", the first listed allele;
    * site 2: ancestral allele is "*", which is not in the site's allele
      list, so the ancestral state is expected to be reported as ``None``.
    """
    path = tmp_path / "data.zarr"
    ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, n_ploidy=1)
    ds["variant_allele"] = (
        ds["variant_allele"].dims,
        np.array(
            [["", "A", "C", ""], ["A", "C", "", ""], ["A", "C", "", ""]], dtype="S1"
        ),
    )
    ds["variant_ancestral_allele"] = (
        ["variants"],
        np.array(["", "A", "*"], dtype="S1"),
    )
    sgkit.save_dataset(ds, path)
    samples = tsinfer.SgkitSampleData(path)
    # Record which sites were actually checked, so that a branch which can
    # never match (or an empty iterator) fails the test instead of passing
    # silently.
    checked_sites = set()
    for v in samples.variants(recode_ancestral=True):
        if v.site.id == 0:
            # Alleles may come back as bytes or str, so accept either form.
            assert v.site.ancestral_state in (b"", "")
            assert len(v.alleles) == 3
            assert v.alleles[0] in (b"", "")
        elif v.site.id == 1:
            assert v.site.ancestral_state in (b"A", "A")
            assert len(v.alleles) == 2
        elif v.site.id == 2:
            # With only 3 variants the last site has id 2; the previous
            # check against id 3 could never be reached.
            assert v.site.ancestral_state is None
            assert len(v.alleles) == 2
        checked_sites.add(int(v.site.id))
    assert checked_sites == {0, 1, 2}


class TestSgkitMatchSamplesToDisk:
@pytest.mark.skipif(sys.platform == "win32", reason="No cyvcf2 on windows")
Expand Down
13 changes: 11 additions & 2 deletions tsinfer/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2584,9 +2584,12 @@ def variants(self, sites=None, recode_ancestral=None):
genos = genos.reshape(self.num_samples)
aa = site.ancestral_allele
alleles = site.alleles
first_empty = False
if aa != MISSING_DATA and aa > 0 and recode_ancestral:
# Need to recode this site
alleles = site.reorder_alleles()
if alleles[0] == b"" or alleles[0] == "":
first_empty = True # Needed for checking later
# re-map the genotypes
geno_map = np.arange(len(alleles) - MISSING_DATA, dtype=genos.dtype)
geno_map[MISSING_DATA] = MISSING_DATA
Expand All @@ -2598,10 +2601,16 @@ def variants(self, sites=None, recode_ancestral=None):
# alleles are at the end of the list, so check this.
non_empty_alleles = []
empty_seen = False
for allele in alleles:
for i, allele in enumerate(alleles):
if allele != b"" and allele != "":
if empty_seen:
raise ValueError("Empty alleles must be at the end")
raise ValueError(
f"Site {site.id} (pos {site.position}): empty alleles "
f"must be at the end, but alleles are {alleles}"
)
non_empty_alleles.append(allele)
elif i == 0 and first_empty:
# Single empty allele allowed if it is the starting ancestral allele
non_empty_alleles.append(allele)
else:
empty_seen = True
Expand Down

0 comments on commit a6ef089

Please sign in to comment.