Skip to content

Commit 78d15c1

Browse files
committed
Add a sample_id column to ancestors
Starts to address #604. Also changes the sampledata format
1 parent 69bb8c7 commit 78d15c1

File tree

3 files changed

+90
-16
lines changed

3 files changed

+90
-16
lines changed

docs/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@
195195
intersphinx_mapping = {
196196
"https://docs.python.org/3/": None,
197197
"https://tskit.dev/tutorials": None,
198+
"https://numpy.org/doc/stable/": None,
198199
"https://tskit.dev/tskit/docs/stable": None,
199200
"https://tskit.dev/msprime/docs/stable": None,
200201
"https://numcodecs.readthedocs.io/en/stable/": None,

tests/test_formats.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2193,13 +2193,42 @@ def test_insert_proxy_1_sample(self):
21932193
sample_data, sample_ids=[i]
21942194
)
21952195
assert ancestors.num_ancestors + 1 == ancestors_extra.num_ancestors
2196-
inserted = -1
2196+
inserted = -1 # inserted one should always be the last
21972197
self.assert_ancestor_full_span(ancestors_extra, [inserted])
21982198
assert np.array_equal(
21992199
ancestors_extra.ancestors_haplotype[inserted],
22002200
sample_data.sites_genotypes[:, i][used_sites],
22012201
)
22022202

2203+
def test_insert_proxy_sample_ids(self):
2204+
ids = [1, 4]
2205+
sd, _ = self.get_example_data(10, 10, 40)
2206+
ancestors = tsinfer.generate_ancestors(sd).insert_proxy_samples(
2207+
sd, sample_ids=ids
2208+
)
2209+
inference_sites = np.isin(sd.sites_position[:], ancestors.sites_position[:])
2210+
anc_sample_ids = ancestors.ancestors_sample_id[:]
2211+
assert np.sum(anc_sample_ids != tskit.NULL) == len(ids)
2212+
for sample_id in ids:
2213+
assert np.sum(anc_sample_ids == sample_id) == 1
2214+
anc_sample = np.where(anc_sample_ids == sample_id)[0][0]
2215+
assert ancestors.ancestors_start[anc_sample] == 0
2216+
assert ancestors.ancestors_end[anc_sample] == ancestors.num_sites
2217+
assert len(ancestors.ancestors_focal_sites[anc_sample]) == 0
2218+
2219+
haplotype = next(sd.haplotypes([sample_id], sites=inference_sites))[1]
2220+
assert np.all(ancestors.ancestors_haplotype[anc_sample] == haplotype)
2221+
2222+
def test_insert_proxy_different_sample_data(self):
2223+
ids = [1, 4]
2224+
sd, _ = self.get_example_data(10, 10, 40)
2225+
ancestors = tsinfer.generate_ancestors(sd)
2226+
sd_copy, _ = self.get_example_data(10, 10, num_ancestors=40)
2227+
ancestors_extra = ancestors.insert_proxy_samples(
2228+
sd_copy, sample_ids=ids, require_same_sample_data=False
2229+
)
2230+
assert np.all(ancestors_extra.ancestors_sample_id[:] == tskit.NULL)
2231+
22032232
def test_insert_proxy_sample_provenance(self):
22042233
sample_data, _ = self.get_example_data(10, 10, 40)
22052234
ancestors = tsinfer.generate_ancestors(sample_data)
@@ -2242,6 +2271,8 @@ def test_insert_proxy_time_historical_samples(self):
22422271
assert np.array_equal(
22432272
ancestors_extra.ancestors_time[-1], historical_sample_time + epsilon
22442273
)
2274+
assert np.sum(ancestors_extra.ancestors_sample_id[:] != tskit.NULL) == 1
2275+
assert ancestors_extra.ancestors_sample_id[-1] == 9
22452276

22462277
# Test 2 proxies, one historical, specified in different ways / orders
22472278
s_ids = np.array([9, 0])
@@ -2263,6 +2294,9 @@ def test_insert_proxy_time_historical_samples(self):
22632294
ancestors_extra.ancestors_haplotype[-1], G[:, 0][used_sites]
22642295
)
22652296
assert np.array_equal(ancestors_extra.ancestors_time[-1], epsilon)
2297+
assert np.sum(ancestors_extra.ancestors_sample_id[:] != tskit.NULL) == 2
2298+
assert ancestors_extra.ancestors_sample_id[-2] == 9
2299+
assert ancestors_extra.ancestors_sample_id[-1] == 0
22662300

22672301
def test_insert_proxy_sample_epsilon(self):
22682302
sample_data, _ = self.get_example_data(10, 10, 40)

tsinfer/formats.py

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2031,6 +2031,10 @@ def haplotypes(self, samples=None, sites=None):
20312031
``None``, return haplotypes for all sample nodes, otherwise this may be a
20322032
numpy array (or array-like) object (converted to dtype=np.int32).
20332033
:param array sites: A numpy array of sites to use.
2034+
2035+
2036+
:return: An iterator returning sucessive instances of (sample_id, haplotype).
2037+
:rtype: iter(int, numpy.ndarray(dtype=int8))
20342038
"""
20352039
if samples is None:
20362040
samples = np.arange(self.num_samples)
@@ -2123,6 +2127,7 @@ class Ancestor:
21232127
time = attr.ib()
21242128
focal_sites = attr.ib()
21252129
haplotype = attr.ib()
2130+
sample_id = attr.ib()
21262131

21272132
def __eq__(self, other):
21282133
return (
@@ -2170,7 +2175,7 @@ class AncestorData(DataContainer):
21702175
"""
21712176

21722177
FORMAT_NAME = "tsinfer-ancestor-data"
2173-
FORMAT_VERSION = (3, 0)
2178+
FORMAT_VERSION = (3, 1)
21742179

21752180
def __init__(self, sample_data, **kwargs):
21762181
super().__init__(**kwargs)
@@ -2229,6 +2234,13 @@ def __init__(self, sample_data, **kwargs):
22292234
dtype="array:i1",
22302235
compressor=self._compressor,
22312236
)
2237+
self.data.create_dataset(
2238+
"ancestors/sample_id",
2239+
shape=(0,),
2240+
chunks=chunks,
2241+
compressor=self._compressor,
2242+
dtype=np.int32,
2243+
)
22322244

22332245
self._alloc_ancestor_writer()
22342246

@@ -2244,6 +2256,7 @@ def _alloc_ancestor_writer(self):
22442256
"time": self.ancestors_time,
22452257
"focal_sites": self.ancestors_focal_sites,
22462258
"haplotype": self.ancestors_haplotype,
2259+
"sample_id": self.ancestors_sample_id,
22472260
},
22482261
num_threads=self._num_flush_threads,
22492262
)
@@ -2265,6 +2278,7 @@ def __str__(self):
22652278
("ancestors/time", zarr_summary(self.ancestors_time)),
22662279
("ancestors/focal_sites", zarr_summary(self.ancestors_focal_sites)),
22672280
("ancestors/haplotype", zarr_summary(self.ancestors_haplotype)),
2281+
("ancestors/sample_id", zarr_summary(self.ancestors_sample_id)),
22682282
]
22692283
return super().__str__() + self._format_str(values)
22702284

@@ -2289,6 +2303,9 @@ def data_equal(self, other):
22892303
self.ancestors_focal_sites[:], other.ancestors_focal_sites[:]
22902304
)
22912305
and np_obj_equal(self.ancestors_haplotype[:], other.ancestors_haplotype[:])
2306+
and np.array_equal(
2307+
self.ancestors_sample_id[:], other.ancestors_sample_id[:]
2308+
)
22922309
)
22932310

22942311
@property
@@ -2340,6 +2357,10 @@ def ancestors_focal_sites(self):
23402357
def ancestors_haplotype(self):
23412358
return self.data["ancestors/haplotype"]
23422359

2360+
@property
2361+
def ancestors_sample_id(self):
2362+
return self.data["ancestors/sample_id"]
2363+
23432364
@property
23442365
def ancestors_length(self):
23452366
"""
@@ -2358,6 +2379,7 @@ def insert_proxy_samples(
23582379
*,
23592380
sample_ids=None,
23602381
epsilon=None,
2382+
map_ancestors=False,
23612383
allow_mutation=False,
23622384
require_same_sample_data=True,
23632385
**kwargs,
@@ -2370,7 +2392,8 @@ def insert_proxy_samples(
23702392
23712393
A *proxy sample ancestor* is an ancestor based upon a known sample. At
23722394
sites used in the full inference process, the haplotype of this ancestor
2373-
is identical to that of the sample on which it is based. The time of the
2395+
is identical to that of the sample on which it is based, and the
2396+
The time of the
23742397
ancestor is taken to be a fraction ``epsilon`` older than the sample on
23752398
which it is based.
23762399
@@ -2384,11 +2407,11 @@ def insert_proxy_samples(
23842407
23852408
.. note::
23862409
2387-
The proxy sample ancestors inserted here will correspond to extra nodes
2388-
in the inferred tree sequence. At sites which are not used in the full
2410+
The proxy sample ancestors inserted here will end up as extra nodes
2411+
in the inferred tree sequence, but at sites which are not used in the full
23892412
inference process (e.g. sites unique to a single historical sample),
2390-
these proxy sample ancestor nodes may have a different genotype from
2391-
their corresponding sample.
2413+
it is possible for these proxy sample ancestor nodes to have a different
2414+
genotype from their corresponding sample.
23922415
23932416
:param SampleData sample_data: The :class:`.SampleData` instance
23942417
from which to select the samples used to create extra ancestors.
@@ -2423,7 +2446,8 @@ def insert_proxy_samples(
24232446
to ensure that the encoding of alleles in ``sample_data`` matches the
24242447
encoding in the current :class:`AncestorData` instance (i.e. that in the
24252448
original :class:`.SampleData` instance on which the current ancestors
2426-
are based).
2449+
are based). Note that in this case, the sample_id is not recorded in the
2450+
returned object.
24272451
:param \\**kwargs: Further arguments passed to the constructor when creating
24282452
the new :class:`AncestorData` instance which will be returned.
24292453
@@ -2521,7 +2545,11 @@ def insert_proxy_samples(
25212545
time=proxy_time,
25222546
focal_sites=[],
25232547
haplotype=haplotype,
2548+
sample_id=sample_id
2549+
if sample_data.uuid == self.sample_data_uuid
2550+
else tskit.NULL,
25242551
)
2552+
25252553
# Add any ancestors remaining in the current instance
25262554
while ancestor is not None:
25272555
other.add_ancestor(**attr.asdict(ancestor, filter=exclude_id))
@@ -2603,7 +2631,6 @@ def truncate_ancestors(
26032631
start = self.ancestors_start[:]
26042632
end = self.ancestors_end[:]
26052633
time = self.ancestors_time[:]
2606-
focal_sites = self.ancestors_focal_sites[:]
26072634
haplotypes = self.ancestors_haplotype[:]
26082635
if upper_time_bound > np.max(time) or lower_time_bound > np.max(time):
26092636
raise ValueError("Time bounds cannot be greater than older ancestor")
@@ -2641,16 +2668,12 @@ def truncate_ancestors(
26412668
)
26422669
start[anc.id] = insert_pos_start
26432670
end[anc.id] = insert_pos_end
2644-
time[anc.id] = anc.time
2645-
focal_sites[anc.id] = anc.focal_sites
26462671
haplotypes[anc.id] = anc.haplotype[
26472672
insert_pos_start - anc.start : insert_pos_end - anc.start
26482673
]
26492674
# TODO - record truncation in ancestors' metadata when supported
26502675
truncated.ancestors_start[:] = start
26512676
truncated.ancestors_end[:] = end
2652-
truncated.ancestors_time[:] = time
2653-
truncated.ancestors_focal_sites[:] = focal_sites
26542677
truncated.ancestors_haplotype[:] = haplotypes
26552678
truncated.record_provenance(command="truncate_ancestors")
26562679
truncated.finalise()
@@ -2671,6 +2694,12 @@ def set_inference_sites(self, site_ids):
26712694
sites in the sample data file, and the IDs must be in increasing order.
26722695
26732696
This must be called before the first call to :meth:`.add_ancestor`.
2697+
2698+
.. note::
2699+
To obtain a list of which sites in a sample data or a tree sequence have
2700+
been placed into the ancestors file for use in inference, you can apply
2701+
:func:`numpy.isin` to the list of positions, e.g.
2702+
``np.isin(sample_data.sites_position[:], ancestors.sites_position[:])``
26742703
"""
26752704
self._check_build_mode()
26762705
position = self.sample_data.sites_position[:][site_ids]
@@ -2679,12 +2708,18 @@ def set_inference_sites(self, site_ids):
26792708
array[:] = position
26802709
self._num_alleles = self.sample_data.num_alleles(site_ids)
26812710

2682-
def add_ancestor(self, start, end, time, focal_sites, haplotype):
2711+
def add_ancestor(
2712+
self, start, end, time, focal_sites, haplotype, sample_id=tskit.NULL
2713+
):
26832714
"""
26842715
Adds an ancestor with the specified haplotype, with ancestral material over the
26852716
interval [start:end], that is associated with the specified timepoint and has new
2686-
mutations at the specified list of focal sites. Ancestors should be added in time
2687-
order, with the oldest first. The id of the added ancestor is returned.
2717+
mutations at the specified list of focal sites. If this ancestor is based on a
2718+
specific sample from the associated sample_data file (i.e. a historical sample)
2719+
then the ``sample_id`` in the sample data file can also be passed as a parameter.
2720+
2721+
The Ancestors should be added in time order, with the oldest first. The id of
2722+
the added ancestor is returned.
26882723
"""
26892724
self._check_build_mode()
26902725
haplotype = tskit.util.safe_np_int_cast(haplotype, dtype=np.int8, copy=True)
@@ -2714,6 +2749,7 @@ def add_ancestor(self, start, end, time, focal_sites, haplotype):
27142749
time=time,
27152750
focal_sites=focal_sites,
27162751
haplotype=haplotype,
2752+
sample_id=sample_id,
27172753
)
27182754

27192755
def finalise(self):
@@ -2739,6 +2775,7 @@ def ancestor(self, id_):
27392775
time=self.ancestors_time[id_],
27402776
focal_sites=self.ancestors_focal_sites[id_],
27412777
haplotype=self.ancestors_haplotype[id_],
2778+
sample_id=self.ancestors_sample_id[id_],
27422779
)
27432780

27442781
def ancestors(self):
@@ -2750,6 +2787,7 @@ def ancestors(self):
27502787
end = self.ancestors_end[:]
27512788
time = self.ancestors_time[:]
27522789
focal_sites = self.ancestors_focal_sites[:]
2790+
sample_id = self.ancestors_sample_id[:]
27532791
for j, h in enumerate(chunk_iterator(self.ancestors_haplotype)):
27542792
yield Ancestor(
27552793
id=j,
@@ -2758,6 +2796,7 @@ def ancestors(self):
27582796
time=time[j],
27592797
focal_sites=focal_sites[j],
27602798
haplotype=h,
2799+
sample_id=sample_id[j],
27612800
)
27622801

27632802

0 commit comments

Comments
 (0)