@@ -2031,6 +2031,10 @@ def haplotypes(self, samples=None, sites=None):
2031
2031
``None``, return haplotypes for all sample nodes, otherwise this may be a
2032
2032
numpy array (or array-like) object (converted to dtype=np.int32).
2033
2033
:param array sites: A numpy array of sites to use.
2034
+
2035
+
2036
+ :return: An iterator returning successive instances of (sample_id, haplotype).
2037
+ :rtype: iter(int, numpy.ndarray(dtype=int8))
2034
2038
"""
2035
2039
if samples is None :
2036
2040
samples = np .arange (self .num_samples )
@@ -2123,6 +2127,7 @@ class Ancestor:
2123
2127
time = attr .ib ()
2124
2128
focal_sites = attr .ib ()
2125
2129
haplotype = attr .ib ()
2130
+ sample_id = attr .ib ()
2126
2131
2127
2132
def __eq__ (self , other ):
2128
2133
return (
@@ -2170,7 +2175,7 @@ class AncestorData(DataContainer):
2170
2175
"""
2171
2176
2172
2177
FORMAT_NAME = "tsinfer-ancestor-data"
2173
- FORMAT_VERSION = (3 , 0 )
2178
+ FORMAT_VERSION = (3 , 1 )
2174
2179
2175
2180
def __init__ (self , sample_data , ** kwargs ):
2176
2181
super ().__init__ (** kwargs )
@@ -2229,6 +2234,13 @@ def __init__(self, sample_data, **kwargs):
2229
2234
dtype = "array:i1" ,
2230
2235
compressor = self ._compressor ,
2231
2236
)
2237
+ self .data .create_dataset (
2238
+ "ancestors/sample_id" ,
2239
+ shape = (0 ,),
2240
+ chunks = chunks ,
2241
+ compressor = self ._compressor ,
2242
+ dtype = np .int32 ,
2243
+ )
2232
2244
2233
2245
self ._alloc_ancestor_writer ()
2234
2246
@@ -2244,6 +2256,7 @@ def _alloc_ancestor_writer(self):
2244
2256
"time" : self .ancestors_time ,
2245
2257
"focal_sites" : self .ancestors_focal_sites ,
2246
2258
"haplotype" : self .ancestors_haplotype ,
2259
+ "sample_id" : self .ancestors_sample_id ,
2247
2260
},
2248
2261
num_threads = self ._num_flush_threads ,
2249
2262
)
@@ -2265,6 +2278,7 @@ def __str__(self):
2265
2278
("ancestors/time" , zarr_summary (self .ancestors_time )),
2266
2279
("ancestors/focal_sites" , zarr_summary (self .ancestors_focal_sites )),
2267
2280
("ancestors/haplotype" , zarr_summary (self .ancestors_haplotype )),
2281
+ ("ancestors/sample_id" , zarr_summary (self .ancestors_sample_id )),
2268
2282
]
2269
2283
return super ().__str__ () + self ._format_str (values )
2270
2284
@@ -2289,6 +2303,9 @@ def data_equal(self, other):
2289
2303
self .ancestors_focal_sites [:], other .ancestors_focal_sites [:]
2290
2304
)
2291
2305
and np_obj_equal (self .ancestors_haplotype [:], other .ancestors_haplotype [:])
2306
+ and np .array_equal (
2307
+ self .ancestors_sample_id [:], other .ancestors_sample_id [:]
2308
+ )
2292
2309
)
2293
2310
2294
2311
@property
@@ -2340,6 +2357,10 @@ def ancestors_focal_sites(self):
2340
2357
def ancestors_haplotype (self ):
2341
2358
return self .data ["ancestors/haplotype" ]
2342
2359
2360
+ @property
2361
+ def ancestors_sample_id (self ):
2362
+ return self .data ["ancestors/sample_id" ]
2363
+
2343
2364
@property
2344
2365
def ancestors_length (self ):
2345
2366
"""
@@ -2358,6 +2379,7 @@ def insert_proxy_samples(
2358
2379
* ,
2359
2380
sample_ids = None ,
2360
2381
epsilon = None ,
2382
+ map_ancestors = False ,
2361
2383
allow_mutation = False ,
2362
2384
require_same_sample_data = True ,
2363
2385
** kwargs ,
@@ -2370,7 +2392,8 @@ def insert_proxy_samples(
2370
2392
2371
2393
A *proxy sample ancestor* is an ancestor based upon a known sample. At
2372
2394
sites used in the full inference process, the haplotype of this ancestor
2373
- is identical to that of the sample on which it is based. The time of the
2395
+ is identical to that of the sample on which it is based.
2396
+ The time of the
2374
2397
ancestor is taken to be a fraction ``epsilon`` older than the sample on
2375
2398
which it is based.
2376
2399
@@ -2384,11 +2407,11 @@ def insert_proxy_samples(
2384
2407
2385
2408
.. note::
2386
2409
2387
- The proxy sample ancestors inserted here will correspond to extra nodes
2388
- in the inferred tree sequence. At sites which are not used in the full
2410
+ The proxy sample ancestors inserted here will end up as extra nodes
2411
+ in the inferred tree sequence, but at sites which are not used in the full
2389
2412
inference process (e.g. sites unique to a single historical sample),
2390
- these proxy sample ancestor nodes may have a different genotype from
2391
- their corresponding sample.
2413
+ it is possible for these proxy sample ancestor nodes to have a different
2414
+ genotype from their corresponding sample.
2392
2415
2393
2416
:param SampleData sample_data: The :class:`.SampleData` instance
2394
2417
from which to select the samples used to create extra ancestors.
@@ -2423,7 +2446,8 @@ def insert_proxy_samples(
2423
2446
to ensure that the encoding of alleles in ``sample_data`` matches the
2424
2447
encoding in the current :class:`AncestorData` instance (i.e. that in the
2425
2448
original :class:`.SampleData` instance on which the current ancestors
2426
- are based).
2449
+ are based). Note that in this case, the sample_id is not recorded in the
2450
+ returned object.
2427
2451
:param \\ **kwargs: Further arguments passed to the constructor when creating
2428
2452
the new :class:`AncestorData` instance which will be returned.
2429
2453
@@ -2521,7 +2545,11 @@ def insert_proxy_samples(
2521
2545
time = proxy_time ,
2522
2546
focal_sites = [],
2523
2547
haplotype = haplotype ,
2548
+ sample_id = sample_id
2549
+ if sample_data .uuid == self .sample_data_uuid
2550
+ else tskit .NULL ,
2524
2551
)
2552
+
2525
2553
# Add any ancestors remaining in the current instance
2526
2554
while ancestor is not None :
2527
2555
other .add_ancestor (** attr .asdict (ancestor , filter = exclude_id ))
@@ -2603,7 +2631,6 @@ def truncate_ancestors(
2603
2631
start = self .ancestors_start [:]
2604
2632
end = self .ancestors_end [:]
2605
2633
time = self .ancestors_time [:]
2606
- focal_sites = self .ancestors_focal_sites [:]
2607
2634
haplotypes = self .ancestors_haplotype [:]
2608
2635
if upper_time_bound > np .max (time ) or lower_time_bound > np .max (time ):
2609
2636
raise ValueError ("Time bounds cannot be greater than older ancestor" )
@@ -2641,16 +2668,12 @@ def truncate_ancestors(
2641
2668
)
2642
2669
start [anc .id ] = insert_pos_start
2643
2670
end [anc .id ] = insert_pos_end
2644
- time [anc .id ] = anc .time
2645
- focal_sites [anc .id ] = anc .focal_sites
2646
2671
haplotypes [anc .id ] = anc .haplotype [
2647
2672
insert_pos_start - anc .start : insert_pos_end - anc .start
2648
2673
]
2649
2674
# TODO - record truncation in ancestors' metadata when supported
2650
2675
truncated .ancestors_start [:] = start
2651
2676
truncated .ancestors_end [:] = end
2652
- truncated .ancestors_time [:] = time
2653
- truncated .ancestors_focal_sites [:] = focal_sites
2654
2677
truncated .ancestors_haplotype [:] = haplotypes
2655
2678
truncated .record_provenance (command = "truncate_ancestors" )
2656
2679
truncated .finalise ()
@@ -2671,6 +2694,12 @@ def set_inference_sites(self, site_ids):
2671
2694
sites in the sample data file, and the IDs must be in increasing order.
2672
2695
2673
2696
This must be called before the first call to :meth:`.add_ancestor`.
2697
+
2698
+ .. note::
2699
+ To obtain a list of which sites in a sample data or a tree sequence have
2700
+ been placed into the ancestors file for use in inference, you can apply
2701
+ :func:`numpy.isin` to the list of positions, e.g.
2702
+ ``np.isin(sample_data.sites_position[:], ancestors.sites_position[:])``
2674
2703
"""
2675
2704
self ._check_build_mode ()
2676
2705
position = self .sample_data .sites_position [:][site_ids ]
@@ -2679,12 +2708,18 @@ def set_inference_sites(self, site_ids):
2679
2708
array [:] = position
2680
2709
self ._num_alleles = self .sample_data .num_alleles (site_ids )
2681
2710
2682
- def add_ancestor (self , start , end , time , focal_sites , haplotype ):
2711
+ def add_ancestor (
2712
+ self , start , end , time , focal_sites , haplotype , sample_id = tskit .NULL
2713
+ ):
2683
2714
"""
2684
2715
Adds an ancestor with the specified haplotype, with ancestral material over the
2685
2716
interval [start:end], that is associated with the specified timepoint and has new
2686
- mutations at the specified list of focal sites. Ancestors should be added in time
2687
- order, with the oldest first. The id of the added ancestor is returned.
2717
+ mutations at the specified list of focal sites. If this ancestor is based on a
2718
+ specific sample from the associated sample_data file (i.e. a historical sample)
2719
+ then the ``sample_id`` in the sample data file can also be passed as a parameter.
2720
+
2721
+ Ancestors should be added in time order, with the oldest first. The id of
2722
+ the added ancestor is returned.
2688
2723
"""
2689
2724
self ._check_build_mode ()
2690
2725
haplotype = tskit .util .safe_np_int_cast (haplotype , dtype = np .int8 , copy = True )
@@ -2714,6 +2749,7 @@ def add_ancestor(self, start, end, time, focal_sites, haplotype):
2714
2749
time = time ,
2715
2750
focal_sites = focal_sites ,
2716
2751
haplotype = haplotype ,
2752
+ sample_id = sample_id ,
2717
2753
)
2718
2754
2719
2755
def finalise (self ):
@@ -2739,6 +2775,7 @@ def ancestor(self, id_):
2739
2775
time = self .ancestors_time [id_ ],
2740
2776
focal_sites = self .ancestors_focal_sites [id_ ],
2741
2777
haplotype = self .ancestors_haplotype [id_ ],
2778
+ sample_id = self .ancestors_sample_id [id_ ],
2742
2779
)
2743
2780
2744
2781
def ancestors (self ):
@@ -2750,6 +2787,7 @@ def ancestors(self):
2750
2787
end = self .ancestors_end [:]
2751
2788
time = self .ancestors_time [:]
2752
2789
focal_sites = self .ancestors_focal_sites [:]
2790
+ sample_id = self .ancestors_sample_id [:]
2753
2791
for j , h in enumerate (chunk_iterator (self .ancestors_haplotype )):
2754
2792
yield Ancestor (
2755
2793
id = j ,
@@ -2758,6 +2796,7 @@ def ancestors(self):
2758
2796
time = time [j ],
2759
2797
focal_sites = focal_sites [j ],
2760
2798
haplotype = h ,
2799
+ sample_id = sample_id [j ],
2761
2800
)
2762
2801
2763
2802
0 commit comments