diff --git a/docs/inference.rst b/docs/inference.rst index e0e3828f..0b8adcc3 100644 --- a/docs/inference.rst +++ b/docs/inference.rst @@ -126,12 +126,12 @@ file format, we provide a simple :ref:`Python API ` to allow the user to efficiently construct it from their own data. An example of how to use this API is given in the :ref:`sec_tutorial`. -We do not provide an automatic means of important data from a VCF -intentionally, as we believe that this would be extremely difficult to do. +We do not provide an automatic means of importing data from VCF (or any +other format) intentionally, as we believe that this would be extremely difficult to do. As there is no universally accepted way of encoding ancestral state information in VCF, in practise the user would most often have to write -a new VCF file with ancestral state and metadata information in the form -that we require. Thus, it is more efficient to skip this intermediate +a new VCF file with ancestral state and metadata information in a specific +form that we would require. Thus, it is more efficient to skip this intermediate step and to directly produce a :ref:`format ` that is both compact and very efficient to process. diff --git a/docs/simulation-example.py b/docs/simulation-example.py index 4cb3bb05..6e48dc1a 100644 --- a/docs/simulation-example.py +++ b/docs/simulation-example.py @@ -5,22 +5,19 @@ sys.path.insert(0, os.path.abspath('..')) import tsinfer -if False: +if True: ts = msprime.simulate( sample_size=10000, Ne=10**4, recombination_rate=1e-8, mutation_rate=1e-8, length=10*10**6, random_seed=42) ts.dump("simulation-source.trees") print("Simulation done:", ts.num_trees, "trees and", ts.num_sites) - progress = tqdm.tqdm(total=ts.num_sites) - sample_data = tsinfer.SampleData.initialise( - num_samples=ts.num_samples, sequence_length=ts.sequence_length, - path="simulation.samples", num_flush_threads=2) - for var in ts.variants(): - sample_data.add_site(var.site.position, var.alleles, var.genotypes) - progress.update() - progress.close() - sample_data.finalise() + with tsinfer.SampleData( + sequence_length=ts.sequence_length, path="simulation.samples", + num_flush_threads=2) as samples: + for var in tqdm.tqdm(ts.variants(), total=ts.num_sites): + samples.add_site(var.site.position, var.genotypes, var.alleles) + else: source = msprime.load("simulation-source.trees") inferred = msprime.load("simulation.trees") diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 92bda910..e984b58f 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -28,7 +28,7 @@ scope of this manual. Assuming that we know the ancestral state, we can then imp import tsinfer - with tsinfer.SampleData() as sample_data: + with tsinfer.SampleData(sequence_length=6) as sample_data: sample_data.add_site(0, [0, 1, 0, 0, 0], ["A", "T"]) sample_data.add_site(1, [0, 0, 0, 1, 1], ["G", "C"]) sample_data.add_site(2, [0, 1, 1, 0, 0], ["C", "A"]) diff --git a/tsinfer/formats.py b/tsinfer/formats.py index 7c1836de..8a1fde10 100644 --- a/tsinfer/formats.py +++ b/tsinfer/formats.py @@ -676,8 +676,8 @@ class SampleData(DataContainer): with tsinfer.SampleData(path="mydata.samples") as sample_data: # Define populations - sample_data.population(metadata={"name": "CEU"}) - sample_data.population(metadata={"name": "YRI"}) + sample_data.add_population(metadata={"name": "CEU"}) + sample_data.add_population(metadata={"name": "YRI"}) # Define individuals sample_data.add_individual( ploidy=2, population=0, metadata={"name": "NA12878"})