Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement keep_intervals method #635

Merged
merged 1 commit into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 206 additions & 0 deletions src/table_collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,17 @@ use crate::metadata::SiteMetadata;
use crate::sys::bindings as ll_bindings;
use crate::sys::TableCollection as LLTableCollection;
use crate::types::Bookmark;
use crate::EdgeTable;
use crate::IndividualTableSortOptions;
use crate::MigrationId;
use crate::MigrationTable;
use crate::MutationId;
use crate::MutationTable;
use crate::PopulationId;
use crate::Position;
use crate::SimplificationOptions;
use crate::SiteId;
use crate::SiteTable;
use crate::TableClearOptions;
use crate::TableEqualityOptions;
use crate::TableIntegrityCheckFlags;
Expand Down Expand Up @@ -1372,4 +1376,206 @@ impl TableCollection {
pub fn as_mut_ptr(&mut self) -> *mut ll_bindings::tsk_table_collection_t {
// Delegates to the wrapped `inner` value, which hands back the raw mutable
// pointer to the underlying `tsk_table_collection_t`.
self.inner.as_mut_ptr()
}

/// Truncate the [TableCollection] to specified genome intervals.
///
/// Consumes `self`, rebuilds the edge, migration, site and mutation tables
/// from the rows that fall inside `intervals`, sorts the result and,
/// optionally, simplifies it.
///
/// # Parameters
/// - `intervals`: iterator over `(start, end)` pairs convertible into
///   [Position]. Every pair must satisfy `start <= end`, and every `start`
///   must be at least the previous pair's `end` (the first `start` is
///   compared against position `0.0`).
///   Edges overlapping an interval are clipped to it; sites are kept when
///   their position lies in the half-open range `[start, end)`; mutations
///   are kept when their site is kept.
/// - `simplify`: when `true`, the sorted tables are simplified with respect
///   to the current samples using the default [SimplificationOptions].
///
/// # Return
/// - `Ok(None)`: when truncation leads to an empty edge table.
/// - `Ok(Some(TableCollection))`: when truncation is successfully performed
/// and results in a non-empty edge table.
/// - `Err(TskitError)`: Any errors from the C API propagate. A
/// [TskitError::RangeError] will occur when `intervals` are not
/// sorted. Note that as `tskit` currently does not support `simplify`
/// on a [TableCollection] with a non-empty migration table, calling
/// `keep_intervals` on such a [TableCollection] with `simplify` set to
/// `true` will return an error.
///
/// # Example
/// ```rust
/// # use tskit::*;
/// # let snode = NodeFlags::new_sample();
/// # let anode = NodeFlags::default();
/// # let pop = PopulationId::NULL;
/// # let ind = IndividualId::NULL;
/// # let seqlen = 100.0;
/// # let (t0, t10) = (0.0, 10.0);
/// # let (left, right) = (0.0, 100.0);
/// # let sim_opts = SimplificationOptions::default();
/// #
/// # let mut tables = TableCollection::new(seqlen).unwrap();
/// # let child1 = tables.add_node(snode, t0, pop, ind).unwrap();
/// # let child2 = tables.add_node(snode, t0, pop, ind).unwrap();
/// # let parent = tables.add_node(anode, t10, pop, ind).unwrap();
/// #
/// # tables.add_edge(left, right, parent, child1).unwrap();
/// # tables.add_edge(left, right, parent, child2).unwrap();
/// # tables.full_sort(TableSortOptions::all()).unwrap();
/// # tables.simplify(&[child1, child2], sim_opts, false).unwrap();
/// # tables.build_index().unwrap();
/// #
/// let intervals = [(0.0, 10.0), (90.0, 100.0)].into_iter();
/// tables.keep_intervals(intervals, true).unwrap().unwrap();
/// ```
///
/// Note that no new provenance will be appended.
pub fn keep_intervals<P>(
self,
intervals: impl Iterator<Item = (P, P)>,
simplify: bool,
) -> Result<Option<Self>, TskitError>
where
P: Into<Position>,
{
// Brings `.next()` / `.filter()` into scope for the lending iterators used below.
use streaming_iterator::StreamingIterator;
let mut tables = self;
// Build the replacement tables with the low-level `sys` types, which
// accept raw metadata bytes directly.
let options = 0;
let mut new_edges = crate::sys::EdgeTable::new(options)?;
let mut new_migrations = crate::sys::MigrationTable::new(options)?;
let mut new_sites = crate::sys::SiteTable::new(options)?;
let mut new_mutations = crate::sys::MutationTable::new(options)?;

// Old site id -> new site id; -1 marks a site that is not (yet) kept.
let mut site_map = vec![-1i32; tables.sites().num_rows().as_usize()];

// Flags indicating whether each (old) site will be kept in the new site table.
// NOTE(review): same length as `site_map`, but obtained through `try_into()?`
// instead of `as_usize()` — consider using one form for both.
let mut keep_sites = vec![false; tables.sites().num_rows().try_into()?];

// Previous interval, used for the sortedness check. Starting at (0, 0)
// means a first interval with a negative start is rejected as well.
let mut last_interval = (Position::from(0.0), Position::from(0.0));
for (s, e) in intervals {
let (s, e) = (s.into(), e.into());
// Make sure intervals are well-formed and sorted: reject reversed
// intervals and intervals starting before the previous one ended.
// Touching intervals (s == previous e) are accepted.
if (s > e) || (s < last_interval.1) {
return Err(TskitError::RangeError(
"intervals not valid or sorted".into(),
));
}
// Mark every site whose position lies in [s, e). The flag is OR-ed so
// that a site marked by an earlier interval stays marked. Every site
// row is visited once per interval.
keep_sites
.iter_mut()
.zip(tables.sites_iter())
.for_each(|(k, site_row)| {
*k = *k || ((site_row.position >= s) && (site_row.position < e));
});

// Use a streaming iterator and the while-let pattern so that the `?`
// operator can be used inside the loop body.
// The filter keeps edges overlapping [s, e): it drops those lying
// entirely to the left (right <= s) or entirely to the right (left >= e).
let mut edge_iter = tables
.edges()
.lending_iter()
.filter(|edge_row| !((edge_row.right <= s) || (edge_row.left >= e)));

while let Some(edge_row) = edge_iter.next() {
// Clip the edge to the interval: left = max(left, s), right = min(right, e).
// Missing metadata is written as an empty byte slice.
new_edges.add_row_with_metadata(
if edge_row.left < s { s } else { edge_row.left }.into(),
if edge_row.right > e {
e
} else {
edge_row.right
}
.into(),
edge_row.parent.into(),
edge_row.child.into(),
edge_row.metadata.unwrap_or(&[0u8; 0]),
)?;
}

// Same overlap test as for edges, applied to the migration table.
let mut migration_iter = tables
.migrations()
.lending_iter()
.filter(|mrow| !((mrow.right <= s) || (mrow.left >= e)));

while let Some(migration_row) = migration_iter.next() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tests do not cover this block. It may be useful to come up with something to cover this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I left migration table recording out of the simulation because it does not work well with simplification, as we can see from the tskit C API docs for tsk_table_collection_simplify (although the Python keep_intervals code does deal with the migration table):

Note
Migrations are currently not supported by simplify, and an error will be raised if we attempt call simplify on a table collection with greater than zero migrations. See tskit-dev/tskit#20

I can add a note to the Rust keep_intervals docs to reflect this.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fair. One could envision a test of manually-generated data that don't require sorting. But the reality is that that block of code is adding data already present in the tables. The only way for there to be an error is if someone loaded a table collection with invalid row data generated by another tool.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. I will add a test checking that if someone calls keep_intervals on a tree sequence that has a non-empty migration table, with simplify=true in the arguments, it returns a tskit error.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before implementing a manual check, just do the test. tskit-c will, I believe, set an error code for that case, so you can just ? on the simplify call.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before implementing a manual check, just do the test. tskit-c will, I believe, set an error code for that case, so you can just ? on the simplify call.

Sorry, I am not sure I understand your suggestion. Did you mean to add a test on the function that generates a tree sequence with a non-empty migration table, i.e. the generate_simple_treesequence function in src/test_fixtures.rs?

I thought I already used ? for the simplify call in the keep_intervals method for TableCollection. Am I missing something?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that I misunderstood your comment re: testing. I think that your tests are actually okay except for the issue of adding the migrations after simplification.

// NOTE(review): the migration row is copied with its original left/right;
// unlike the edge rows, the coordinates are not clipped to (s, e), and a
// migration overlapping two intervals is appended once per interval —
// confirm this is intended.
new_migrations.add_row_with_metadata(
(migration_row.left.into(), migration_row.right.into()),
migration_row.node.into(),
migration_row.source.into(),
migration_row.dest.into(),
migration_row.time.into(),
migration_row.metadata.unwrap_or(&[0u8; 0]),
)?;
}
// Remember this interval for the sortedness check of the next one.
last_interval = (s, e);
}

// Copy the kept sites in their original row order, handing out new
// consecutive ids and recording the old -> new mapping in `site_map`.
let mut running_site_id = 0;
let mut site_iter = tables.sites().lending_iter();
while let Some(site_row) = site_iter.next() {
let old_id = site_row.id.to_usize().unwrap();
if keep_sites[old_id] {
new_sites.add_row_with_metadata(
site_row.position.into(),
site_row.ancestral_state,
site_row.metadata.unwrap_or(&[0u8; 0]),
)?;
site_map[old_id] = running_site_id;
running_site_id += 1;
}
}

// Build mutation_map: old mutation row index -> new mutation row index.
// Each entry is (number of kept mutations seen so far) - 1, so for a kept
// mutation it is that mutation's row in the new table. For a dropped
// mutation the entry is the index of the most recent kept one (or -1)
// and is not a real mapping.
// NOTE(review): the lookup below is only meaningful if the parent of a
// kept mutation is itself kept (presumably because parent and child share
// a site) — confirm.
let mutation_map: Vec<_> = {
let mut n = 0;
tables
.mutations()
.site_slice()
.iter()
.map(|site| {
if keep_sites[site.as_usize()] {
n += 1
};
n - 1
})
.collect()
};

// Copy every mutation whose site is kept, rewriting its site id and,
// when it has one, its parent id.
let mut mutations_iter = tables.mutations().lending_iter();
while let Some(mutation_row) = mutations_iter.next() {
let old_id = mutation_row.site.to_usize().unwrap();
if keep_sites[old_id] {
let new_site = site_map[old_id];
let new_parent = {
if mutation_row.parent.is_null() {
// A null parent is passed through unchanged.
mutation_row.parent.into()
} else {
mutation_map[mutation_row.parent.as_usize()]
}
};
new_mutations.add_row_with_metadata(
new_site,
mutation_row.node.into(),
new_parent,
mutation_row.time.into(),
mutation_row.derived_state,
mutation_row.metadata.unwrap_or(&[0u8; 0]),
)?;
}
}

// Convert the sys versions of the tables to the non-sys versions
// expected by the `set_*` methods below.
let new_edges = EdgeTable::new_from_table(new_edges.as_mut())?;
let new_migrations = MigrationTable::new_from_table(new_migrations.as_mut())?;
let new_mutations = MutationTable::new_from_table(new_mutations.as_mut())?;
let new_sites = SiteTable::new_from_table(new_sites.as_mut())?;

// Replace the old tables with the new tables.
tables.set_edges(&new_edges).map(|_| ())?;
tables.set_migrations(&new_migrations).map(|_| ())?;
tables.set_mutations(&new_mutations).map(|_| ())?;
tables.set_sites(&new_sites)?;

// Sort the tables; rows were appended interval by interval, so they are
// not guaranteed to be in sorted order.
tables.full_sort(TableSortOptions::default())?;

// Simplify the tables with respect to the current samples. Any error from
// `simplify` (e.g. for a non-empty migration table, see the doc comment)
// is propagated by `?`.
if simplify {
let samples = tables.samples_as_vector();
tables.simplify(samples.as_slice(), SimplificationOptions::default(), false)?;
}

// Return None when the resulting edge table is empty.
if tables.edges().num_rows() == 0 {
Ok(None)
} else {
Ok(Some(tables))
}
}
}
Loading
Loading