Skip to content

Commit

Permalink
Merge pull request #566 from pyinat/root-taxon-filter
Browse files Browse the repository at this point in the history
Add root_id filter to taxon.make_tree()
  • Loading branch information
JWCook authored Jun 30, 2024
2 parents 70acd21 + 2cc0df7 commit 6ec2d4d
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 3 deletions.
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

### Models
* Add `User.annotated_observations_count` field
* Add `root_id` filter to `taxon.make_tree()` to explicitly set the root taxon instead of determining it automatically

### Rate limits, timeouts, and error handling
* Increase default request timeout from 10 to 20 seconds
Expand Down
30 changes: 27 additions & 3 deletions pyinaturalist/models/taxon.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from itertools import chain, groupby
from logging import getLogger
from typing import Any, Callable, Dict, Iterable, List, Optional

from pyinaturalist.constants import (
Expand Down Expand Up @@ -33,6 +34,8 @@
field,
)

logger = getLogger(__name__)


@define_model
class Taxon(BaseModel):
Expand Down Expand Up @@ -408,19 +411,23 @@ def make_tree(
taxa: Iterable[Taxon],
include_ranks: Optional[List[str]] = None,
sort_key: Optional[TaxonSortKey] = None,
root_id: Optional[int] = None,
) -> Taxon:
"""Organize a list of taxa into a taxonomic tree. Expects exactly one root taxon.
Args:
taxa: Taxon objects to organize
sort_key: Key function for sorting children; defaults to rank and name
include_ranks: If provided, only include taxa with these ranks; otherwise, include all ranks
root_id: ID of the root taxon; if provided, only that taxon and its descendants will
be included. Otherwise, the root taxon is determined automatically.
Returns:
Root taxon of the tree
"""
include_ranks = [r.lower() for r in include_ranks or []]
root = _find_root(taxa, include_ranks)
sort_key = sort_key if sort_key is not None else _sort_rank_name
root = _find_root(taxa, include_ranks, root_id)

# Group taxa by parent ID, including any ungrafted children added directly to root
taxa_by_parent: Dict[int, List[Taxon]] = _sort_groupby(taxa, key=lambda x: x.parent_id or -1)
Expand All @@ -444,16 +451,33 @@ def add_descendants(taxon, ancestors=None) -> Taxon:
return add_descendants(root)


def _find_root(taxa: Iterable[Taxon], include_ranks: Optional[List[str]] = None) -> Taxon:
def _find_root(
    taxa: Iterable[Taxon],
    include_ranks: Optional[List[str]] = None,
    root_id: Optional[int] = None,
) -> Taxon:
    """Find the root taxon of a list of taxa, optionally filtering by rank.

    Handles ungrafted and multiple root taxa by adding under a new root node.

    Args:
        taxa: Taxon objects to search
        include_ranks: If provided, a taxon is only accepted as the root if its rank is in this list
        root_id: ID of a specific taxon to use as the root. If it is not found in ``taxa``, or is
            excluded by ``include_ranks``, a warning is logged and the default root is used instead.

    Returns:
        The requested root taxon if usable; otherwise the taxon with ID ``ROOT_TAXON_ID`` if it is
        present and not filtered out; otherwise the result of ``_find_and_graft_root()``
    """
    # ``taxa`` is traversed more than once below (and again by the fallback helper), so
    # materialize it once; otherwise a one-shot iterator would be exhausted by the first pass
    taxa = list(taxa)

    # If a specific root taxon is requested, use that if possible
    if root_id is not None:
        root = next((t for t in taxa if t.id == root_id), None)
        if root and (not include_ranks or root.rank in include_ranks):
            return root
        logger.warning('Root taxon %s not found or filtered out; finding default root', root_id)

    # Typical case: exactly one root taxon ("Life")
    taxa_by_id = {t.id: t for t in taxa}
    if ROOT_TAXON_ID in taxa_by_id and (not include_ranks or 'stateofmatter' in include_ranks):
        return taxa_by_id[ROOT_TAXON_ID]

    # Otherwise, find the highest-ranked taxa and graft them under a new root
    return _find_and_graft_root(taxa, include_ranks)


# Otherwise, find the taxa with the highest rank
def _find_and_graft_root(taxa: Iterable[Taxon], include_ranks: Optional[List[str]] = None) -> Taxon:
taxa_by_id = {t.id: t for t in taxa}
max_rank = max(t.rank_level for t in taxa if not include_ranks or t.rank in include_ranks)
root_taxa = [t for t in taxa if t.rank_level == max_rank]

Expand Down
27 changes: 27 additions & 0 deletions test/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1240,6 +1240,33 @@ def test_make_tree__ungrafted():
assert root.children[1].name == 'Monocots'


def test_make_tree__explicit_root():
    """An explicitly requested root taxon that is present in the list becomes the root node"""
    tree = make_tree(Taxon.from_json_list(j_life_list_2), root_id=1)
    assert tree.id == 1

    # The requested root should have a single direct child
    children = tree.children
    assert len(children) == 1
    assert children[0].name == 'Arthropoda'


def test_make_tree__explicit_root_not_found():
    """A requested root ID that is absent from the list falls back to the default root"""
    missing_id = 12345
    tree = make_tree(Taxon.from_json_list(j_life_list_2), root_id=missing_id)

    # The default root is used instead of the missing one
    assert tree.id == ROOT_TAXON_ID
    assert len(tree.children) == 1


def test_make_tree__explicit_root_filtered_out():
    """A requested root excluded by the rank filter is replaced by the next root matching it"""
    rank_filter = ['family', 'genus', 'species']
    tree = make_tree(
        Taxon.from_json_list(j_life_list_2),
        root_id=1,
        include_ranks=rank_filter,
    )

    # The requested root (ID 1) is not returned; a family-rank taxon is the root instead
    assert tree.id == 47221
    assert tree.name == 'Apidae'
    assert tree.rank == 'family'
    assert len(tree.children) == 1


# Users
# --------------------

Expand Down

0 comments on commit 6ec2d4d

Please sign in to comment.