From fc3c174a3d028172d7b03d7f628f20b1f0243b2b Mon Sep 17 00:00:00 2001 From: Alice Butcher Date: Tue, 9 Apr 2024 11:04:36 +0100 Subject: [PATCH] feat: update the gbif taxonomy script - use canonical names, not vernacular - add "unknown [rank]" items when gbif's api doesn't have a parent available (e.g. actinopterygii has been removed in the backbone so they're all now under "unknown class") - add "gbif" column to taxonomy table to accommodate new non-gbif items (though it should be obvious because the ids are all 1,000,000,000 or above - this is not the best idea but the only thing that relies on it is a manually updated ui element, so it should be fine) - the taxonomy tree now shows species as well --- .../versions/8b76ef5c47ef_gbif_taxonomy.py | 29 +++++++ api/phenome10k/cli.py | 18 ++++- api/phenome10k/data/gbif.py | 80 ++++++++++++++++--- api/phenome10k/models/taxonomy.py | 10 +-- 4 files changed, 118 insertions(+), 19 deletions(-) create mode 100644 api/migrations/versions/8b76ef5c47ef_gbif_taxonomy.py diff --git a/api/migrations/versions/8b76ef5c47ef_gbif_taxonomy.py b/api/migrations/versions/8b76ef5c47ef_gbif_taxonomy.py new file mode 100644 index 0000000..473e8fe --- /dev/null +++ b/api/migrations/versions/8b76ef5c47ef_gbif_taxonomy.py @@ -0,0 +1,29 @@ +""" +Add gbif column to taxonomy. + +Revision ID: 8b76ef5c47ef +Revises: 01816b2fcaea +Create Date: 2024-04-05 16:19:09.627686 +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = '8b76ef5c47ef' +down_revision = '01816b2fcaea' +branch_labels = None +depends_on = None + + +def upgrade(): + with op.batch_alter_table('taxonomy', schema=None) as batch_op: + batch_op.add_column(sa.Column('gbif', sa.Boolean(), nullable=True)) + + op.execute("update taxonomy set gbif = 't'") + op.alter_column('taxonomy', 'gbif', nullable=False) + + +def downgrade(): + with op.batch_alter_table('taxonomy', schema=None) as batch_op: + batch_op.drop_column('gbif') diff --git a/api/phenome10k/cli.py b/api/phenome10k/cli.py index 4707af6..82e036b 100644 --- a/api/phenome10k/cli.py +++ b/api/phenome10k/cli.py @@ -5,6 +5,7 @@ from phenome10k.extensions import db, security from phenome10k.models import User, Scan, Taxonomy from datetime import datetime as dt +from sqlalchemy import select def create_cli_app(info): @@ -51,10 +52,19 @@ def update_gbif_tags(): Updates taxonomy tags from gbif backbone and deletes unused ones. """ click.echo('Updating tags:') - for scan in Scan.query.filter(Scan.gbif_species_id).all(): - tags = [db.session.merge(tag) for tag in pull_tags(scan.gbif_species_id)] - scan.taxonomy = tags - click.echo(' - ' + scan.scientific_name) + species_ids = db.session.execute( + select(Scan.gbif_species_id) + .where(Scan.gbif_species_id.isnot(None)) + .group_by(Scan.gbif_species_id) + ) + for sid in species_ids: + tags = [db.session.merge(tag) for tag in pull_tags(sid[0])] + db.session.commit() + if len(tags) == 0: + continue + for scan in Scan.query.filter(Scan.gbif_species_id == sid[0]): + scan.taxonomy = tags + click.echo(' - ' + tags[-1].name) click.echo('Deleting tags:') for tax in Taxonomy.query.filter(db.not_(Taxonomy.scans.any())): diff --git a/api/phenome10k/data/gbif.py b/api/phenome10k/data/gbif.py index 732d78d..cfc2e67 100644 --- a/api/phenome10k/data/gbif.py +++ b/api/phenome10k/data/gbif.py @@ -1,6 +1,7 @@ import requests from phenome10k.models import Taxonomy +from sqlalchemy import and_ def fetch_json(url): @@ -15,19 +16,80 @@ def 
pull_tags(gbif_species_id): if not validate_id('species', gbif_species_id): return [] - gbif_api_url = 'https://api.gbif.org/v1/species/' + str(gbif_species_id) + species_url = 'https://api.gbif.org/v1/species/' + gbif_api_url = species_url + str(gbif_species_id) gbif_api_parents = gbif_api_url + '/parents' - tags = fetch_json(gbif_api_parents) + [fetch_json(gbif_api_url)] + parent_taxa = fetch_json(gbif_api_parents) + taxon = fetch_json(gbif_api_url) + tags = [] - return [ - Taxonomy( - id=tag['key'], - name=tag.get('vernacularName', tag['canonicalName']), - parent_id=tag.get('parentKey'), + def _make_taxonomy_model(json_taxon, is_gbif=True): + return Taxonomy( + id=json_taxon['key'], + name=json_taxon['canonicalName'], + parent_id=json_taxon.get('parentKey'), + gbif=is_gbif, ) - for tag in tags - ] + + # sense check rank vs number of parents + # variety and forma have not been tested; subspecies has + ranks = ['KINGDOM', 'PHYLUM', 'CLASS', 'ORDER', 'FAMILY', 'GENUS', 'SPECIES'] + + if ( + taxon['rank'] == 'SUBSPECIES' + and len(parent_taxa) < len(ranks) + and taxon.get('speciesKey') + ): + # sometimes the parent for subspecies is genus not species, so add the species manually + parent_taxa.append(fetch_json(species_url + str(taxon['speciesKey']))) + + try: + expected_parents = ranks.index(taxon['rank']) + except ValueError: + if taxon['rank'] in ['SUBSPECIES', 'VARIETY', 'FORMA']: + expected_parents = len(ranks) + else: + return [] + if len(parent_taxa) == expected_parents: + tags += [_make_taxonomy_model(t) for t in parent_taxa] + if expected_parents > len(parent_taxa): + for r in ranks[:expected_parents]: + previous_parent = tags[-1].id if len(tags) > 0 else None + try: + parent_taxon = next(t for t in parent_taxa if t['rank'] == r) + parent = _make_taxonomy_model(parent_taxon) + except StopIteration: + # try and find an existing child + parent = Taxonomy.query.filter( + and_(Taxonomy.parent_id == previous_parent, Taxonomy.gbif == False) + ).first() + if not 
parent: + # create a new item with a very large id + current_max = ( + Taxonomy.query.filter( + and_(Taxonomy.id >= 1000000000, Taxonomy.gbif == False) + ) + .order_by(Taxonomy.id.desc()) + .first() + ) + if current_max: + new_id = ( + current_max.id + 1 + if previous_parent < current_max.id + else previous_parent + 1 + ) + else: + new_id = 1000000000 + parent = _make_taxonomy_model( + {'key': new_id, 'canonicalName': f'Unknown {r.lower()}'}, False + ) + parent.parent_id = previous_parent + tags.append(parent) + + tags.append(_make_taxonomy_model(taxon)) + + return tags def validate_id(gbif_type, gbif_id): diff --git a/api/phenome10k/models/taxonomy.py b/api/phenome10k/models/taxonomy.py index 61c726f..e77a640 100644 --- a/api/phenome10k/models/taxonomy.py +++ b/api/phenome10k/models/taxonomy.py @@ -5,6 +5,7 @@ class Taxonomy(db.Model): id = db.Column(db.Integer, primary_key=True) name = db.Column(db.String(250), nullable=False) parent_id = db.Column(db.Integer, db.ForeignKey('taxonomy.id')) + gbif = db.Column(db.Boolean, nullable=False, default=True) children = db.relationship('Taxonomy') @@ -19,12 +20,9 @@ def serialize_tree(self, depth=float('inf')): data = self.serialize() if depth > 0: - if len(self.children) == 1: - data['children'] = self.children[0].serialize_tree(depth)['children'] - else: - data['children'] = [ - child.serialize_tree(depth - 1) for child in self.children - ] + data['children'] = [ + child.serialize_tree(depth - 1) for child in self.children + ] else: data['children'] = [] return data