merge: #181 from ginger/gbif-tags

NaturalHistoryMuseum · Apr 9, 2024 · ea2faba · ea2faba
2 parents 7bbffb1 + fc3c174
commit ea2faba
Show file tree

Hide file tree

Showing 4 changed files with 118 additions and 19 deletions.
diff --git a/api/migrations/versions/8b76ef5c47ef_gbif_taxonomy.py b/api/migrations/versions/8b76ef5c47ef_gbif_taxonomy.py
@@ -0,0 +1,29 @@
+"""
+Add gbif column to taxonomy.
+
+Revision ID: 8b76ef5c47ef
+Revises: 01816b2fcaea
+Create Date: 2024-04-05 16:19:09.627686
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '8b76ef5c47ef'
+down_revision = '01816b2fcaea'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    with op.batch_alter_table('taxonomy', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('gbif', sa.Boolean(), nullable=True))
+
+    op.execute("update taxonomy set gbif = 't'")
+    op.alter_column('taxonomy', 'gbif', nullable=False)
+
+
+def downgrade():
+    with op.batch_alter_table('taxonomy', schema=None) as batch_op:
+        batch_op.drop_column('gbif')
diff --git a/api/phenome10k/cli.py b/api/phenome10k/cli.py
@@ -5,6 +5,7 @@
 from phenome10k.extensions import db, security
 from phenome10k.models import User, Scan, Taxonomy
 from datetime import datetime as dt
+from sqlalchemy import select
 
 
 def create_cli_app(info):
@@ -51,10 +52,19 @@ def update_gbif_tags():
     Updates taxonomy tags from gbif backbone and deletes unused ones.
     """
     click.echo('Updating tags:')
-    for scan in Scan.query.filter(Scan.gbif_species_id).all():
-        tags = [db.session.merge(tag) for tag in pull_tags(scan.gbif_species_id)]
-        scan.taxonomy = tags
-        click.echo(' - ' + scan.scientific_name)
+    species_ids = db.session.execute(
+        select(Scan.gbif_species_id)
+        .where(Scan.gbif_species_id.isnot(None))
+        .group_by(Scan.gbif_species_id)
+    )
+    for sid in species_ids:
+        tags = [db.session.merge(tag) for tag in pull_tags(sid[0])]
+        db.session.commit()
+        if len(tags) == 0:
+            continue
+        for scan in Scan.query.filter(Scan.gbif_species_id == sid[0]):
+            scan.taxonomy = tags
+        click.echo(' - ' + tags[-1].name)
 
     click.echo('Deleting tags:')
     for tax in Taxonomy.query.filter(db.not_(Taxonomy.scans.any())):

diff --git a/api/phenome10k/data/gbif.py b/api/phenome10k/data/gbif.py
@@ -1,6 +1,7 @@
 import requests
 
 from phenome10k.models import Taxonomy
+from sqlalchemy import and_
 
 
 def fetch_json(url):
@@ -15,19 +16,80 @@ def pull_tags(gbif_species_id):
     if not validate_id('species', gbif_species_id):
         return []
 
-    gbif_api_url = 'https://api.gbif.org/v1/species/' + str(gbif_species_id)
+    species_url = 'https://api.gbif.org/v1/species/'
+    gbif_api_url = species_url + str(gbif_species_id)
     gbif_api_parents = gbif_api_url + '/parents'
 
-    tags = fetch_json(gbif_api_parents) + [fetch_json(gbif_api_url)]
+    parent_taxa = fetch_json(gbif_api_parents)
+    taxon = fetch_json(gbif_api_url)
+    tags = []
 
-    return [
-        Taxonomy(
-            id=tag['key'],
-            name=tag.get('vernacularName', tag['canonicalName']),
-            parent_id=tag.get('parentKey'),
+    def _make_taxonomy_model(json_taxon, is_gbif=True):
+        """
+        Build a Taxonomy instance from a taxon dict.
+
+        :param json_taxon: mapping with 'key' and 'canonicalName' entries and an
+            optional 'parentKey' entry (parent_id is None when it is absent)
+        :param is_gbif: value stored in the model's gbif field
+        :returns: a new Taxonomy object
+        """
+        return Taxonomy(
+            id=json_taxon['key'],
+            name=json_taxon['canonicalName'],
+            parent_id=json_taxon.get('parentKey'),
+            gbif=is_gbif,
         )
-        for tag in tags
-    ]
+
+    # sense check rank vs number of parents
+    # variety and forma have not been tested; subspecies has
+    ranks = ['KINGDOM', 'PHYLUM', 'CLASS', 'ORDER', 'FAMILY', 'GENUS', 'SPECIES']
+
+    if (
+        taxon['rank'] == 'SUBSPECIES'
+        and len(parent_taxa) < len(ranks)
+        and taxon.get('speciesKey')
+    ):
+        # sometimes the parent for subspecies is genus not species, so add the species manually
+        parent_taxa.append(fetch_json(species_url + str(taxon['speciesKey'])))
+
+    try:
+        expected_parents = ranks.index(taxon['rank'])
+    except ValueError:
+        if taxon['rank'] in ['SUBSPECIES', 'VARIETY', 'FORMA']:
+            expected_parents = len(ranks)
+        else:
+            return []
+    if len(parent_taxa) == expected_parents:
+        tags += [_make_taxonomy_model(t) for t in parent_taxa]
+    if expected_parents > len(parent_taxa):
+        for r in ranks[:expected_parents]:
+            previous_parent = tags[-1].id if len(tags) > 0 else None
+            try:
+                parent_taxon = next(t for t in parent_taxa if t['rank'] == r)
+                parent = _make_taxonomy_model(parent_taxon)
+            except StopIteration:
+                # try and find an existing child
+                parent = Taxonomy.query.filter(
+                    and_(Taxonomy.parent_id == previous_parent, Taxonomy.gbif == False)
+                ).first()
+                if not parent:
+                    # create a new item with a very large id
+                    current_max = (
+                        Taxonomy.query.filter(
+                            and_(Taxonomy.id >= 1000000000, Taxonomy.gbif == False)
+                        )
+                        .order_by(Taxonomy.id.desc())
+                        .first()
+                    )
+                    if current_max:
+                        new_id = (
+                            current_max.id + 1
+                            if previous_parent < current_max.id
+                            else previous_parent + 1
+                        )
+                    else:
+                        new_id = 1000000000
+                    parent = _make_taxonomy_model(
+                        {'key': new_id, 'canonicalName': f'Unknown {r.lower()}'}, False
+                    )
+            parent.parent_id = previous_parent
+            tags.append(parent)
+
+    tags.append(_make_taxonomy_model(taxon))
+
+    return tags
 
 
 def validate_id(gbif_type, gbif_id):

diff --git a/api/phenome10k/models/taxonomy.py b/api/phenome10k/models/taxonomy.py
@@ -5,6 +5,7 @@ class Taxonomy(db.Model):
     id = db.Column(db.Integer, primary_key=True)
     name = db.Column(db.String(250), nullable=False)
     parent_id = db.Column(db.Integer, db.ForeignKey('taxonomy.id'))
+    gbif = db.Column(db.Boolean, nullable=False, default=True)
 
     children = db.relationship('Taxonomy')
 
@@ -19,12 +20,9 @@ def serialize_tree(self, depth=float('inf')):
         data = self.serialize()
 
         if depth > 0:
-            if len(self.children) == 1:
-                data['children'] = self.children[0].serialize_tree(depth)['children']
-            else:
-                data['children'] = [
-                    child.serialize_tree(depth - 1) for child in self.children
-                ]
+            data['children'] = [
+                child.serialize_tree(depth - 1) for child in self.children
+            ]
         else:
             data['children'] = []
         return data