Skip to content

Commit

Permalink
merge: #181 from ginger/gbif-tags
Browse files Browse the repository at this point in the history
  • Loading branch information
alycejenni authored Apr 9, 2024
2 parents 7bbffb1 + fc3c174 commit ea2faba
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 19 deletions.
29 changes: 29 additions & 0 deletions api/migrations/versions/8b76ef5c47ef_gbif_taxonomy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
Add gbif column to taxonomy.
Revision ID: 8b76ef5c47ef
Revises: 01816b2fcaea
Create Date: 2024-04-05 16:19:09.627686
"""
from alembic import op
import sqlalchemy as sa


# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the migration
# it revises (they mirror "Revision ID" / "Revises" in the module docstring).
revision = '8b76ef5c47ef'
down_revision = '01816b2fcaea'
# No branch labels or dependencies on other revisions are declared.
branch_labels = None
depends_on = None


def upgrade():
    """
    Add a non-nullable boolean ``gbif`` column to the ``taxonomy`` table.

    The column is created as nullable, every existing row is backfilled with
    true, and only then is the NOT NULL constraint applied, so the migration
    succeeds on tables that already contain rows.

    All schema changes go through ``batch_alter_table`` (matching
    ``downgrade``) so the migration also works on backends that need batch
    table recreation to alter a column, such as SQLite.
    """
    # Lightweight table stub so the backfill is rendered by SQLAlchemy with
    # the backend's own boolean literal rather than a PostgreSQL-style 't'.
    taxonomy = sa.table('taxonomy', sa.column('gbif', sa.Boolean()))

    with op.batch_alter_table('taxonomy', schema=None) as batch_op:
        batch_op.add_column(sa.Column('gbif', sa.Boolean(), nullable=True))

    # Backfill existing rows before tightening the constraint.
    op.execute(taxonomy.update().values(gbif=sa.true()))

    with op.batch_alter_table('taxonomy', schema=None) as batch_op:
        batch_op.alter_column(
            'gbif', existing_type=sa.Boolean(), nullable=False
        )


def downgrade():
    """
    Revert the migration by dropping the ``gbif`` column from ``taxonomy``.
    """
    with op.batch_alter_table('taxonomy', schema=None) as taxonomy_batch:
        taxonomy_batch.drop_column('gbif')
18 changes: 14 additions & 4 deletions api/phenome10k/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from phenome10k.extensions import db, security
from phenome10k.models import User, Scan, Taxonomy
from datetime import datetime as dt
from sqlalchemy import select


def create_cli_app(info):
Expand Down Expand Up @@ -51,10 +52,19 @@ def update_gbif_tags():
Updates taxonomy tags from gbif backbone and deletes unused ones.
"""
click.echo('Updating tags:')
for scan in Scan.query.filter(Scan.gbif_species_id).all():
tags = [db.session.merge(tag) for tag in pull_tags(scan.gbif_species_id)]
scan.taxonomy = tags
click.echo(' - ' + scan.scientific_name)
species_ids = db.session.execute(
select(Scan.gbif_species_id)
.where(Scan.gbif_species_id.isnot(None))
.group_by(Scan.gbif_species_id)
)
for sid in species_ids:
tags = [db.session.merge(tag) for tag in pull_tags(sid[0])]
db.session.commit()
if len(tags) == 0:
continue
for scan in Scan.query.filter(Scan.gbif_species_id == sid[0]):
scan.taxonomy = tags
click.echo(' - ' + tags[-1].name)

click.echo('Deleting tags:')
for tax in Taxonomy.query.filter(db.not_(Taxonomy.scans.any())):
Expand Down
80 changes: 71 additions & 9 deletions api/phenome10k/data/gbif.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import requests

from phenome10k.models import Taxonomy
from sqlalchemy import and_


def fetch_json(url):
Expand All @@ -15,19 +16,80 @@ def pull_tags(gbif_species_id):
if not validate_id('species', gbif_species_id):
return []

gbif_api_url = 'https://api.gbif.org/v1/species/' + str(gbif_species_id)
species_url = 'https://api.gbif.org/v1/species/'
gbif_api_url = species_url + str(gbif_species_id)
gbif_api_parents = gbif_api_url + '/parents'

tags = fetch_json(gbif_api_parents) + [fetch_json(gbif_api_url)]
parent_taxa = fetch_json(gbif_api_parents)
taxon = fetch_json(gbif_api_url)
tags = []

return [
Taxonomy(
id=tag['key'],
name=tag.get('vernacularName', tag['canonicalName']),
parent_id=tag.get('parentKey'),
def _make_taxonomy_model(json_taxon, is_gbif=True):
return Taxonomy(
id=json_taxon['key'],
name=json_taxon['canonicalName'],
parent_id=json_taxon.get('parentKey'),
gbif=is_gbif,
)
for tag in tags
]

# sense check rank vs number of parents
# variety and forma have not been tested; subspecies has
ranks = ['KINGDOM', 'PHYLUM', 'CLASS', 'ORDER', 'FAMILY', 'GENUS', 'SPECIES']

if (
taxon['rank'] == 'SUBSPECIES'
and len(parent_taxa) < len(ranks)
and taxon.get('speciesKey')
):
# sometimes the parent for subspecies is genus not species, so add the species manually
parent_taxa.append(fetch_json(species_url + str(taxon['speciesKey'])))

try:
expected_parents = ranks.index(taxon['rank'])
except ValueError:
if taxon['rank'] in ['SUBSPECIES', 'VARIETY', 'FORMA']:
expected_parents = len(ranks)
else:
return []
if len(parent_taxa) == expected_parents:
tags += [_make_taxonomy_model(t) for t in parent_taxa]
if expected_parents > len(parent_taxa):
for r in ranks[:expected_parents]:
previous_parent = tags[-1].id if len(tags) > 0 else None
try:
parent_taxon = next(t for t in parent_taxa if t['rank'] == r)
parent = _make_taxonomy_model(parent_taxon)
except StopIteration:
# try and find an existing child
parent = Taxonomy.query.filter(
and_(Taxonomy.parent_id == previous_parent, Taxonomy.gbif == False)
).first()
if not parent:
# create a new item with a very large id
current_max = (
Taxonomy.query.filter(
and_(Taxonomy.id >= 1000000000, Taxonomy.gbif == False)
)
.order_by(Taxonomy.id.desc())
.first()
)
if current_max:
new_id = (
current_max.id + 1
if previous_parent < current_max.id
else previous_parent + 1
)
else:
new_id = 1000000000
parent = _make_taxonomy_model(
{'key': new_id, 'canonicalName': f'Unknown {r.lower()}'}, False
)
parent.parent_id = previous_parent
tags.append(parent)

tags.append(_make_taxonomy_model(taxon))

return tags


def validate_id(gbif_type, gbif_id):
Expand Down
10 changes: 4 additions & 6 deletions api/phenome10k/models/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ class Taxonomy(db.Model):
id = db.Column(db.Integer, primary_key=True)
name = db.Column(db.String(250), nullable=False)
parent_id = db.Column(db.Integer, db.ForeignKey('taxonomy.id'))
gbif = db.Column(db.Boolean, nullable=False, default=True)

children = db.relationship('Taxonomy')

Expand All @@ -19,12 +20,9 @@ def serialize_tree(self, depth=float('inf')):
data = self.serialize()

if depth > 0:
if len(self.children) == 1:
data['children'] = self.children[0].serialize_tree(depth)['children']
else:
data['children'] = [
child.serialize_tree(depth - 1) for child in self.children
]
data['children'] = [
child.serialize_tree(depth - 1) for child in self.children
]
else:
data['children'] = []
return data
Expand Down

0 comments on commit ea2faba

Please sign in to comment.