From 15cd240cc628eb1431a626058ffbb976eea9874c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20D=C3=B6ring?= Date: Mon, 13 Jan 2025 19:07:36 +0100 Subject: [PATCH] Prevent highly ambiguous genera without well-defined parents from being merged, see https://github.com/CatalogueOfLife/backend/issues/1368 --- .../catalogue/matching/TaxGroupAnalyzer.java | 2 +- .../catalogue/assembly/TreeMergeHandler.java | 23 ++++++++++++- .../matching/MatchedParentStack.java | 7 ++++ .../catalogue/assembly/SectorSyncMergeIT.java | 1 + .../test/resources/txtree/doryphora/3i.txtree | 8 +++++ .../resources/txtree/doryphora/coleo.txtree | 8 +++++ .../resources/txtree/doryphora/coleo.yaml | 6 ++++ .../txtree/doryphora/expected.txtree | 34 +++++++++++++++++++ .../resources/txtree/doryphora/pbdb.txtree | 10 ++++++ .../resources/txtree/doryphora/project.txtree | 27 +++++++++++++++ .../test/resources/txtree/doryphora/readme.md | 3 ++ .../resources/txtree/doryphora/wcvp.txtree | 3 ++ .../resources/txtree/doryphora/worms.txtree | 6 ++++ .../resources/txtree/doryphora/zoobank.txtree | 1 + 14 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 webservice/src/test/resources/txtree/doryphora/3i.txtree create mode 100644 webservice/src/test/resources/txtree/doryphora/coleo.txtree create mode 100644 webservice/src/test/resources/txtree/doryphora/coleo.yaml create mode 100644 webservice/src/test/resources/txtree/doryphora/expected.txtree create mode 100644 webservice/src/test/resources/txtree/doryphora/pbdb.txtree create mode 100644 webservice/src/test/resources/txtree/doryphora/project.txtree create mode 100644 webservice/src/test/resources/txtree/doryphora/readme.md create mode 100644 webservice/src/test/resources/txtree/doryphora/wcvp.txtree create mode 100644 webservice/src/test/resources/txtree/doryphora/worms.txtree create mode 100644 webservice/src/test/resources/txtree/doryphora/zoobank.txtree diff --git a/dao/src/main/java/life/catalogue/matching/TaxGroupAnalyzer.java 
b/dao/src/main/java/life/catalogue/matching/TaxGroupAnalyzer.java index 1945b0e53..f19f251fa 100644 --- a/dao/src/main/java/life/catalogue/matching/TaxGroupAnalyzer.java +++ b/dao/src/main/java/life/catalogue/matching/TaxGroupAnalyzer.java @@ -22,7 +22,7 @@ public class TaxGroupAnalyzer { private static final Logger LOG = LoggerFactory.getLogger(TaxGroupAnalyzer.class); - final TaxGroupParser parser = TaxGroupParser.PARSER; + private static final TaxGroupParser parser = TaxGroupParser.PARSER; private static final Pattern YEAR_PATTERN = Pattern.compile("([12]\\d{3})"); private static final Pattern BAS_COMB_PATTERN = Pattern.compile("\\(\\s*[A-Z].+\\)[^()]*[A-Z]"); // map of suffix to groups, sorted by suffix length diff --git a/webservice/src/main/java/life/catalogue/assembly/TreeMergeHandler.java b/webservice/src/main/java/life/catalogue/assembly/TreeMergeHandler.java index 0c14fe7d6..a286232a6 100644 --- a/webservice/src/main/java/life/catalogue/assembly/TreeMergeHandler.java +++ b/webservice/src/main/java/life/catalogue/assembly/TreeMergeHandler.java @@ -39,6 +39,7 @@ public class TreeMergeHandler extends TreeBaseHandler { private static final Set LOW_RANKS = Set.of(Rank.FAMILY, Rank.SUBFAMILY, Rank.TRIBE, Rank.GENUS); private final MatchedParentStack parents; private final UsageMatcherGlobal matcher; + private final TaxGroupAnalyzer groupAnalyzer; private final UsageCache uCache; private final CacheLoader loader; private int counter = 0; // all source usages @@ -61,6 +62,7 @@ public class TreeMergeHandler extends TreeBaseHandler { this.vKey = DSID.root(sourceDatasetKey); this.matcher = matcher; uCache = matcher.getUCache(); + groupAnalyzer = new TaxGroupAnalyzer(); // figure out the lowest insertion point in the project/release // a) a target is given @@ -320,7 +322,7 @@ public void acceptThrowsNoCatch(NameUsageBase nu) throws Exception { } else { // *** CREATE *** - if ( nu.isTaxon() && syncTaxa || nu.isSynonym() && syncSynonyms) { + if ( nu.isTaxon() && 
syncTaxa && !isAmbiguousGenus(nu) || nu.isSynonym() && syncSynonyms) { sn = create(nu, parent); } } @@ -328,6 +330,25 @@ public void acceptThrowsNoCatch(NameUsageBase nu) throws Exception { processEnd(sn, mod); } + /** + * Detects unqualified genus usages without authorship + * which are placed under no parents or Biota and similar parents which have no taxonomic group at all. + * + * These ambiguous genera often cause trouble as they match later on to pretty much anything alike + * and also adopt (wrong) authorships. + */ + private boolean isAmbiguousGenus(NameUsageBase nu) { + if (nu.getRank() == Rank.GENUS) { + var psn = parents.matchedParentsOnlySN(); + var group = groupAnalyzer.analyze(nu.toSimpleNameLink(), psn); + if (group == null || group.equals(TaxGroup.Eukaryotes)) { + LOG.info("Ignore canonical genus {} with vague parents: {}", nu.getLabel(), psn); + return true; + } + } + return false; + } + public void acceptName(Name n) throws InterruptedException { try { acceptNameThrowsNoCatch(n); diff --git a/webservice/src/main/java/life/catalogue/matching/MatchedParentStack.java b/webservice/src/main/java/life/catalogue/matching/MatchedParentStack.java index e0c8102da..a05ebc371 100644 --- a/webservice/src/main/java/life/catalogue/matching/MatchedParentStack.java +++ b/webservice/src/main/java/life/catalogue/matching/MatchedParentStack.java @@ -163,6 +163,13 @@ public LinkedList matchedParentsOnly(String... excludeIDs) { return parents.stream().filter(u -> u.match != null && !exclusion.contains(u.match.getId())).collect(Collectors.toCollection(LinkedList::new)); } + public List matchedParentsOnlySN() { + return parents.stream() + .filter(u -> u.match != null) + .map(u -> u.usage) + .collect(Collectors.toList()); + } + public MatchedUsage secondLast() { return parents.isEmpty() ? 
null : parents.get(parents.size()-2); } diff --git a/webservice/src/test/java/life/catalogue/assembly/SectorSyncMergeIT.java b/webservice/src/test/java/life/catalogue/assembly/SectorSyncMergeIT.java index 5e372ce60..13b86991a 100644 --- a/webservice/src/test/java/life/catalogue/assembly/SectorSyncMergeIT.java +++ b/webservice/src/test/java/life/catalogue/assembly/SectorSyncMergeIT.java @@ -83,6 +83,7 @@ public class SectorSyncMergeIT extends SectorSyncTestBase { @Parameterized.Parameters public static Collection data() { return Arrays.asList(new Object[][] { + {"doryphora", List.of("worms", "wcvp", "3i", "coleo", "pbdb", "zoobank")}, {"bolyeriidae", List.of("itis", "reptiledb", "uksi", "pbdb")}, {"myosotis", List.of("taxref", "uksi", "pbdb", "bavaria")}, {"tetralobus", List.of("wfo", "bouchard", "plazi")}, diff --git a/webservice/src/test/resources/txtree/doryphora/3i.txtree b/webservice/src/test/resources/txtree/doryphora/3i.txtree new file mode 100644 index 000000000..9ded7cc59 --- /dev/null +++ b/webservice/src/test/resources/txtree/doryphora/3i.txtree @@ -0,0 +1,8 @@ +Auchenorrhyncha Duméril, 1805 [suborder] + Fulgoromorpha Evans, 1946 [infraorder] + Fulgoroidea Latreille, 1807 [superfamily] + Dictyopharidae Spinola, 1839 [family] + Dictyopharinae Spinola, 1839 [subfamily] + Dictyopharini Spinola, 1839 [tribe] + Doryphorina Melichar, 1912 [genus] + =Doryphora Melichar, 1912 [genus] diff --git a/webservice/src/test/resources/txtree/doryphora/coleo.txtree b/webservice/src/test/resources/txtree/doryphora/coleo.txtree new file mode 100644 index 000000000..c76e850f4 --- /dev/null +++ b/webservice/src/test/resources/txtree/doryphora/coleo.txtree @@ -0,0 +1,8 @@ +Coleoptera [order] + Polyphaga [suborder] + Cucujiformia [series] + Chrysomeloidea Latreille, 1802 [superfamily] + Chrysomelidae Latreille, 1802 [family] + Chrysomelinae Latreille, 1802 [subfamily] + Chrysomelini Latreille, 1802 [tribe] + Doryphora Illiger, 1807 [genus] diff --git 
a/webservice/src/test/resources/txtree/doryphora/coleo.yaml b/webservice/src/test/resources/txtree/doryphora/coleo.yaml new file mode 100644 index 000000000..9c377612b --- /dev/null +++ b/webservice/src/test/resources/txtree/doryphora/coleo.yaml @@ -0,0 +1,6 @@ +ranks: + - superfamily + - family + - subfamily + - genus + - species \ No newline at end of file diff --git a/webservice/src/test/resources/txtree/doryphora/expected.txtree b/webservice/src/test/resources/txtree/doryphora/expected.txtree new file mode 100644 index 000000000..0e868cdb7 --- /dev/null +++ b/webservice/src/test/resources/txtree/doryphora/expected.txtree @@ -0,0 +1,34 @@ +Biota [unranked] + Animalia [kingdom] + Arthropoda [phylum] + Insecta [class] + Coleoptera [order] + Chrysomeloidea Latreille, 1802 [superfamily] + Chrysomelidae Latreille, 1802 [family] + Chrysomelinae Latreille, 1802 [subfamily] + Doryphora Illiger, 1807 [genus] + Hemiptera Linnaeus, 1758 [order] + Auchenorrhyncha Duméril, 1805 [suborder] + Fulgoromorpha Evans, 1946 [infraorder] + Fulgoroidea [superfamily] + Dictyopharidae Spinola, 1839 [family] + Doryphorina Melichar, 1912 [genus] + =Doryphora Melichar, 1912 [genus] + Lepidoptera [order] + Gelechioidea [superfamily] + Gelechiidae [family] + Anomologinae [subfamily] + Xystophora Wocke, 1876 [genus] + =Doryphora Heinemann, 1870 [genus] + Zygaenoidea Latreille, 1809 [superfamily] + Limacodidae [family] + Doratifera (&Westwood),1841 Duncan [genus] + =Doryphora (&Westwood),1841 Duncan [genus] + Plantae [kingdom] + Pteridobiotina Britton & Brown [subkingdom] + Tracheophyta [phylum] + Magnoliopsida [class] + Laurales Juss. ex Bercht. & J.Presl [order] + Atherospermataceae R.Br. [family] + Doryphora Endl. [genus] + =Doratophora Lem. 
[genus] \ No newline at end of file diff --git a/webservice/src/test/resources/txtree/doryphora/pbdb.txtree b/webservice/src/test/resources/txtree/doryphora/pbdb.txtree new file mode 100644 index 000000000..55cc07fe0 --- /dev/null +++ b/webservice/src/test/resources/txtree/doryphora/pbdb.txtree @@ -0,0 +1,10 @@ +Life [unranked] + Eukaryota (Chatton, 1925) [unranked] + Plantae (Haeckel, 1866) [kingdom] + Spermatophyta [phylum] + Angiospermae [class] + Mesangiosperms The Angiosperm Phylogeny Group, 2016 [unranked] + Magnoliid Jud, 2011 [unranked] + Laurales Jussieu, 1820 [order] + Atherospermataceae Brown, 1814 [family] + Doryphora Endlicher, 1837 [genus] \ No newline at end of file diff --git a/webservice/src/test/resources/txtree/doryphora/project.txtree b/webservice/src/test/resources/txtree/doryphora/project.txtree new file mode 100644 index 000000000..00ababab2 --- /dev/null +++ b/webservice/src/test/resources/txtree/doryphora/project.txtree @@ -0,0 +1,27 @@ +Biota [unranked] + Animalia [kingdom] + Arthropoda [phylum] + Insecta [class] + Coleoptera [order] + Chrysomeloidea Latreille, 1802 [superfamily] + Hemiptera Linnaeus, 1758 [order] + Auchenorrhyncha Duméril, 1805 [suborder] + Fulgoromorpha Evans, 1946 [infraorder] + Fulgoroidea [superfamily] + Dictyopharidae Spinola, 1839 [family] + Lepidoptera [order] + Gelechioidea [superfamily] + Gelechiidae [family] + Anomologinae [subfamily] + Xystophora Wocke, [1876] [genus] + =Doryphora Heinemann, 1870 [genus] + Zygaenoidea Latreille, 1809 [superfamily] + Limacodidae [family] + Doratifera Duncan [& Westwood], 1841 [genus] + =Doryphora Duncan [& Westwood], 1841 [genus] + Plantae [kingdom] + Pteridobiotina Britton & Brown [subkingdom] + Tracheophyta [phylum] + Magnoliopsida [class] + Laurales Juss. ex Bercht. & J. Presl [order] + Atherospermataceae R. Br. 
[family] diff --git a/webservice/src/test/resources/txtree/doryphora/readme.md b/webservice/src/test/resources/txtree/doryphora/readme.md new file mode 100644 index 000000000..85ebad1f1 --- /dev/null +++ b/webservice/src/test/resources/txtree/doryphora/readme.md @@ -0,0 +1,3 @@ +Avoid adding canonical genus names to the root of the tree, e.g. Biota, which has no taxgroup context. +Algae Doryphora from Bacillariophyceae, Chromista cannot be mapped to the project (we intentionally omitted the Chromista kingdom in this test) +https://github.com/CatalogueOfLife/backend/issues/1368 diff --git a/webservice/src/test/resources/txtree/doryphora/wcvp.txtree b/webservice/src/test/resources/txtree/doryphora/wcvp.txtree new file mode 100644 index 000000000..81775ab4e --- /dev/null +++ b/webservice/src/test/resources/txtree/doryphora/wcvp.txtree @@ -0,0 +1,3 @@ +Atherospermataceae [family] + Doryphora Endl. [genus] + =Doratophora Lem. [genus] diff --git a/webservice/src/test/resources/txtree/doryphora/worms.txtree b/webservice/src/test/resources/txtree/doryphora/worms.txtree new file mode 100644 index 000000000..8ffd161b6 --- /dev/null +++ b/webservice/src/test/resources/txtree/doryphora/worms.txtree @@ -0,0 +1,6 @@ +Biota [unranked] + Chromista [kingdom] + Heterokontophyta [phylum] + Bacillariophytina [subphylum] + Bacillariophyceae [class] + Doryphora [genus] \ No newline at end of file diff --git a/webservice/src/test/resources/txtree/doryphora/zoobank.txtree b/webservice/src/test/resources/txtree/doryphora/zoobank.txtree new file mode 100644 index 000000000..232921977 --- /dev/null +++ b/webservice/src/test/resources/txtree/doryphora/zoobank.txtree @@ -0,0 +1 @@ +Doryphora Illiger, 1807 sensu Illiger 1807 [genus]