Skip to content

Commit

Permalink
Prevent highly ambiguous genera without well-defined parents from being mer…
Browse files Browse the repository at this point in the history
…ged, see #1368
  • Loading branch information
mdoering committed Jan 13, 2025
1 parent 1cb23f2 commit 15cd240
Show file tree
Hide file tree
Showing 14 changed files with 137 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

public class TaxGroupAnalyzer {
private static final Logger LOG = LoggerFactory.getLogger(TaxGroupAnalyzer.class);
final TaxGroupParser parser = TaxGroupParser.PARSER;
private static final TaxGroupParser parser = TaxGroupParser.PARSER;
private static final Pattern YEAR_PATTERN = Pattern.compile("([12]\\d{3})");
private static final Pattern BAS_COMB_PATTERN = Pattern.compile("\\(\\s*[A-Z].+\\)[^()]*[A-Z]");
// map of suffix to groups, sorted by suffix length
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ public class TreeMergeHandler extends TreeBaseHandler {
private static final Set<Rank> LOW_RANKS = Set.of(Rank.FAMILY, Rank.SUBFAMILY, Rank.TRIBE, Rank.GENUS);
private final MatchedParentStack parents;
private final UsageMatcherGlobal matcher;
private final TaxGroupAnalyzer groupAnalyzer;
private final UsageCache uCache;
private final CacheLoader loader;
private int counter = 0; // all source usages
Expand All @@ -61,6 +62,7 @@ public class TreeMergeHandler extends TreeBaseHandler {
this.vKey = DSID.root(sourceDatasetKey);
this.matcher = matcher;
uCache = matcher.getUCache();
groupAnalyzer = new TaxGroupAnalyzer();

// figure out the lowest insertion point in the project/release
// a) a target is given
Expand Down Expand Up @@ -320,14 +322,33 @@ public void acceptThrowsNoCatch(NameUsageBase nu) throws Exception {

} else {
// *** CREATE ***
if ( nu.isTaxon() && syncTaxa || nu.isSynonym() && syncSynonyms) {
if ( nu.isTaxon() && syncTaxa && !isAmbiguousGenus(nu) || nu.isSynonym() && syncSynonyms) {
sn = create(nu, parent);
}
}

processEnd(sn, mod);
}

/**
 * Decides whether a genus usage sits in a classification too vague to be created safely.
 *
 * A usage is reported as ambiguous when its rank is GENUS and the taxonomic group analysed from
 * the usage together with its matched parents is either missing or just Eukaryotes, e.g. when it
 * hangs directly under no parents, Biota or similar parents without any taxonomic group.
 * Such genera tend to match to pretty much anything alike later on and pick up (wrong) authorships.
 *
 * NOTE(review): the intent is to catch unqualified genera without authorship, but no explicit
 * authorship check is visible in this method; confirm whether the analyzer or caller covers that.
 *
 * @param nu the source usage about to be created
 * @return true if the usage is a genus with no or only a Eukaryotes taxonomic group, false otherwise
 */
private boolean isAmbiguousGenus(NameUsageBase nu) {
  // only genera are considered; every other rank is never ambiguous here
  if (nu.getRank() != Rank.GENUS) {
    return false;
  }
  var matchedParents = parents.matchedParentsOnlySN();
  var taxGroup = groupAnalyzer.analyze(nu.toSimpleNameLink(), matchedParents);
  // a more specific group than Eukaryotes gives enough context to keep the genus
  if (taxGroup != null && !taxGroup.equals(TaxGroup.Eukaryotes)) {
    return false;
  }
  LOG.info("Ignore canonical genus {} with vague parents: {}", nu.getLabel(), matchedParents);
  return true;
}

public void acceptName(Name n) throws InterruptedException {
try {
acceptNameThrowsNoCatch(n);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,13 @@ public LinkedList<MatchedUsage> matchedParentsOnly(String... excludeIDs) {
return parents.stream().filter(u -> u.match != null && !exclusion.contains(u.match.getId())).collect(Collectors.toCollection(LinkedList::new));
}

/**
 * Lists the usages of all parents that have a match, in the order they are held in {@code parents}.
 *
 * @return a new list with the {@code usage} of every parent whose {@code match} is not null
 */
public List<SimpleNameCached> matchedParentsOnlySN() {
  // keep only the parents that were matched, then project them onto their usage
  var matched = parents.stream().filter(p -> p.match != null);
  return matched.map(p -> p.usage).collect(Collectors.toList());
}

/**
 * Returns the entry directly before the last one in {@code parents}.
 *
 * @return the second to last entry, or null if {@code parents} holds fewer than two entries
 */
public MatchedUsage secondLast() {
  // with a single entry size()-2 would be -1 and get() would throw, so treat it like the empty case
  return parents.size() < 2 ? null : parents.get(parents.size() - 2);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ public class SectorSyncMergeIT extends SectorSyncTestBase {
@Parameterized.Parameters
public static Collection<Object[]> data() {
return Arrays.asList(new Object[][] {
{"doryphora", List.of("worms", "wcvp", "3i", "coleo", "pbdb", "zoobank")},
{"bolyeriidae", List.of("itis", "reptiledb", "uksi", "pbdb")},
{"myosotis", List.of("taxref", "uksi", "pbdb", "bavaria")},
{"tetralobus", List.of("wfo", "bouchard", "plazi")},
Expand Down
8 changes: 8 additions & 0 deletions webservice/src/test/resources/txtree/doryphora/3i.txtree
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Auchenorrhyncha Duméril, 1805 [suborder]
Fulgoromorpha Evans, 1946 [infraorder]
Fulgoroidea Latreille, 1807 [superfamily]
Dictyopharidae Spinola, 1839 [family]
Dictyopharinae Spinola, 1839 [subfamily]
Dictyopharini Spinola, 1839 [tribe]
Doryphorina Melichar, 1912 [genus]
=Doryphora Melichar, 1912 [genus]
8 changes: 8 additions & 0 deletions webservice/src/test/resources/txtree/doryphora/coleo.txtree
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Coleoptera [order]
Polyphaga [suborder]
Cucujiformia [series]
Chrysomeloidea Latreille, 1802 [superfamily]
Chrysomelidae Latreille, 1802 [family]
Chrysomelinae Latreille, 1802 [subfamily]
Chrysomelini Latreille, 1802 [tribe]
Doryphora Illiger, 1807 [genus]
6 changes: 6 additions & 0 deletions webservice/src/test/resources/txtree/doryphora/coleo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
ranks:
- superfamily
- family
- subfamily
- genus
- species
34 changes: 34 additions & 0 deletions webservice/src/test/resources/txtree/doryphora/expected.txtree
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
Biota [unranked]
Animalia [kingdom]
Arthropoda [phylum]
Insecta [class]
Coleoptera [order]
Chrysomeloidea Latreille, 1802 [superfamily]
Chrysomelidae Latreille, 1802 [family]
Chrysomelinae Latreille, 1802 [subfamily]
Doryphora Illiger, 1807 [genus]
Hemiptera Linnaeus, 1758 [order]
Auchenorrhyncha Duméril, 1805 [suborder]
Fulgoromorpha Evans, 1946 [infraorder]
Fulgoroidea [superfamily]
Dictyopharidae Spinola, 1839 [family]
Doryphorina Melichar, 1912 [genus]
=Doryphora Melichar, 1912 [genus]
Lepidoptera [order]
Gelechioidea [superfamily]
Gelechiidae [family]
Anomologinae [subfamily]
Xystophora Wocke, 1876 [genus]
=Doryphora Heinemann, 1870 [genus]
Zygaenoidea Latreille, 1809 [superfamily]
Limacodidae [family]
Doratifera (&Westwood),1841 Duncan [genus]
=Doryphora (&Westwood),1841 Duncan [genus]
Plantae [kingdom]
Pteridobiotina Britton & Brown [subkingdom]
Tracheophyta [phylum]
Magnoliopsida [class]
Laurales Juss. ex Bercht. & J.Presl [order]
Atherospermataceae R.Br. [family]
Doryphora Endl. [genus]
=Doratophora Lem. [genus]
10 changes: 10 additions & 0 deletions webservice/src/test/resources/txtree/doryphora/pbdb.txtree
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Life [unranked]
Eukaryota (Chatton, 1925) [unranked]
Plantae (Haeckel, 1866) [kingdom]
Spermatophyta [phylum]
Angiospermae [class]
Mesangiosperms The Angiosperm Phylogeny Group, 2016 [unranked]
Magnoliid Jud, 2011 [unranked]
Laurales Jussieu, 1820 [order]
Atherospermataceae Brown, 1814 [family]
Doryphora Endlicher, 1837 [genus]
27 changes: 27 additions & 0 deletions webservice/src/test/resources/txtree/doryphora/project.txtree
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
Biota [unranked]
Animalia [kingdom]
Arthropoda [phylum]
Insecta [class]
Coleoptera [order]
Chrysomeloidea Latreille, 1802 [superfamily]
Hemiptera Linnaeus, 1758 [order]
Auchenorrhyncha Duméril, 1805 [suborder]
Fulgoromorpha Evans, 1946 [infraorder]
Fulgoroidea [superfamily]
Dictyopharidae Spinola, 1839 [family]
Lepidoptera [order]
Gelechioidea [superfamily]
Gelechiidae [family]
Anomologinae [subfamily]
Xystophora Wocke, [1876] [genus]
=Doryphora Heinemann, 1870 [genus]
Zygaenoidea Latreille, 1809 [superfamily]
Limacodidae [family]
Doratifera Duncan [& Westwood], 1841 [genus]
=Doryphora Duncan [& Westwood], 1841 [genus]
Plantae [kingdom]
Pteridobiotina Britton & Brown [subkingdom]
Tracheophyta [phylum]
Magnoliopsida [class]
Laurales Juss. ex Bercht. & J. Presl [order]
Atherospermataceae R. Br. [family]
3 changes: 3 additions & 0 deletions webservice/src/test/resources/txtree/doryphora/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Avoid adding canonical genus names to the root of the tree, e.g. Biota, which provides no taxonomic group context.
The algal genus Doryphora from Bacillariophyceae (Chromista) cannot be mapped to the project, because we intentionally omitted the Chromista kingdom in this test.
https://github.com/CatalogueOfLife/backend/issues/1368
3 changes: 3 additions & 0 deletions webservice/src/test/resources/txtree/doryphora/wcvp.txtree
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Atherospermataceae [family]
Doryphora Endl. [genus]
=Doratophora Lem. [genus]
6 changes: 6 additions & 0 deletions webservice/src/test/resources/txtree/doryphora/worms.txtree
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Biota [unranked]
Chromista [kingdom]
Heterokontophyta [phylum]
Bacillariophytina [subphylum]
Bacillariophyceae [class]
Doryphora [genus]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Doryphora Illiger, 1807 sensu Illiger 1807 [genus]

0 comments on commit 15cd240

Please sign in to comment.