diff --git a/matching-ws/Dockerfile b/matching-ws/Dockerfile
index 4a0ebe218..224d901d8 100644
--- a/matching-ws/Dockerfile
+++ b/matching-ws/Dockerfile
@@ -27,14 +27,11 @@ RUN git clone https://github.com/CatalogueOfLife/backend.git
WORKDIR /app/backend
RUN git checkout $GIT_BRANCH
-# Build all the CLB modules
-RUN mvn clean install package -DskipTests
-
# Build the Maven project and create a exec file
WORKDIR /app/backend/matching-ws
-# Run tests - full backend tests require additional services (e.g. ES)
-RUN mvn clean install package
+# Build the matching-ws module, skipping tests (full backend tests require additional services, e.g. ES)
+RUN mvn clean install package -DskipTests -DskipITs
# Store git commit id and log
RUN curl -o /app/backend/git.json -H "Accept: application/vnd.github+json" "https://api.github.com/repos/catalogueoflife/backend/commits/$(git rev-parse HEAD)"
@@ -106,4 +103,4 @@ RUN chown -R $USER:$USER /opt/gbif/$APP_ARTIFACT
USER $USER
EXPOSE $SERVER_PORT
-CMD java $JVM_OPTIONS -jar app.jar --server.port=$SERVER_PORT --working.dir=/opt/gbif/$APP_ARTIFACT --mode=RUN --spring.cloud.bootstrap.location=/opt/gbif/$APP_ARTIFACT/bootstrap.yaml
\ No newline at end of file
+CMD java $JVM_OPTIONS -jar app.jar --server.port=$SERVER_PORT --working.dir=/opt/gbif/$APP_ARTIFACT/ --mode=RUN --spring.cloud.bootstrap.location=/opt/gbif/$APP_ARTIFACT/bootstrap.yaml
\ No newline at end of file
diff --git a/matching-ws/pom.xml b/matching-ws/pom.xml
index 19ca20d59..4da425756 100644
--- a/matching-ws/pom.xml
+++ b/matching-ws/pom.xml
@@ -15,6 +15,7 @@
11
9.10.0
2.7.18
+ 2021.0.9
1.18.22
2.43.0
1.2.13
@@ -284,11 +285,6 @@
spring-boot-starter-validation
${spring-boot.version}
-
- net.openhft
- chronicle-map
- 3.25ea6
-
org.springframework.boot
spring-boot-starter-web
@@ -356,14 +352,6 @@
logstash-logback-encoder
${logstash-logback.version}
-
-
-
-
-
-
-
-
org.springframework.boot
spring-boot-configuration-processor
diff --git a/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java b/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java
index e964e2e02..3f7820022 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java
@@ -87,7 +87,7 @@ public void run(ApplicationArguments args) {
}
private void initialiseWebapp() {
- Optional metadata = matchingService.getAPIMetadata(false);
+ Optional metadata = matchingService.getAPIMetadata(true);
if (metadata.isEmpty()) {
log.error("No main index found. Cannot start web services");
return;
@@ -136,7 +136,8 @@ private void runIndexingIfRequired(ApplicationArguments args) throws Exception {
indexingService.indexIdentifiers(id);
}
- log.info("Indexing completed");
+ matchingService.getAPIMetadata(true);
+ log.info("Indexing ready");
}
private ExecutionMode getMode(ApplicationArguments args) {
diff --git a/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java b/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java
index de51ebc67..c2c2df057 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java
@@ -15,6 +15,7 @@
import java.util.stream.Collectors;
import life.catalogue.matching.model.*;
import life.catalogue.matching.service.MatchingService;
+import life.catalogue.matching.util.IUCNUtils;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
@@ -112,9 +113,13 @@ public NameUsageMatch matchOldPaths(
HttpServletRequest response) {
return matchV2(
usageKey,
- taxonID,taxonConceptID,scientificNameID,
- scientificName2, scientificName,
- authorship, authorship2,
+ taxonID,
+ taxonConceptID,
+ scientificNameID,
+ scientificName2,
+ scientificName,
+ authorship,
+ authorship2,
genericName,
specificEpithet,
infraspecificEpithet,
@@ -279,6 +284,7 @@ public NameUsageMatch matchV2(
taxonID,
taxonConceptID,
scientificNameID,
+
scientificName,
scientificName2,
authorship,
@@ -288,6 +294,7 @@ public NameUsageMatch matchV2(
infraspecificEpithet,
rank,
rank2,
+
classification,
exclude,
strict,
@@ -428,6 +435,7 @@ public Object matchFlatV1(
taxonID,
taxonConceptID,
scientificNameID,
+
scientificName,
scientificName2,
authorship,
@@ -437,6 +445,7 @@ public Object matchFlatV1(
infraspecificEpithet,
rank,
rank2,
+
classification,
exclude != null ? exclude.stream().map(Object::toString).collect(Collectors.toSet()) : Set.of(),
strict,
@@ -598,6 +607,7 @@ public Object matchV1(
taxonID,
taxonConceptID,
scientificNameID,
+
scientificName,
scientificName2,
authorship,
@@ -607,6 +617,7 @@ public Object matchV1(
infraspecificEpithet,
rank,
rank2,
+
classification,
exclude != null ? exclude.stream().map(Object::toString).collect(Collectors.toSet()) : Set.of(),
strict,
@@ -671,7 +682,7 @@ public Map iucnRedListV1(@PathVariable(value = "usageKey", requi
return Map.of();
}
NameUsageMatch.Status status = statusList.get(0);
- String formatted = formatIucn(status.getStatus());
+ String formatted = IUCNUtils.formatIucn(status.getStatus());
if (formatted == null || formatted.isEmpty()) {
return Map.of();
}
@@ -679,7 +690,7 @@ public Map iucnRedListV1(@PathVariable(value = "usageKey", requi
String scientificName = match.getAcceptedUsage() != null ? match.getAcceptedUsage().getCanonicalName() : match.getUsage().getCanonicalName();
try {
- IUCN iucn = IUCN.valueOf(formatted); // throws IllegalArgumentException if not found
+ IUCNUtils.IUCN iucn = IUCNUtils.IUCN.valueOf(formatted); // throws IllegalArgumentException if not found
watch.stop();
log("v1/species/iucnRedListCategory", usageKey, watch);
return Map.of(
@@ -689,7 +700,7 @@ public Map iucnRedListV1(@PathVariable(value = "usageKey", requi
"taxonomicStatus", NameUsageMatchV1.TaxonomicStatusV1.convert(
match.getDiagnostics().getStatus()),
"iucnTaxonID", status.getSourceId(),
- "code", iucn.code
+ "code", iucn.getCode()
);
} catch (IllegalArgumentException e) {
log.error("IUCN category not found: {}", formatted, e);
@@ -751,37 +762,6 @@ private static void addIfNotNull(StringJoiner joiner, Object value) {
}
}
- String formatIucn(String original){
- if (original == null) {
- return null;
- }
- // Trim the string
- String trimmed = original.trim();
- // Convert to uppercase
- String uppercased = trimmed.toUpperCase();
- // Replace any whitespace with a single underscore
- return uppercased.replaceAll("\\s+", "_");
- }
-
- enum IUCN {
- EXTINCT("EX"),
- EXTINCT_IN_THE_WILD("EW"),
- CRITICALLY_ENDANGERED ("CR"),
- ENDANGERED ("EN"),
- VULNERABLE ("VU"),
- NEAR_THREATENED ("NT"),
- CONSERVATION_DEPENDENT ("CD"),
- LEAST_CONCERN ("LC"),
- DATA_DEFICIENT ("DD"),
- NOT_EVALUATED ("NE");
-
- private final String code;
-
- IUCN(String code) {
- this.code = code;
- }
- }
-
@Data
@Builder
@NoArgsConstructor
diff --git a/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java b/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java
index b75dfe85a..23f46f585 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java
@@ -2,8 +2,10 @@
import static life.catalogue.matching.util.IndexConstants.DATASETS_JSON;
import static life.catalogue.matching.util.IndexConstants.*;
+import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.DeserializationFeature;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
@@ -11,6 +13,7 @@
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.time.Instant;
import java.time.ZoneId;
@@ -21,20 +24,23 @@
import life.catalogue.api.vocab.MatchType;
import life.catalogue.api.vocab.TaxonomicStatus;
import life.catalogue.matching.model.*;
+import life.catalogue.matching.util.IUCNUtils;
import life.catalogue.matching.util.LuceneUtils;
import life.catalogue.matching.Main;
import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.document.Document;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
+import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.BytesRef;
+
import org.gbif.nameparser.api.NomCode;
import org.gbif.nameparser.api.Rank;
import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
@@ -57,10 +63,12 @@ public class DatasetIndex {
protected static final ScientificNameAnalyzer scientificNameAnalyzer = new ScientificNameAnalyzer();
+ protected static final ObjectMapper MAPPER = new ObjectMapper();
+
@Value("${index.path:/data/matching-ws/index}")
String indexPath;
- @Value("${working.path:/tmp/}")
+ @Value("${working.dir:/tmp/}")
String workingDir;
private boolean isInitialised = false;
@@ -77,16 +85,11 @@ public boolean getIsInitialised() {
.build())
.build();
- public boolean exists(String indexPath) {
- return new File(indexPath).exists()
- && new File(indexPath + "/" + MAIN_INDEX_DIR).exists()
- && Objects.requireNonNull(new File(indexPath + "/" + MAIN_INDEX_DIR).listFiles()).length > 0;
- }
-
/** Attempts to read the index from disk if it exists. */
@PostConstruct
void init() {
+ MAPPER.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final String mainIndexPath = getMainIndexPath();
final Map prefixMapping = loadPrefixMapping();
@@ -251,6 +254,31 @@ public APIMetadata getAPIMetadata(){
return metadata;
}
+ public static List distinctValuesForField(String field, String indexPath) throws Exception {
+
+ List distinctValues = new ArrayList<>();
+ FSDirectory directory = FSDirectory.open(Paths.get(indexPath));
+
+ try (DirectoryReader directoryReader = DirectoryReader.open(directory)) {
+
+ // Get the field terms
+ for (LeafReaderContext leafContext : directoryReader.leaves()) {
+ LeafReader leafReader = leafContext.reader();
+ Terms terms = leafReader.terms(field);
+
+ if (terms != null) {
+ TermsEnum termsEnum = terms.iterator();
+ BytesRef byteRef;
+ while ((byteRef = termsEnum.next()) != null) {
+ String termValue = byteRef.utf8ToString();
+ distinctValues.add(termValue);
+ }
+ }
+ }
+ }
+ return distinctValues;
+ }
+
/**
* Returns the metadata of the index. This includes the number of taxa, the size on disk, the
* dataset title and key, and the build information.
@@ -290,16 +318,16 @@ private IndexMetadata getIndexMetadata(String indexPath, IndexSearcher searcher,
try {
Map rankCounts = new LinkedHashMap<>();
- rankCounts.put(Rank.KINGDOM.name(), getCountForRank(searcher, Rank.KINGDOM));
- rankCounts.put(Rank.PHYLUM.name(), getCountForRank(searcher, Rank.PHYLUM));
- rankCounts.put(Rank.CLASS.name(), getCountForRank(searcher, Rank.CLASS));
- rankCounts.put(Rank.ORDER.name(), getCountForRank(searcher, Rank.ORDER));
- rankCounts.put(Rank.FAMILY.name(), getCountForRank(searcher, Rank.FAMILY));
- rankCounts.put(Rank.GENUS.name(), getCountForRank(searcher, Rank.GENUS));
- rankCounts.put(Rank.SPECIES.name(), getCountForRank(searcher, Rank.SPECIES));
- rankCounts.put(Rank.SUBSPECIES.name(), getCountForRank(searcher, Rank.SUBSPECIES));
+ distinctValuesForField(FIELD_RANK, indexPath).stream().sorted( (a, b) -> Rank.valueOf(a).ordinal() - Rank.valueOf(b).ordinal()
+ ).forEach(rank -> {
+ try {
+ rankCounts.put(rank, getCountForRank(searcher, rank));
+ } catch (IOException e) {
+ log.error("Cannot read index information", e);
+ }
+ });
metadata.setNameUsageByRankCount(rankCounts);
- } catch (IOException e) {
+ } catch (Exception e) {
log.error("Cannot read index information", e);
}
return metadata;
@@ -311,7 +339,7 @@ private IndexMetadata getIndexMetadata(String indexPath, IndexSearcher searcher,
*/
private Optional getGitInfo() {
ObjectMapper mapper = new ObjectMapper();
- final String filePath = workingDir + "/" + GIT_JSON;
+ final String filePath = workingDir + GIT_JSON;
try {
if (new File(filePath).exists()) {
// Read JSON file and parse to JsonNode
@@ -358,7 +386,7 @@ public Map getDatasetInfo(String indexPath) {
try {
if (new File(filePath).exists()){
- log.info("Loading dataset info from {}", filePath);
+ log.debug("Loading dataset info from {}", filePath);
// Read JSON file and parse to JsonNode
JsonNode rootNode = mapper.readTree(new File(filePath));
// Navigate to the author node
@@ -383,8 +411,8 @@ public Map getDatasetInfo(String indexPath) {
return Map.of();
}
- private long getCountForRank(IndexSearcher searcher, Rank rank) throws IOException {
- Query query = new TermQuery(new Term(FIELD_RANK, rank.name()));
+ private long getCountForRank(IndexSearcher searcher, String rank) throws IOException {
+ Query query = new TermQuery(new Term(FIELD_RANK, rank));
return searcher.search(query, new TotalHitCountCollectorManager());
}
@@ -416,22 +444,6 @@ public NameUsageMatch matchByUsageKey(String usageKey) {
return matchByKey(usageKey, this::getByUsageKey);
}
- private static String escapeQueryChars(String s) {
- StringBuilder sb = new StringBuilder();
- for (int i = 0; i < s.length(); i++) {
- char c = s.charAt(i);
- // These are the special characters that need to be escaped
- if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' ||
- c == ':' || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' ||
- c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' ||
- c == '/' || Character.isWhitespace(c)) {
- sb.append('\\');
- }
- sb.append(c);
- }
- return sb.toString();
- }
-
private Optional getByUsageKey(String usageKey) {
Query query = new TermQuery(new Term(FIELD_ID, usageKey));
try {
@@ -541,12 +553,36 @@ public List lookupIdentifier(@NotNull String identifier) {
* @return List of ExternalID
*/
public List lookupIdentifier(@NotNull String datasetID, @NotNull String identifier) {
+ return lookupIdentifier(datasetID, identifier, identifierSearchers);
+ }
+
+ /**
+ * Matches an external ID. Intended for debug purposes only, to quickly
+ * check if ids are present and joined to main index or not.
+ *
+ * @param datasetID the datasetKey to match
+ * @param identifier the identifier to match
+ * @return List of ExternalID
+ */
+ public List lookupAncillary(@NotNull String datasetID, @NotNull String identifier) {
+ return lookupIdentifier(datasetID, identifier, ancillarySearchers);
+ }
+
+ /**
+ * Matches an external ID. Intended for debug purposes only, to quickly
+ * check if ids are present and joined to main index or not.
+ *
+ * @param datasetID the datasetKey to match
+ * @param identifier the identifier to match
+ * @return List of ExternalID
+ */
+ public List lookupIdentifier(@NotNull String datasetID, @NotNull String identifier, Map searchers) {
List results = new ArrayList<>();
try {
// if join indexes are present, add them to the match
- if (identifierSearchers != null && !identifierSearchers.isEmpty()) {
- for (Dataset dataset : identifierSearchers.keySet()) {
+ if (searchers != null && !searchers.isEmpty()) {
+ for (Dataset dataset : searchers.keySet()) {
// use the prefix mapping
if (dataset.getKey().toString().equals(datasetID) || (dataset.getGbifKey() != null && dataset.getGbifKey().equals(datasetID))) {
@@ -557,12 +593,12 @@ public List lookupIdentifier(@NotNull String datasetID, @NotNull Str
}
// find the index and search it
- IndexSearcher identifierSearcher = identifierSearchers.get(dataset);
+ IndexSearcher searcher = searchers.get(dataset);
Query identifierQuery = new TermQuery(new Term(FIELD_ID, identifier));
- TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3);
+ TopDocs identifierDocs = searcher.search(identifierQuery, 3);
if (identifierDocs.totalHits.value > 0) {
- Document identifierDoc = identifierSearcher.storedFields().
+ Document identifierDoc = searcher.storedFields().
document(identifierDocs.scoreDocs[0].doc);
results.add(toExternalID(identifierDoc, dataset));
@@ -598,43 +634,19 @@ private static ExternalID toExternalID(Document doc, Dataset dataset) {
* @param ignoredIssue the issue to add if the identifier is ignored
* @return NameUsageMatch
*/
- public NameUsageMatch matchByExternalKey(String key, Issue notFoundIssue, Issue ignoredIssue) {
-
- NameUsageMatch usageMatch = matchByUsageKey(key);
- if (usageMatch.getDiagnostics().getMatchType() != MatchType.NONE) {
- return usageMatch;
- }
+ public NameUsageMatch matchByExternalKey(String suppliedKey, Issue notFoundIssue, Issue ignoredIssue) {
// if join indexes are present, add them to the match
if (identifierSearchers != null && !identifierSearchers.isEmpty()){
try {
for (Dataset dataset: identifierSearchers.keySet()){
- // use the prefix mapping
- if (dataset.getPrefixMapping() != null && !dataset.getPrefixMapping().isEmpty()) {
- for (String prefix : dataset.getPrefixMapping()) {
- if (key.startsWith(prefix)) {
- key = key.replace(prefix, "");
- }
- }
- }
-
- if (
- (dataset.getPrefix() == null || !key.startsWith(dataset.getPrefix()))
- && !dataset.getPrefix().equals("*")) {
- // only search indexes with matching prefixes
- continue;
- }
-
- log.debug("Searching for identifier {} in dataset {}", key, dataset.getKey());
-
- if (dataset.getRemovePrefixForMatching()){
- key = key.replace(dataset.getPrefix(), "");
- }
+ Optional key = extractKeyForSearch(suppliedKey, dataset);
+ if (key.isEmpty()) continue;
// find the index and search it
IndexSearcher identifierSearcher = identifierSearchers.get(dataset);
- Query identifierQuery = new TermQuery(new Term(FIELD_ID, key));
+ Query identifierQuery = new TermQuery(new Term(FIELD_ID, key.get()));
TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3);
if (identifierDocs.totalHits.value > 0) {
@@ -667,7 +679,7 @@ public NameUsageMatch matchByExternalKey(String key, Issue notFoundIssue, Issue
}
}
} catch (IOException e) {
- log.error("Problem querying external ID indexes with {}", key, e);
+ log.error("Problem querying external ID indexes with {}", suppliedKey, e);
}
}
@@ -675,6 +687,38 @@ public NameUsageMatch matchByExternalKey(String key, Issue notFoundIssue, Issue
return NO_MATCH;
}
+ public static Optional extractKeyForSearch(String key, Dataset dataset) {
+ if (!hasRecognisedPrefix(key, dataset)) {
+ // only search indexes with matching prefixes
+ return Optional.empty();
+ }
+
+ // use the prefix mapping
+ if (dataset.getPrefixMapping() != null && !dataset.getPrefixMapping().isEmpty()) {
+ for (String prefix : dataset.getPrefixMapping()) {
+ if (key.startsWith(prefix)) {
+ key = key.replace(prefix, dataset.getPrefix());
+ }
+ }
+ }
+
+ // if configured, remove the prefix
+ if (dataset.getRemovePrefixForMatching() != null && dataset.getRemovePrefixForMatching()){
+ key = key.replace(dataset.getPrefix(), "");
+ }
+ log.debug("Searching for identifier {} in dataset {}", key, dataset.getKey());
+ return Optional.of(key);
+ }
+
+ private static boolean hasRecognisedPrefix(String key, Dataset dataset) {
+ if (dataset.getPrefix() == null){
+ return false;
+ }
+ if (key.startsWith(dataset.getPrefix()))
+ return true;
+ return dataset.getPrefixMapping().stream().anyMatch(key::startsWith);
+ }
+
private static NameUsageMatch noMatch(Issue issue, String note) {
return NameUsageMatch.builder()
.diagnostics(
@@ -719,7 +763,6 @@ private List loadHigherTaxa(String parentID) {
higherTaxon.setRank(Rank.valueOf(doc.get(FIELD_RANK)));
higherTaxon.setParentID(doc.get(FIELD_PARENT_ID));
higherTaxa.add(0, higherTaxon);
-// higherTaxonomyCache.put(currentParentID, higherTaxon);
// get next parent
currentParentID = doc.get(FIELD_PARENT_ID);
} else {
@@ -743,16 +786,7 @@ private NameUsageMatch fromDoc(Document doc) {
NameUsageMatch u = NameUsageMatch.builder().build();
u.setDiagnostics(NameUsageMatch.Diagnostics.builder().build());
- // set the usage
- u.setUsage(
- NameUsageMatch.RankedName.builder()
- .key(doc.get(FIELD_ID))
- .name(doc.get(FIELD_SCIENTIFIC_NAME))
- .rank(Rank.valueOf(doc.get(FIELD_RANK)))
- .canonicalName(doc.get(FIELD_CANONICAL_NAME))
- .code(getCode(doc))
- .build()
- );
+ u.setUsage(constructUsage(doc));
String acceptedParentID = null;
@@ -761,15 +795,7 @@ private NameUsageMatch fromDoc(Document doc) {
Optional accDocOpt = getByUsageKey(doc.get(FIELD_ACCEPTED_ID));
if (accDocOpt.isPresent()) {
Document accDoc = accDocOpt.get();
- u.setAcceptedUsage(
- NameUsageMatch.RankedName.builder()
- .key(accDoc.get(FIELD_ID))
- .name(accDoc.get(FIELD_SCIENTIFIC_NAME))
- .rank(Rank.valueOf(accDoc.get(FIELD_RANK)))
- .canonicalName(accDoc.get(FIELD_CANONICAL_NAME))
- .code(getCode(accDoc))
- .build()
- );
+ u.setAcceptedUsage(constructUsage(accDoc));
acceptedParentID = accDoc.get(FIELD_PARENT_ID);
}
}
@@ -798,7 +824,7 @@ private NameUsageMatch fromDoc(Document doc) {
classification.add(
NameUsageMatch.RankedName.builder()
.key(doc.get(FIELD_ID))
- .name( doc.get(FIELD_CANONICAL_NAME))
+ .name(doc.get(FIELD_CANONICAL_NAME))
.rank(Rank.valueOf(doc.get(FIELD_RANK)))
.canonicalName(doc.get(FIELD_CANONICAL_NAME))
.build()
@@ -809,19 +835,26 @@ private NameUsageMatch fromDoc(Document doc) {
// if ancillary join indexes are present, add them to the match
for (Dataset dataset: ancillarySearchers.keySet()){
IndexSearcher ancillarySearcher = ancillarySearchers.get(dataset);
- Query query = new TermQuery(new Term(FIELD_JOIN_ID, doc.get(FIELD_ID) ));
+ Query query = new TermQuery(
+ new Term(FIELD_JOIN_ID, doc.get(FIELD_ID))
+ );
try {
TopDocs docs = ancillarySearcher.search(query, 3);
if (docs.totalHits.value > 0) {
Document ancillaryDoc = ancillarySearcher.storedFields().document(docs.scoreDocs[0].doc);
- String status = ancillaryDoc.get(FIELD_CATEGORY);
NameUsageMatch.Status ancillaryStatus = new NameUsageMatch.Status();
- ancillaryStatus.setStatus(status);
- ancillaryStatus.setDatasetKey(dataset.getKey().toString());
- ancillaryStatus.setGbifKey(dataset.getGbifKey());
- ancillaryStatus.setDatasetAlias(dataset.getAlias());
- ancillaryStatus.setSourceId(ancillaryDoc.get(FIELD_ID));
- u.addAdditionalStatus(ancillaryStatus);
+ ancillaryStatus.setStatus(ancillaryDoc.get(FIELD_CATEGORY));
+ String formattedIUCN = IUCNUtils.formatIucn(ancillaryDoc.get(FIELD_CATEGORY));
+ if (formattedIUCN != null) {
+ IUCNUtils.IUCN iucn = IUCNUtils.IUCN.valueOf(formattedIUCN);
+ ancillaryStatus.setStatus(formattedIUCN);
+ ancillaryStatus.setStatusCode(iucn.getCode());
+ ancillaryStatus.setDatasetKey(dataset.getKey().toString());
+ ancillaryStatus.setGbifKey(dataset.getGbifKey());
+ ancillaryStatus.setDatasetAlias(dataset.getAlias());
+ ancillaryStatus.setSourceId(ancillaryDoc.get(FIELD_ID));
+ u.addAdditionalStatus(ancillaryStatus);
+ }
}
} catch (IOException e) {
log.error("Cannot load usage {} from lucene index", doc.get(FIELD_ID), e);
@@ -834,6 +867,78 @@ private NameUsageMatch fromDoc(Document doc) {
return u;
}
+ private static NameUsageMatch.Usage constructUsage(Document doc) {
+ StoredParsedName pn = null;
+ String parsedNameJson = doc.get(FIELD_PARSED_NAME_JSON);
+ if (parsedNameJson != null) {
+ try {
+ pn = MAPPER.readValue(parsedNameJson, StoredParsedName.class);
+ } catch (Exception e) {
+ log.error("Cannot parse parsed name json", e);
+ }
+ }
+
+ // set the usage
+ NameUsageMatch.Usage.UsageBuilder b = NameUsageMatch.Usage.builder()
+ .key(doc.get(FIELD_ID))
+ .name(doc.get(FIELD_SCIENTIFIC_NAME))
+ .authorship(doc.get(FIELD_AUTHORSHIP))
+ .rank(Rank.valueOf(doc.get(FIELD_RANK)))
+ .canonicalName(doc.get(FIELD_CANONICAL_NAME))
+ .code(getCode(doc));
+
+ if (pn != null) {
+ b.genus(pn.getGenus())
+ .infragenericEpithet(pn.getInfragenericEpithet())
+ .specificEpithet(pn.getSpecificEpithet())
+ .infraspecificEpithet(pn.getInfraspecificEpithet())
+ .cultivarEpithet(pn.getCultivarEpithet())
+ .phrase(pn.getPhrase())
+ .voucher(pn.getVoucher())
+ .nominatingParty(pn.getNominatingParty())
+ .candidatus(pn.isCandidatus())
+ .notho(pn.getNotho())
+ .originalSpelling(pn.getOriginalSpelling())
+ .epithetQualifier(pn.getEpithetQualifier())
+ .type(pn.getType())
+ .extinct(pn.isExtinct())
+
+ .sanctioningAuthor(pn.getSanctioningAuthor())
+ .taxonomicNote(pn.getTaxonomicNote())
+ .nomenclaturalNote(pn.getNomenclaturalNote())
+ .publishedIn(pn.getPublishedIn())
+ .unparsed(pn.getUnparsed())
+ .doubtful(pn.isDoubtful())
+ .manuscript(pn.isManuscript())
+ .state(pn.getState())
+ .warnings(pn.getWarnings());
+
+ if (pn.getCombinationAuthorship() != null
+ && pn.getCombinationAuthorship().getAuthors() != null
+ && !pn.getCombinationAuthorship().getAuthors().isEmpty()
+ ) {
+ b.combinationAuthorship(
+ NameUsageMatch.Authorship.builder()
+ .authors(pn.getCombinationAuthorship().getAuthors())
+ .year(pn.getCombinationAuthorship().getYear())
+ .build());
+ }
+
+ if (pn.getBasionymAuthorship() != null
+ && pn.getBasionymAuthorship().getAuthors() != null
+ && !pn.getBasionymAuthorship().getAuthors().isEmpty()
+ ) {
+ b.basionymAuthorship(
+ NameUsageMatch.Authorship.builder()
+ .authors(pn.getBasionymAuthorship().getAuthors())
+ .year(pn.getBasionymAuthorship().getYear())
+ .build());
+ }
+ }
+
+ return b.build();
+ }
+
private static NomCode getCode(Document doc) {
if (doc.get(FIELD_NOMENCLATURAL_CODE) == null) {
return null;
@@ -883,6 +988,7 @@ public List matchByName(String name, boolean fuzzySearch, int ma
try {
return search(q, name, fuzzySearch, maxMatches);
} catch (RuntimeException e) {
+ log.error("Lucene search error", e);
// for example TooComplexToDeterminizeException, see
// http://dev.gbif.org/issues/browse/POR-2725
log.warn("Lucene failed to fuzzy search for name [{}]. Try a straight match instead", name);
diff --git a/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java b/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java
index e4a36cd12..0b54c2a4b 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java
@@ -12,7 +12,6 @@
import life.catalogue.matching.util.CleanupUtils;
import org.apache.commons.lang3.StringUtils;
-import org.gbif.nameparser.api.Authorship;
import org.gbif.nameparser.api.NamePart;
import org.gbif.nameparser.api.ParsedName;
import org.gbif.nameparser.api.Rank;
@@ -214,28 +213,19 @@ private static boolean exists(String x) {
@VisibleForTesting
public static String expandAbbreviatedGenus(String scientificName, String genus) {
- if (exists(scientificName) && exists(genus)) {
+ if (exists(scientificName) && exists(genus) && !scientificName.equalsIgnoreCase(genus)) {
String[] parts = scientificName.split(" +", 2);
- if (parts[0].length() <= 2) {
- String genusCorrect = StringUtils.capitalize(genus.trim().toLowerCase());
+ String genusCorrect = StringUtils.capitalize(genus.trim().toLowerCase());
+ if (parts[0].length() <= 2 && genusCorrect.length() > 2 && (
+ parts[0].equals("?") // is the genus missing altogether?
+ || parts[0].length() == 2 && parts[0].charAt(1) == '.' && parts[0].charAt(0) == genusCorrect.charAt(0)
+ || parts[0].length() == 1 && parts[0].charAt(0) == genusCorrect.charAt(0)
+ )) {
StringBuilder sb = new StringBuilder();
- // is the genus missing alltogether?
- if (parts[0].equals("?")) {
- sb.append(genusCorrect);
- } else if (genusCorrect.length() > 1) {
- // test if name has an abbreviated genus
- if (parts[0].length() == 2
- && parts[0].charAt(1) == '.'
- && parts[0].charAt(0) == genusCorrect.charAt(0)
- || parts[0].length() == 1 && parts[0].charAt(0) == genusCorrect.charAt(0)) {
- sb.append(genusCorrect);
- }
- } else {
- sb.append(parts[0]);
- }
+ sb.append(genus);
if (parts.length > 1) {
- sb.append(" ");
- sb.append(parts[1]);
+ sb.append(" ")
+ .append(parts[1]);
}
return sb.toString();
}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java b/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java
index b632ecf29..63fbccd78 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java
@@ -4,13 +4,10 @@
import com.fasterxml.jackson.annotation.JsonInclude;
import io.swagger.v3.oas.annotations.media.Schema;
-import lombok.Builder;
import lombok.Data;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
/**
* Metadata about this API and about the indexes behind the API.
diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java b/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java
index 0ccb698a8..24c81c74a 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java
@@ -3,7 +3,10 @@
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
import lombok.Data;
+import lombok.NoArgsConstructor;
import java.util.List;
@@ -13,6 +16,9 @@
@Data
@JsonInclude(JsonInclude.Include.NON_NULL)
@JsonIgnoreProperties(ignoreUnknown = true)
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
public class Dataset {
Integer key;
String gbifKey;
diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java
index d15b8149a..88760abbc 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java
@@ -5,9 +5,8 @@
import com.fasterxml.jackson.annotation.JsonInclude;
import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Optional;
+import java.util.*;
+
import io.swagger.v3.oas.annotations.media.Schema;
import life.catalogue.api.vocab.MatchType;
@@ -31,9 +30,9 @@ public class NameUsageMatch implements LinneanClassification {
@Schema(description = "If the matched usage is a synonym")
boolean synonym;
@Schema(description = "The matched name usage")
- RankedName usage;
+ Usage usage;
@Schema(description = "The accepted name usage for the match. This will only be populated when we've matched a synonym name usage.")
- RankedName acceptedUsage;
+ Usage acceptedUsage;
@Schema(description = "The classification of the accepted name usage.")
List classification;
@Schema(description = "Diagnostics for a name match including the type of match and confidence level", implementation = Diagnostics.class)
@@ -311,6 +310,82 @@ public static class Diagnostics {
List alternatives;
}
+ /**
+ * A name usage with an identifier, a taxonomic rank and parsed name components.
+ */
+ @Schema(description = "A name usage with an identifier, a taxonomic rank and parsed name components", title = "Usage", type = "object")
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ @Data
+ @AllArgsConstructor
+ @NoArgsConstructor
+ @ToString
+ @Builder
+ public static class Usage implements Serializable {
+
+ private static final long serialVersionUID = 3423423423423L;
+
+ @Schema(description = "The identifier for the name usage")
+ private String key;
+ @Schema(description = "The name usage")
+ private String name;
+ private String canonicalName;
+ private String authorship;
+ @JsonIgnore private String parentID;
+ @Schema(description = "The taxonomic rank for the name usage")
+ private Rank rank;
+ @Schema(description = "The nomenclatural code for the name usage")
+ private NomCode code;
+ private String uninomial;
+ private String genus;
+ private String infragenericEpithet;
+ private String specificEpithet;
+ private String infraspecificEpithet;
+ private String cultivarEpithet;
+ private String phrase;
+ private String voucher;
+ private String nominatingParty;
+ private boolean candidatus;
+ private String notho;
+ private Boolean originalSpelling;
+ private Map epithetQualifier;
+ private String type;
+ protected boolean extinct;
+ private Authorship combinationAuthorship;
+ private Authorship basionymAuthorship;
+ private String sanctioningAuthor;
+ private String taxonomicNote;
+ private String nomenclaturalNote;
+ private String publishedIn;
+ private String unparsed;
+ private boolean doubtful;
+ private boolean manuscript;
+ private String state;
+ private Set warnings;
+
+ //additional flags
+ private boolean isAbbreviated;
+ private boolean isAutonym;
+ private boolean isBinomial;
+ private boolean isTrinomial;
+ private boolean isIncomplete;
+ private boolean isIndetermined;
+ private boolean isPhraseName;
+ private String terminalEpithet;
+ }
+
+ @Schema(description = "A scientific name authorship for a name usage, split into components", title = "Authorship", type = "object")
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ @Data
+ @AllArgsConstructor
+ @NoArgsConstructor
+ @ToString
+ @Builder
+ public static class Authorship {
+ private List authors = new ArrayList();
+ private List exAuthors = new ArrayList();
+ private String year;
+ }
+
/**
* A name with an identifier and a taxonomic rank.
*/
@@ -343,7 +418,7 @@ public static class RankedName implements Serializable {
@Data
@JsonIgnoreProperties(ignoreUnknown = true)
@JsonInclude(JsonInclude.Include.NON_EMPTY)
- @Schema(description = "A status value derived from a dataset or external source. E.g. IUCN Red List status.",
+ @Schema(description = "A status value derived from a dataset or external source. E.g. IUCN Red List.",
title = "Status", type = "object")
public static class Status {
@Schema(description = "The dataset key for the dataset that the status is associated with")
@@ -354,6 +429,8 @@ public static class Status {
private String gbifKey;
@Schema(description = "The status value")
private String status;
+ @Schema(description = "The status code value")
+ private String statusCode;
@Schema(description = "The ID in the source dataset for this status. e.g. the IUCN ID for this taxon")
private String sourceId;
}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java
index 7a05095a9..b4a8c55e6 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java
@@ -31,15 +31,15 @@ public static NameUsageQuery create(
String taxonID,
String taxonConceptID,
String scientificNameID,
- String scientificName2,
String scientificName,
- String authorship2,
+ String scientificName2,
String authorship,
- String rank2,
- String rank,
+ String authorship2,
String genericName,
String specificEpithet,
String infraspecificEpithet,
+ String rank,
+ String rank2,
Classification classification,
Set exclude,
Boolean strict,
diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java b/matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java
new file mode 100644
index 000000000..ff2639fcc
--- /dev/null
+++ b/matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java
@@ -0,0 +1,64 @@
+package life.catalogue.matching.model;
+
+import lombok.*;
+
+import lombok.Data;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+@Data
+public class StoredParsedName {
+ private String rank;
+ private String code;
+ private String uninomial;
+ private String genus;
+ private String infragenericEpithet;
+ private String specificEpithet;
+ private String infraspecificEpithet;
+ private String cultivarEpithet;
+ private String phrase;
+ private String voucher;
+ private String nominatingParty;
+ private boolean candidatus;
+ private String notho;
+ private Boolean originalSpelling;
+ private Map epithetQualifier;
+ private String type;
+ protected boolean extinct;
+ private StoredAuthorship combinationAuthorship;
+ private StoredAuthorship basionymAuthorship;
+ private String sanctioningAuthor;
+ private String taxonomicNote;
+ private String nomenclaturalNote;
+ private String publishedIn;
+ private String unparsed;
+ private boolean doubtful;
+ private boolean manuscript;
+ private String state;
+ private Set warnings;
+
+ //additional flags
+ private boolean isAbbreviated;
+ private boolean isAutonym;
+ private boolean isBinomial;
+ private boolean isTrinomial;
+ private boolean isIncomplete;
+ private boolean isIndetermined;
+ private boolean isPhraseName;
+ private String terminalEpithet;
+
+ @Builder
+ @AllArgsConstructor
+ @NoArgsConstructor
+ @Data
+ public static class StoredAuthorship {
+ private List authors = new ArrayList();
+ private List exAuthors = new ArrayList();
+ private String year;
+ }
+}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java b/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java
index 6cdc0b8c8..39581338e 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java
@@ -14,10 +14,9 @@
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
-import java.util.function.Consumer;
-import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.CSVWriterBuilder;
@@ -29,23 +28,17 @@
import life.catalogue.api.model.ReleaseAttempt;
import life.catalogue.api.vocab.DatasetOrigin;
import life.catalogue.api.vocab.TaxonomicStatus;
-
import life.catalogue.matching.db.DatasetMapper;
-
import life.catalogue.matching.index.ScientificNameAnalyzer;
import life.catalogue.matching.model.Classification;
+import life.catalogue.matching.model.StoredParsedName;
import life.catalogue.matching.model.Dataset;
import life.catalogue.matching.model.NameUsage;
-
import life.catalogue.matching.model.NameUsageMatch;
-
import life.catalogue.matching.util.NameParsers;
-
import lombok.extern.slf4j.Slf4j;
-
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
-import org.apache.ibatis.cursor.Cursor;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.lucene.analysis.Analyzer;
@@ -111,6 +104,8 @@ public class IndexingService {
protected final MatchingService matchingService;
+ protected static final ObjectMapper MAPPER = new ObjectMapper();
+
private static final String REL_PATTERN_STR = "(\\d+)(?:LX?RC?|R(\\d+))";
private static final Pattern REL_PATTERN = Pattern.compile("^" + REL_PATTERN_STR + "$");
@@ -208,7 +203,7 @@ private Optional lookupDataset(SqlSessionFactory factory, Integer key)
}
/**
- * Writes an export of the name usages in a checklist bank dataset to a CSV file.
+ * Writes an export of the name usages in a checklist bank dataset to a CSV file.
*
* @param datasetKeyInput a dataset key or a release key
* @throws Exception if the dataset key is invalid or the export fails
@@ -664,6 +659,10 @@ public void run() {
nameUsageMatch.getAcceptedUsage() != null ? nameUsageMatch.getAcceptedUsage().getKey() :
nameUsageMatch.getUsage().getKey(), Field.Store.YES)
);
+
+ // reduce the size of these indexes by removing the parsed name
+ doc.removeField(FIELD_PARSED_NAME_JSON);
+
writer.addDocument(doc);
matchedCounter.incrementAndGet();
} else {
@@ -680,7 +679,7 @@ public void run() {
}
private boolean isAccepted(String status) {
- return status != null && !status.equals(TaxonomicStatus.ACCEPTED.name());
+ return status != null && status.equals(TaxonomicStatus.ACCEPTED.name());
}
}
@@ -801,12 +800,6 @@ private void indexFile(String exportPath, String indexPath) throws Exception {
mapper.writeValue(new File(indexPath + "/" + METADATA_JSON), metadata);
}
- class YourThreadFactory implements ThreadFactory {
- public Thread newThread(Runnable r) {
- return new Thread(r, "NameUsage-Indexing-taskThread");
- }
- }
-
static class IndexingTask implements Runnable {
private final IndexWriter writer;
private final List nameUsages;
@@ -846,6 +839,11 @@ private static void finishIndex(IndexWriter indexWriter) throws IOException {
return Paths.get(indexPath);
}
+ /**
+ * Generate the lucene document for a name usage
+ * @param nameUsage to convert to lucene document
+ * @return lucene document
+ */
protected static Document toDoc(NameUsage nameUsage) {
Document doc = new Document();
@@ -853,18 +851,18 @@ protected static Document toDoc(NameUsage nameUsage) {
Porting notes: The canonical name *sensu strictu* with nothing else but three name parts at
most (genus, species, infraspecific). No rank or hybrid markers and no authorship,
cultivar or strain information. Infrageneric names are represented without a
- leading genus. Unicode characters are replaced by their matching ASCII characters."
+ leading genus. Unicode characters are replaced by their matching ASCII characters.
*/
- Rank rank = Rank.valueOf(nameUsage.getRank());
+ Rank rank = Rank.valueOf(nameUsage.getRank());
Optional optCanonical = Optional.empty();
+ ParsedName pn = null;
+ NomCode nomCode = null;
try {
- NomCode nomCode = null;
if (!StringUtils.isEmpty(nameUsage.getNomenclaturalCode())) {
nomCode = NomCode.valueOf(nameUsage.getNomenclaturalCode());
}
- ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode);
-
+ pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode);
// canonicalMinimal will construct the name without the hybrid marker and authorship
String canonical = NameFormatter.canonicalMinimal(pn);
optCanonical = Optional.ofNullable(canonical);
@@ -873,6 +871,25 @@ protected static Document toDoc(NameUsage nameUsage) {
log.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName());
}
+ if (pn != null){
+ try {
+ // if there is an authorship, reparse with it to get the component authorship parts
+ StoredParsedName storedParsedName = StringUtils.isBlank(nameUsage.getAuthorship()) ?
+ getStoredParsedName(pn) : constructParsedName(nameUsage, rank, nomCode);
+ // store the parsed name components in JSON
+ doc.add(new StoredField(
+ FIELD_PARSED_NAME_JSON,
+ MAPPER.writeValueAsString(storedParsedName))
+ );
+ } catch (UnparsableNameException | InterruptedException e) {
+ // non-fatal: parse failed, index the usage without a stored parsed name
+ log.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName());
+ } catch ( JsonProcessingException e) {
+ // non-fatal: JSON serialization failed, index the usage without a stored parsed name
+ log.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName());
+ }
+ }
+
final String canonical = optCanonical.orElse(nameUsage.getScientificName());
// use custom precision step as we do not need range queries and prefer to save memory usage
@@ -895,7 +912,9 @@ protected static Document toDoc(NameUsage nameUsage) {
String nameComplete = nameUsage.getScientificName();
if (StringUtils.isNotBlank(nameUsage.getAuthorship())) {
nameComplete += " " + nameUsage.getAuthorship();
+ doc.add(new TextField(FIELD_AUTHORSHIP, nameUsage.getAuthorship(), Field.Store.YES));
}
+
doc.add(new TextField(FIELD_SCIENTIFIC_NAME, nameComplete, Field.Store.YES));
// this lucene index is not persistent, so not risk in changing ordinal numbers
@@ -920,11 +939,60 @@ protected static Document toDoc(NameUsage nameUsage) {
return doc;
}
- public static void consume(Supplier> cursorSupplier, Consumer handler) {
- try (Cursor cursor = cursorSupplier.get()) {
- cursor.forEach(handler);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
+ @NotNull
+ private static StoredParsedName constructParsedName(NameUsage nameUsage, Rank rank, NomCode nomCode) throws UnparsableNameException, InterruptedException {
+ ParsedName pn = !StringUtils.isBlank(nameUsage.getAuthorship()) ?
+ NameParsers.INSTANCE.parse(nameUsage.getScientificName() + " " + nameUsage.getAuthorship(), rank, nomCode)
+ : NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode);
+ return getStoredParsedName(pn);
+ }
+
+ @NotNull
+ private static StoredParsedName getStoredParsedName(ParsedName pn) {
+ StoredParsedName storedParsedName = new StoredParsedName();
+ storedParsedName.setAbbreviated(pn.isAbbreviated());
+ storedParsedName.setAutonym(pn.isAutonym());
+ storedParsedName.setBinomial(pn.isBinomial());
+ storedParsedName.setCandidatus(pn.isCandidatus());
+ storedParsedName.setCultivarEpithet(pn.getCultivarEpithet());
+ storedParsedName.setDoubtful(pn.isDoubtful());
+ storedParsedName.setGenus(pn.getGenus());
+ storedParsedName.setUninomial(pn.getUninomial());
+ storedParsedName.setUnparsed(pn.getUnparsed());
+ storedParsedName.setTrinomial(pn.isTrinomial());
+ storedParsedName.setIncomplete(pn.isIncomplete());
+ storedParsedName.setIndetermined(pn.isIndetermined());
+ storedParsedName.setTerminalEpithet(pn.getTerminalEpithet());
+ storedParsedName.setInfragenericEpithet(pn.getInfragenericEpithet());
+ storedParsedName.setInfraspecificEpithet(pn.getInfraspecificEpithet());
+ storedParsedName.setExtinct(pn.isExtinct());
+ storedParsedName.setPublishedIn(pn.getPublishedIn());
+ storedParsedName.setSanctioningAuthor(pn.getSanctioningAuthor());
+ storedParsedName.setSpecificEpithet(pn.getSpecificEpithet());
+ storedParsedName.setPhrase(pn.getPhrase());
+ storedParsedName.setPhraseName(pn.isPhraseName());
+ storedParsedName.setVoucher(pn.getVoucher());
+ storedParsedName.setNominatingParty(pn.getNominatingParty());
+ storedParsedName.setNomenclaturalNote(pn.getNomenclaturalNote());
+ storedParsedName.setWarnings(pn.getWarnings());
+ if (pn.getBasionymAuthorship() != null) {
+ storedParsedName.setBasionymAuthorship(
+ StoredParsedName.StoredAuthorship.builder()
+ .authors(pn.getBasionymAuthorship().getAuthors())
+ .exAuthors(pn.getBasionymAuthorship().getExAuthors())
+ .year(pn.getBasionymAuthorship().getYear()).build()
+ );
+ }
+ if (pn.getCombinationAuthorship() != null) {
+ storedParsedName.setCombinationAuthorship(
+ StoredParsedName.StoredAuthorship.builder()
+ .authors(pn.getCombinationAuthorship().getAuthors())
+ .exAuthors(pn.getCombinationAuthorship().getExAuthors())
+ .year(pn.getCombinationAuthorship().getYear()).build()
+ );
+ }
+ storedParsedName.setType(pn.getType() != null ? pn.getType().name() : null);
+ storedParsedName.setNotho(pn.getNotho() != null ? pn.getNotho().name() : null);
+ return storedParsedName;
}
}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java b/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java
index 08dec6c28..515d41a3c 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java
@@ -55,7 +55,7 @@
@Service
public class MatchingService {
- @Value("${working.path:/tmp/}")
+ @Value("${working.dir:/tmp/}")
protected String metadataFilePath;
@Value("${online.dictionary.url:'https://rs.gbif.org/dictionaries/'}")
@@ -174,7 +174,7 @@ public Optional getAPIMetadata(boolean regenerate) {
File metadata = new File(metadataFilePath + "/index-metadata.json");
try {
- if (!metadata.exists() || regenerate) {
+ if (regenerate || !metadata.exists()) {
APIMetadata metadata1 = datasetIndex.getAPIMetadata();
//serialise to file
ObjectMapper mapper = new ObjectMapper();
@@ -199,7 +199,6 @@ private static boolean isMatch(@Nullable NameUsageMatch match) {
private static NameUsageMatch higherMatch(NameUsageMatch match, NameUsageMatch firstMatch) {
match.getDiagnostics().setMatchType(MatchType.HIGHERRANK);
- // FIXME
addAlternatives(match, firstMatch.getDiagnostics().getAlternatives());
return match;
}
@@ -262,7 +261,9 @@ public List lookupJoins(String identifier){
* @return the list of matches
*/
public List matchID(String datasetID, String identifier){
- return datasetIndex.lookupIdentifier(datasetID, identifier);
+ List ids = datasetIndex.lookupIdentifier(datasetID, identifier);
+ List ancillary = datasetIndex.lookupAncillary(datasetID, identifier);
+ return ImmutableList.builder().addAll(ids).addAll(ancillary).build();
}
public NameUsageMatch match(
@@ -507,7 +508,12 @@ && getGenusOrAbove(parsedName) != null
if (rank == null) {
if (parsedName.isBinomial()
|| parsedName.isTrinomial()
- || (parsedName.getRank() != null && parsedName.getRank().ordinal() >= Rank.SPECIES.ordinal())) {
+ || (
+ parsedName.getRank() != null
+ && parsedName.getRank().ordinal() >= Rank.SPECIES.ordinal()
+ && parsedName.getEpithet(NamePart.SPECIFIC) != null //see https://github.com/CatalogueOfLife/data/issues/719
+ )
+ ) {
rank = Rank.valueOf(parsedName.getRank().name());
}
}
@@ -577,6 +583,18 @@ && nextAboveGenusDiffers(classification, match1)) {
// for strict matching do not try higher ranks
if (isMatch(match1) || strict) {
+ // https://github.com/CatalogueOfLife/data/issues/719
+ // this caters for the scenario where the Taxacrum sp.
+ // and the only sensible match is to a higher rank (genus)
+ if (
+ isMatch(match1)
+ && parsedName != null
+ && parsedName.getRank() != null
+ && parsedName.getRank().ordinal() >= Rank.SPECIES.ordinal()
+ && parsedName.getEpithet(NamePart.SPECIFIC) == null
+ ){
+ match1.getDiagnostics().setMatchType(MatchType.HIGHERRANK);
+ }
return match1;
}
@@ -712,7 +730,9 @@ private List queryIndex(Rank rank, String canonicalName, boolean
m -> {
if (m.getDiagnostics().getMatchType() == MatchType.EXACT
&& rank == Rank.SPECIES_AGGREGATE
- && m.getUsage().getRank() != Rank.SPECIES_AGGREGATE) {
+ && (m.getUsage().getRank() != Rank.SPECIES_AGGREGATE
+ || m.getAcceptedUsage().getRank() != Rank.SPECIES_AGGREGATE)
+ ) {
log.info(
"Species aggregate match found for {} {}. Ignore and prefer higher matches",
m.getUsage().getRank(),
diff --git a/matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java b/matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java
new file mode 100644
index 000000000..27fec73fc
--- /dev/null
+++ b/matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java
@@ -0,0 +1,40 @@
+package life.catalogue.matching.util;
+
+public class IUCNUtils {
+
+ public static String formatIucn(String original){
+ if (original == null) {
+ return null;
+ }
+ // Trim the string
+ String trimmed = original.trim();
+ // Convert to uppercase
+ String uppercased = trimmed.toUpperCase();
+ // Replace any whitespace with a single underscore
+ return uppercased.replaceAll("\\s+", "_");
+ }
+
+ public enum IUCN {
+ EXTINCT("EX"),
+ EXTINCT_IN_THE_WILD("EW"),
+ CRITICALLY_ENDANGERED ("CR"),
+ ENDANGERED ("EN"),
+ VULNERABLE ("VU"),
+ NEAR_THREATENED ("NT"),
+ CONSERVATION_DEPENDENT ("CD"),
+ LEAST_CONCERN ("LC"),
+ DATA_DEFICIENT ("DD"),
+ NOT_EVALUATED ("NE");
+
+ private final String code;
+
+ IUCN(String code) {
+ this.code = code;
+ }
+
+ public String getCode() {
+ return code;
+ }
+
+ }
+}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java b/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java
index 0d59bca61..b6334e6fc 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java
@@ -8,10 +8,12 @@ public class IndexConstants {
public static final String FIELD_ACCEPTED_ID = "accid";
public static final String FIELD_CANONICAL_NAME = "canonical";
public static final String FIELD_SCIENTIFIC_NAME = "sciname";
+ public static final String FIELD_AUTHORSHIP = "authorship";
public static final String FIELD_RANK = "rank";
public static final String FIELD_STATUS = "status";
public static final String FIELD_PARENT_ID = "parentId";
public static final String FIELD_NOMENCLATURAL_CODE = "nomcode";
+ public static final String FIELD_PARSED_NAME_JSON = "parsedName";
public static final String FIELD_CATEGORY = "category";
public static final String FIELD_JOIN_ID = "joinId";
public static final String MAIN_INDEX_DIR = "main";
diff --git a/matching-ws/src/main/resources/datasets.json b/matching-ws/src/main/resources/datasets.json
index 6c17a925c..afd996ec3 100644
--- a/matching-ws/src/main/resources/datasets.json
+++ b/matching-ws/src/main/resources/datasets.json
@@ -15,6 +15,7 @@
"title": "IPNI",
"prefix": "urn:lsid:ipni.org:names:",
"prefixMapping": [
+ "ipni:",
"https://www.ipni.org/n/"
],
"removePrefixForMatching": true
@@ -25,6 +26,7 @@
"title": "WoRMS",
"prefix": "urn:lsid:marinespecies.org:taxname:",
"prefixMapping": [
+ "worms:",
"http://marinespecies.org/data.php?id=",
"https://marinespecies.org/data.php?id=",
"https://www.marinespecies.org/aphia.php?p=taxdetails&id="
@@ -40,6 +42,31 @@
"key": "2041",
"gbifKey": "de8934f4-a136-481c-a87a-b0b202b80a31",
"title": "Dyntaxa. Svensk taxonomisk databas",
- "prefix": "urn:lsid:dyntaxa.se:Taxon:"
+ "prefix": "urn:lsid:dyntaxa.se:Taxon:",
+ "prefixMapping": [
+ "dyntaxa:"
+ ]
+ },
+ {
+ "key": "2144",
+ "gbifKey": "9ca92552-f23a-41a8-a140-01abaa31c931",
+ "title": "ITIS",
+ "prefixMapping": [
+ "itis:",
+ "tsn:",
+ "https://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=",
+ "https://marinespecies.org/data.php?id=",
+ "https://www.marinespecies.org/aphia.php?p=taxdetails&id="
+ ]
+ },
+ {
+ "key": "139831",
+ "title": "iNaturalist",
+ "prefix": "https://www.inaturalist.org/taxa/",
+ "prefixMapping": [
+ "inat:",
+ "iNaturalist:",
+ "https://www.inaturalist.org/taxa/"
+ ]
}
]
diff --git a/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java b/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java
index 6e0112c83..cea714ce1 100644
--- a/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java
+++ b/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java
@@ -56,7 +56,7 @@ public static void buildMatcher() throws IOException {
1
);
- Dataset dataset = new Dataset();
+ Dataset dataset = Dataset.builder().build();
dataset.setKey(1);
dataset.setAlias("DUMMY_IDS");
dataset.setTitle("Dummy dataset for testing");
diff --git a/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java b/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java
index 9f85865de..8d568c078 100644
--- a/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java
+++ b/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java
@@ -40,12 +40,12 @@ public static NameUsageMatch newNameUsageMatch(
String speciesKey) {
NameUsageMatch m = NameUsageMatch.builder().diagnostics(NameUsageMatch.Diagnostics.builder().build()).build();
- m.setUsage(NameUsageMatch.RankedName.builder()
+ m.setUsage(NameUsageMatch.Usage.builder()
.key(usageKey)
.name(scientificName)
.canonicalName(canonicalName)
.rank(rank).build());
- m.setAcceptedUsage(NameUsageMatch.RankedName.builder().key(acceptedUsageKey).build());
+ m.setAcceptedUsage(NameUsageMatch.Usage.builder().key(acceptedUsageKey).build());
m.getDiagnostics().setStatus(status);
m.getDiagnostics().setConfidence(confidence);
m.getDiagnostics().setNote(note);