diff --git a/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java b/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java index e99ab1a74..b9dee3169 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java +++ b/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java @@ -524,12 +524,36 @@ public List lookupIdentifier(@NotNull String identifier) { * @return List of ExternalID */ public List lookupIdentifier(@NotNull String datasetID, @NotNull String identifier) { + return lookupIdentifier(datasetID, identifier, identifierSearchers); + } + + /** + * Matches an external ID against the ancillary indexes. Intended for debug purposes only, to quickly + * check if ids are present and joined to main index or not. + * + * @param datasetID the datasetKey to match + * @param identifier the identifier to match + * @return List of ExternalID + */ + public List lookupAncillary(@NotNull String datasetID, @NotNull String identifier) { + return lookupIdentifier(datasetID, identifier, ancillarySearchers); + } + + /** + * Matches an external ID against the supplied per-dataset searchers. Intended for debug purposes only, to quickly + * check if ids are present and joined to main index or not. 
+ * + * @param datasetID the dataset key or GBIF key of the dataset to match + * @param identifier the identifier to match + * @return List of ExternalID + */ + public List lookupIdentifier(@NotNull String datasetID, @NotNull String identifier, Map searchers) { List results = new ArrayList<>(); try { // if join indexes are present, add them to the match - if (identifierSearchers != null && !identifierSearchers.isEmpty()) { - for (Dataset dataset : identifierSearchers.keySet()) { + if (searchers != null && !searchers.isEmpty()) { + for (Dataset dataset : searchers.keySet()) { // use the prefix mapping if (dataset.getKey().toString().equals(datasetID) || (dataset.getGbifKey() != null && dataset.getGbifKey().equals(datasetID))) { @@ -540,12 +564,12 @@ public List lookupIdentifier(@NotNull String datasetID, @NotNull Str } // find the index and search it - IndexSearcher identifierSearcher = identifierSearchers.get(dataset); + IndexSearcher searcher = searchers.get(dataset); Query identifierQuery = new TermQuery(new Term(FIELD_ID, identifier)); - TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3); + TopDocs identifierDocs = searcher.search(identifierQuery, 3); if (identifierDocs.totalHits.value > 0) { - Document identifierDoc = identifierSearcher.storedFields(). + Document identifierDoc = searcher.storedFields(). 
document(identifierDocs.scoreDocs[0].doc); results.add(toExternalID(identifierDoc, dataset)); @@ -560,6 +584,7 @@ public List lookupIdentifier(@NotNull String datasetID, @NotNull Str return results; } + private static ExternalID toExternalID(Document doc, Dataset dataset) { return ExternalID.builder() .id(doc.get(FIELD_ID)) @@ -774,7 +799,9 @@ private NameUsageMatch fromDoc(Document doc) { // if ancillary join indexes are present, add them to the match for (Dataset dataset: ancillarySearchers.keySet()){ IndexSearcher ancillarySearcher = ancillarySearchers.get(dataset); - Query query = new TermQuery(new Term(FIELD_JOIN_ID, doc.get(FIELD_ID) )); + Query query = new TermQuery( + new Term(FIELD_JOIN_ID, doc.get(FIELD_ID)) + ); try { TopDocs docs = ancillarySearcher.search(query, 3); if (docs.totalHits.value > 0) { @@ -814,6 +841,7 @@ private static NameUsageMatch.Usage constructUsage(Document doc) { NameUsageMatch.Usage.UsageBuilder b = NameUsageMatch.Usage.builder() .key(doc.get(FIELD_ID)) .name(doc.get(FIELD_SCIENTIFIC_NAME)) + .authorship(doc.get(FIELD_AUTHORSHIP)) .rank(Rank.valueOf(doc.get(FIELD_RANK))) .canonicalName(doc.get(FIELD_CANONICAL_NAME)) .code(getCode(doc)); @@ -867,8 +895,7 @@ private static NameUsageMatch.Usage constructUsage(Document doc) { } } - NameUsageMatch.Usage usage = b.build(); - return usage; + return b.build(); } private static NomCode getCode(Document doc) { diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java index 0351ee43c..e2a330118 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java +++ b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java @@ -330,7 +330,8 @@ public static class Usage implements Serializable { private String key; @Schema(description = "The name usage") private String name; - @JsonIgnore private String canonicalName; + private String 
canonicalName; + private String authorship; @JsonIgnore private String parentID; @Schema(description = "The taxonomic rank for the name usage") private Rank rank; diff --git a/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java b/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java index fdd4ca88f..18f85ce4c 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java +++ b/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java @@ -679,7 +679,7 @@ public void run() { } private boolean isAccepted(String status) { - return status != null && !status.equals(TaxonomicStatus.ACCEPTED.name()); + return status != null && status.equals(TaxonomicStatus.ACCEPTED.name()); } } @@ -845,6 +845,11 @@ private static void finishIndex(IndexWriter indexWriter) throws IOException { return Paths.get(indexPath); } + /** + * Generates the lucene document for a name usage. + * @param nameUsage the name usage to convert + * @return the lucene document built from the name usage + */ protected static Document toDoc(NameUsage nameUsage) { Document doc = new Document(); @@ -852,9 +857,9 @@ protected static Document toDoc(NameUsage nameUsage) { Porting notes: The canonical name *sensu strictu* with nothing else but three name parts at most (genus, species, infraspecific). No rank or hybrid markers and no authorship, cultivar or strain information. Infrageneric names are represented without a - leading genus. Unicode characters are replaced by their matching ASCII characters." + leading genus. Unicode characters are replaced by their matching ASCII characters. 
*/ - Rank rank = Rank.valueOf(nameUsage.getRank()); + Rank rank = Rank.valueOf(nameUsage.getRank()); Optional optCanonical = Optional.empty(); try { @@ -862,7 +867,13 @@ protected static Document toDoc(NameUsage nameUsage) { if (!StringUtils.isEmpty(nameUsage.getNomenclaturalCode())) { nomCode = NomCode.valueOf(nameUsage.getNomenclaturalCode()); } - ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode); + + ParsedName pn = null; + if (StringUtils.isBlank(nameUsage.getAuthorship())) { + pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode); + } else{ + pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName() + " " + nameUsage.getAuthorship(), rank, nomCode); + } StoredParsedName storedParsedName = new StoredParsedName(); storedParsedName.setAbbreviated(pn.isAbbreviated()); @@ -945,7 +956,9 @@ protected static Document toDoc(NameUsage nameUsage) { String nameComplete = nameUsage.getScientificName(); if (StringUtils.isNotBlank(nameUsage.getAuthorship())) { nameComplete += " " + nameUsage.getAuthorship(); + doc.add(new TextField(FIELD_AUTHORSHIP, nameUsage.getAuthorship(), Field.Store.YES)); } + doc.add(new TextField(FIELD_SCIENTIFIC_NAME, nameComplete, Field.Store.YES)); // this lucene index is not persistent, so not risk in changing ordinal numbers @@ -969,75 +982,4 @@ protected static Document toDoc(NameUsage nameUsage) { return doc; } - -// /** -// * Converts a {@link org.gbif.nameparser.api.ParsedName} into {@link -// * org.gbif.pipelines.io.avro.ParsedName}. 
-// */ -// private static ParsedName toParsedNameAvro(org.gbif.nameparser.api.ParsedName pn) { -// ParsedName.Builder builder = -// ParsedName.newBuilder() -// .setAbbreviated(pn.isAbbreviated()) -// .setAutonym(pn.isAutonym()) -// .setBinomial(pn.isBinomial()) -// .setCandidatus(pn.isCandidatus()) -// .setCultivarEpithet(pn.getCultivarEpithet()) -// .setDoubtful(pn.isDoubtful()) -// .setGenus(pn.getGenus()) -// .setUninomial(pn.getUninomial()) -// .setUnparsed(pn.getUnparsed()) -// .setTrinomial(pn.isTrinomial()) -// .setIncomplete(pn.isIncomplete()) -// .setIndetermined(pn.isIndetermined()) -// .setTerminalEpithet(pn.getTerminalEpithet()) -// .setInfragenericEpithet(pn.getInfragenericEpithet()) -// .setInfraspecificEpithet(pn.getInfraspecificEpithet()) -// .setExtinct(pn.isExtinct()) -// .setPublishedIn(pn.getPublishedIn()) -// .setSanctioningAuthor(pn.getSanctioningAuthor()) -// .setSpecificEpithet(pn.getSpecificEpithet()) -// .setPhrase(pn.getPhrase()) -// .setPhraseName(pn.isPhraseName()) -// .setVoucher(pn.getVoucher()) -// .setNominatingParty(pn.getNominatingParty()) -// .setNomenclaturalNote(pn.getNomenclaturalNote()); -// -// // Nullable fields -// Optional.ofNullable(pn.getWarnings()) -// .ifPresent(w -> builder.setWarnings(new ArrayList<>(pn.getWarnings()))); -// Optional.ofNullable(pn.getBasionymAuthorship()) -// .ifPresent(authorship -> builder.setBasionymAuthorship(toAuthorshipAvro(authorship))); -// Optional.ofNullable(pn.getCombinationAuthorship()) -// .ifPresent(authorship -> builder.setCombinationAuthorship(toAuthorshipAvro(authorship))); -// Optional.ofNullable(pn.getCode()) -// .ifPresent(code -> builder.setCode(NomCode.valueOf(code.name()))); -// Optional.ofNullable(pn.getType()) -// .ifPresent(type -> builder.setType(NameType.valueOf(type.name()))); -// Optional.ofNullable(pn.getNotho()) -// .ifPresent(notho -> builder.setNotho(NamePart.valueOf(notho.name()))); -// Optional.ofNullable(pn.getRank()) -// .ifPresent(rank -> 
builder.setRank(NameRank.valueOf(rank.name()))); -// Optional.ofNullable(pn.getState()) -// .ifPresent(state -> builder.setState(State.valueOf(state.name()))); -// Optional.ofNullable(pn.getEpithetQualifier()) -// .map( -// eq -> -// eq.entrySet().stream() -// .collect(Collectors.toMap(e -> e.getKey().name(), Map.Entry::getValue))) -// .ifPresent(builder::setEpithetQualifier); -// return builder.build(); -// } -// -// -// * Converts a {@link org.gbif.nameparser.api.Authorship} into {@link -// * org.gbif.pipelines.io.avro.Authorship}. -// */ -// private static Authorship toAuthorshipAvro(org.gbif.nameparser.api.Authorship authorship) { -// return Authorship.newBuilder() -// .setEmpty(authorship.isEmpty()) -// .setYear(authorship.getYear()) -// .setAuthors(authorship.getAuthors()) -// .setExAuthors(authorship.getExAuthors()) -// .build(); -// } } diff --git a/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java b/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java index 08dec6c28..9accfc451 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java +++ b/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java @@ -262,7 +262,9 @@ public List lookupJoins(String identifier){ * @return the list of matches */ public List matchID(String datasetID, String identifier){ - return datasetIndex.lookupIdentifier(datasetID, identifier); + List ids = datasetIndex.lookupIdentifier(datasetID, identifier); + List ancillary = datasetIndex.lookupAncillary(datasetID, identifier); + return ImmutableList.builder().addAll(ids).addAll(ancillary).build(); } public NameUsageMatch match( diff --git a/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java b/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java index a690e4dcf..b6334e6fc 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java +++ 
b/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java @@ -8,6 +8,7 @@ public class IndexConstants { public static final String FIELD_ACCEPTED_ID = "accid"; public static final String FIELD_CANONICAL_NAME = "canonical"; public static final String FIELD_SCIENTIFIC_NAME = "sciname"; + public static final String FIELD_AUTHORSHIP = "authorship"; public static final String FIELD_RANK = "rank"; public static final String FIELD_STATUS = "status"; public static final String FIELD_PARENT_ID = "parentId";