From 21725e2cf0ae253c544d3e5feb6062dd3082e13f Mon Sep 17 00:00:00 2001 From: Patrice Lopez Date: Thu, 9 May 2024 22:15:01 +0200 Subject: [PATCH 1/8] add tini for kubernetes support --- Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Dockerfile b/Dockerfile index 1c648e9f..7a697c77 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,6 +77,12 @@ RUN apt-get update && \ RUN apt-get update -y && \ apt-get clean all -y +# Add Tini +ENV TINI_VERSION v0.19.0 +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini +RUN chmod +x /tini +ENTRYPOINT ["/tini", "-s", "--"] + WORKDIR /opt/grobid COPY --from=builder /opt/grobid . From 48ec918e3aa2e600a4d579c0a7a24fb096c2470f Mon Sep 17 00:00:00 2001 From: Patrice Lopez Date: Thu, 4 Jul 2024 20:12:00 +0200 Subject: [PATCH 2/8] remove jdk.incubator.foreign module --- build.gradle | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/build.gradle b/build.gradle index 9c8aa35e..3cd1874e 100644 --- a/build.gradle +++ b/build.gradle @@ -164,7 +164,7 @@ test { if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED", - "--add-opens", "java.base/java.io=ALL-UNNAMED", "--add-modules", "jdk.incubator.foreign" + "--add-opens", "java.base/java.io=ALL-UNNAMED" } } @@ -210,7 +210,7 @@ task(train_corpus, dependsOn: 'classes', type: JavaExec, group: 'training') { args getArg('corpus', ''), getArg('lang', 'en') jvmArgs '-Djdk.xml.totalEntitySizeLimit=2147480000', '-Djdk.xml.totalEntitySizeLimit=2147480000', '-Xms2g', '-Xmx8g' if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { - jvmArgs '-Djdk.xml.totalEntitySizeLimit=2147480000', '-Djdk.xml.totalEntitySizeLimit=2147480000', '-Xms2g', '-Xmx8g', "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-modules", "jdk.incubator.foreign", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", 
"--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" + jvmArgs '-Djdk.xml.totalEntitySizeLimit=2147480000', '-Djdk.xml.totalEntitySizeLimit=2147480000', '-Xms2g', '-Xmx8g', "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" } else { jvmArgs '-Djdk.xml.totalEntitySizeLimit=2147480000', '-Djdk.xml.totalEntitySizeLimit=2147480000', '-Xms2g', '-Xmx8g' } @@ -224,7 +224,7 @@ task(evaluation, dependsOn: 'classes', type: JavaExec, group: 'evaluation') { classpath = sourceSets.main.runtimeClasspath args getArg('corpus', '') if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { - jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-modules", "jdk.incubator.foreign", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" + jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", 
"--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" } else { jvmArgs '--Xms2g', '-Xmx8g' } @@ -238,7 +238,7 @@ task(annotatedDataGeneration, dependsOn: 'classes', type: JavaExec, group: 'trai classpath = sourceSets.main.runtimeClasspath args getArg('corpus', '') if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { - jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-modules", "jdk.incubator.foreign", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" + jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" } else { jvmArgs '--Xms2g', '-Xmx8g' } @@ -253,7 +253,7 @@ task(generate_entity_description, dependsOn: 'classes', type: JavaExec, group: ' classpath = sourceSets.main.runtimeClasspath args 'data/embeddings/', getArg('lang', 'en') if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { - jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-modules", "jdk.incubator.foreign", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", 
"java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" + jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" } else { jvmArgs '--Xms2g', '-Xmx8g' } @@ -268,7 +268,7 @@ task(quantize_word_embeddings, dependsOn: 'classes', type: JavaExec, group: 'emb classpath = sourceSets.main.runtimeClasspath args '-i', getArg('i', 'word.embeddings.vec'), '-o', getArg('o', 'word.embeddings.quantized'), '-error', getArg('e', '0.01'), '-hashheader' if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { - jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-modules", "jdk.incubator.foreign", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" + jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", 
"java.base/java.util.concurrent=ALL-UNNAMED" } else { jvmArgs '--Xms2g', '-Xmx8g' } @@ -281,7 +281,7 @@ task(generate_entity_embeddings, dependsOn: 'classes', type: JavaExec, group: 'e classpath = sourceSets.main.runtimeClasspath args '-in', getArg('in', 'entity.description'), '-v', getArg('v', 'word.embeddings.quantized'), '-out', getArg('out', 'entity.embeddings.vec'), '-n', getArg('n', '8') if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { - jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-modules", "jdk.incubator.foreign", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" + jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" } else { jvmArgs '--Xms2g', '-Xmx8g' } @@ -294,7 +294,7 @@ task(quantize_entity_embeddings, dependsOn: 'classes', type: JavaExec, group: 'e classpath = sourceSets.main.runtimeClasspath args '-i', getArg('i', 'entity.embeddings.vec'), '-o', getArg('o', 'entity.embeddings.quantized'), '-error', getArg('e', '0.01'), '-hashheader' if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { - jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-modules", "jdk.incubator.foreign", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", 
"java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" + jvmArgs '--Xms2g', '-Xmx8g', "--add-opens", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" } else { jvmArgs '--Xms2g', '-Xmx8g' } @@ -307,7 +307,7 @@ application { run { if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { - jvmArgs "--add-modules", "jdk.incubator.foreign", "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" + jvmArgs "--add-opens", "java.base/java.nio=ALL-UNNAMED", "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.text=ALL-UNNAMED", "--add-opens", "java.base/java.net=ALL-UNNAMED", "--add-opens", "java.base/java.lang=ALL-UNNAMED", "--add-opens", "java.base/java.math=ALL-UNNAMED", "--add-opens", "java.base/java.util=ALL-UNNAMED", "--add-opens", "java.base/java.util.concurrent=ALL-UNNAMED" } args = ['server', 'data/config/service.yaml'] From 704103e7024a6f5162f1c2df811e2ec58ab90a5f Mon Sep 17 00:00:00 2001 From: Patrice Lopez Date: Sat, 14 Sep 2024 15:43:31 
+0200 Subject: [PATCH 3/8] update to grobid 0.8.1 --- build.gradle | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/build.gradle b/build.gradle index 3cd1874e..045a8a9e 100644 --- a/build.gradle +++ b/build.gradle @@ -5,6 +5,7 @@ buildscript { maven { url 'https://plugins.gradle.org/m2/' } + maven { url "https://grobid.s3.eu-west-1.amazonaws.com/repo/" } } dependencies { classpath 'gradle.plugin.org.kt3k.gradle.plugin:coveralls-gradle-plugin:2.12.0' @@ -51,15 +52,15 @@ dependencies { testImplementation group: 'com.googlecode.json-simple', name: 'json-simple', version: '1.1.1' // GROBID - implementation (group: 'org.grobid', name: 'grobid-core', version: '0.8.0') { + implementation (group: 'org.grobid', name: 'grobid-core', version: '0.8.1') { //exclude(module: 'log4j-over-slf4j') exclude(group: 'ch.qos.logback', module: 'logback-classic') } - implementation (group: 'org.grobid', name: 'grobid-trainer', version: '0.8.0') { + implementation (group: 'org.grobid', name: 'grobid-trainer', version: '0.8.1') { //exclude(module: 'log4j-over-slf4j') exclude(group: 'ch.qos.logback', module: 'logback-classic') } - implementation (group: 'org.grobid', name: 'grobid-ner', version: '0.8.0') { + implementation (group: 'org.grobid', name: 'grobid-ner', version: '0.8.1') { //exclude(module: 'log4j-over-slf4j') exclude(group: 'ch.qos.logback', module: 'logback-classic') } From cda18d5d470670b0edebf4f2af52e54f729a6f49 Mon Sep 17 00:00:00 2001 From: Patrice Lopez Date: Wed, 13 Nov 2024 18:57:27 +0100 Subject: [PATCH 4/8] update dockerfile following last upgrade --- Dockerfile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7a697c77..1ae66c66 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,8 @@ ARG BUILD_VERSION=0.0.6 # ------------- # builder image # ------------- -FROM openjdk:8u275-jdk as builder +#FROM openjdk:8u275-jdk as builder +FROM openjdk:17-jdk-slim as builder USER root ENV 
LANG="en_US.UTF-8" \ @@ -21,8 +22,8 @@ RUN apt-get update -y && \ WORKDIR /opt/ # install GROBID -RUN wget --tries=10 --read-timeout=10 https://github.com/kermitt2/grobid/archive/refs/tags/0.7.2.zip -RUN unzip -o 0.7.2.zip && mv grobid-* grobid +RUN wget --tries=10 --read-timeout=10 https://github.com/kermitt2/grobid/archive/refs/tags/0.8.1.zip +RUN unzip -o 0.8.1.zip && mv grobid-* grobid WORKDIR /opt/grobid @@ -34,7 +35,7 @@ RUN rm -rf grobid-home/pdfalto/win-* RUN rm -rf grobid-home/lib/lin-32 RUN rm -rf grobid-home/lib/win-* RUN rm -rf grobid-home/lib/mac-64 -RUN rm -rf ../0.7.2.zip +RUN rm -rf ../0.8.1.zip # cleaning DeLFT models RUN rm -rf grobid-home/models/*-BidLSTM_CRF* From 4d6a786543cc679f78562a420ec1cce198fe2fde Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 23 Dec 2024 17:46:54 +0100 Subject: [PATCH 5/8] add API entry point to list concepts size by language --- .../nerd/kb/LowerKnowledgeBase.java | 11 ++++-- .../nerd/mention/ProcessText.java | 2 +- .../scienceminer/nerd/service/NerdPaths.java | 5 +++ .../scienceminer/nerd/service/NerdRestKB.java | 35 +++++++++++++++++-- .../nerd/service/NerdRestService.java | 20 ++++++++++- 5 files changed, 67 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/scienceminer/nerd/kb/LowerKnowledgeBase.java b/src/main/java/com/scienceminer/nerd/kb/LowerKnowledgeBase.java index f903ab86..aa7472b5 100644 --- a/src/main/java/com/scienceminer/nerd/kb/LowerKnowledgeBase.java +++ b/src/main/java/com/scienceminer/nerd/kb/LowerKnowledgeBase.java @@ -46,8 +46,15 @@ public LowerKnowledgeBase(NerdConfig conf) { } public int getArticleCount() { - if (wikipediaArticleCount == -1) - wikipediaArticleCount = this.env.retrieveStatistic(StatisticName.articleCount).intValue(); + if (wikipediaArticleCount == -1) { + Long articleCountStats = this.env.retrieveStatistic(StatisticName.articleCount); + if (articleCountStats != null) { + wikipediaArticleCount = articleCountStats.intValue(); + } else { + return 0; + } + } + 
return wikipediaArticleCount; } diff --git a/src/main/java/com/scienceminer/nerd/mention/ProcessText.java b/src/main/java/com/scienceminer/nerd/mention/ProcessText.java index e9f133a8..25ac1292 100644 --- a/src/main/java/com/scienceminer/nerd/mention/ProcessText.java +++ b/src/main/java/com/scienceminer/nerd/mention/ProcessText.java @@ -94,7 +94,7 @@ private static synchronized void getNewInstance() { private ProcessText() { String grobidHome = com.scienceminer.nerd.utilities.Utilities.initGrobid(); Path grobidHomePath = Paths.get(grobidHome); - Path grobidNerPath = grobidHomePath.resolve("../grobid-ner/"); + Path grobidNerPath = grobidHomePath.resolve("../../grobid-ner/"); // the following will ensure that the Grobid environment and config are loaded // independently from the NER models diff --git a/src/main/java/com/scienceminer/nerd/service/NerdPaths.java b/src/main/java/com/scienceminer/nerd/service/NerdPaths.java index 8df4070f..8bc1a30b 100644 --- a/src/main/java/com/scienceminer/nerd/service/NerdPaths.java +++ b/src/main/java/com/scienceminer/nerd/service/NerdPaths.java @@ -20,6 +20,11 @@ public interface NerdPaths { */ String VERSION = "version"; + /** + * path extension for data information request + */ + String DATA = "data"; + /** * Language identification entry point */ diff --git a/src/main/java/com/scienceminer/nerd/service/NerdRestKB.java b/src/main/java/com/scienceminer/nerd/service/NerdRestKB.java index 6af78ce0..e9fc47d7 100644 --- a/src/main/java/com/scienceminer/nerd/service/NerdRestKB.java +++ b/src/main/java/com/scienceminer/nerd/service/NerdRestKB.java @@ -276,7 +276,7 @@ public String getTermLookup(String term, String lang) { JsonStringEncoder encoder = JsonStringEncoder.getInstance(); byte[] encodedTerm = encoder.quoteAsUTF8(term); - String outputTerm = new String(encodedTerm); + String outputTerm = new String(encodedTerm); jsonBuilder.append("{ \"term\": \"" + outputTerm + "\", \"lang\": \"" + lang + "\", \"senses\" : ["); Label lbl = new 
Label(wikipedia.getEnvironment(), term.trim()); @@ -331,7 +331,7 @@ public String getTermLookup(String term, String lang) { jsonBuilder.append(", "); byte[] encodedPreferred = encoder.quoteAsUTF8(sense.getTitle()); - String outputPreferred = new String(encodedPreferred); + String outputPreferred = new String(encodedPreferred); jsonBuilder.append("{ \"pageid\": " + sense.getId() + ", \"preferred\" : \"" + outputPreferred + "\", \"prob_c\" : " + sense.getPriorProbability() + " }"); @@ -372,4 +372,35 @@ public String getWikidataIDByDOI(String doi) { sb.append("}"); return sb.toString(); } + + public String getKbStatistics() { + StringBuilder sb = new StringBuilder(); + + UpperKnowledgeBase upperKb = UpperKnowledgeBase.getInstance(); + long entityCount = upperKb.getEntityCount(); + sb.append("{"); + sb.append("\"") + .append("wikidata_concepts") + .append("\"") + .append(":") + .append("\"") + .append(entityCount) + .append("\"") + .append(","); + + Map lowerKbWikipedias = upperKb.getWikipediaConfs(); + + for (Map.Entry entry : lowerKbWikipedias.entrySet()) { + String wikipediaName = entry.getKey(); + LowerKnowledgeBase kb = entry.getValue(); + int articleCount = kb.getArticleCount(); + + sb.append("\"").append(wikipediaName).append("\"").append(":").append("\"").append(articleCount).append("\"").append(","); + } + + sb.append("}"); + + return sb.toString().replace(",}", "}"); + + } } \ No newline at end of file diff --git a/src/main/java/com/scienceminer/nerd/service/NerdRestService.java b/src/main/java/com/scienceminer/nerd/service/NerdRestService.java index 271059b8..62fdcfed 100644 --- a/src/main/java/com/scienceminer/nerd/service/NerdRestService.java +++ b/src/main/java/com/scienceminer/nerd/service/NerdRestService.java @@ -8,7 +8,6 @@ import com.scienceminer.nerd.kb.Lexicon; import com.scienceminer.nerd.kb.UpperKnowledgeBase; import com.scienceminer.nerd.mention.ProcessText; -import org.glassfish.jersey.media.multipart.FormDataBodyPart; import 
org.glassfish.jersey.media.multipart.FormDataParam; import com.codahale.metrics.annotation.Timed; @@ -114,6 +113,25 @@ public Response getVersion() { return response; } + @GET + @Path(NerdPaths.DATA) + @Produces(MediaType.APPLICATION_JSON) + public Response getDataStatistics() { + Response response = null; + try { + response = Response.status(Response.Status.OK) + .entity(nerdRestKB.getKbStatistics()) + .type(MediaType.APPLICATION_JSON) + .build(); + + } catch (Exception e) { + LOGGER.error("An unexpected exception occurs. ", e); + response = Response.status(Response.Status.INTERNAL_SERVER_ERROR).build(); + } + + return response; + } + /** * Sentence Segmentation **/ From 096e51cdc6ceb06c3bac7c96480038e0dbb5786d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 27 Dec 2024 20:20:28 +0100 Subject: [PATCH 6/8] collect statistical information and publish them via the API --- .../nerd/kb/LowerKnowledgeBase.java | 75 +++++++++++-------- .../nerd/kb/UpperKnowledgeBase.java | 14 ++++ .../nerd/kb/db/KBEnvironment.java | 2 +- .../nerd/kb/db/KBUpperEnvironment.java | 2 +- .../nerd/kb/db/StatementDatabase.java | 37 ++++----- .../nerd/kb/model/KBStatistics.java | 25 +++++++ .../scienceminer/nerd/service/NerdRestKB.java | 43 ++++++----- 7 files changed, 119 insertions(+), 79 deletions(-) create mode 100644 src/main/java/com/scienceminer/nerd/kb/model/KBStatistics.java diff --git a/src/main/java/com/scienceminer/nerd/kb/LowerKnowledgeBase.java b/src/main/java/com/scienceminer/nerd/kb/LowerKnowledgeBase.java index aa7472b5..f3e8ce1a 100644 --- a/src/main/java/com/scienceminer/nerd/kb/LowerKnowledgeBase.java +++ b/src/main/java/com/scienceminer/nerd/kb/LowerKnowledgeBase.java @@ -16,9 +16,9 @@ import java.util.List; /** - * Represent the language specific resources of the Knowledge Base, e.g. a - * Wikipedia instance, including corresponding word and entity embeddings. - * + * Represent the language specific resources of the Knowledge Base, e.g. 
a + * Wikipedia instance, including corresponding word and entity embeddings. + * */ public class LowerKnowledgeBase { @@ -26,15 +26,16 @@ public class LowerKnowledgeBase { private KBLowerEnvironment env = null; private int wikipediaArticleCount = -1; + private int wikipediaPageCount = -1; public enum Direction { - In, + In, Out } /** - * Initialises a newly created Wikipedia according to the given configuration. - * + * Initialises a newly created Wikipedia according to the given configuration. + * */ public LowerKnowledgeBase(NerdConfig conf) { this.env = new KBLowerEnvironment(conf); @@ -42,7 +43,7 @@ public LowerKnowledgeBase(NerdConfig conf) { this.env.buildEnvironment(conf, false); } catch(Exception e) { LOGGER.error("Environment for Wikipedia cannot be built", e); - } + } } public int getArticleCount() { @@ -58,6 +59,18 @@ public int getArticleCount() { return wikipediaArticleCount; } + public int getPageCount() { + if (wikipediaPageCount == -1) { + Long pageCount = this.env.getDbPage().getDatabaseSize(); + if (pageCount != null) { + wikipediaPageCount = pageCount.intValue(); + } else { + return 0; + } + } + return wikipediaPageCount; + } + /** * Returns the environment that this is connected to */ @@ -67,17 +80,17 @@ public KBLowerEnvironment getEnvironment() { /** * Make ready the full content database of articles - * + * */ public void loadFullContentDB() { try { if (this.env != null) this.env.buildFullMarkup(false); - else + else LOGGER.error("Environment for Wikipedia full content article DB is null"); } catch(Exception e) { LOGGER.error("Environment for Wikipedia full content cannot be built", e); - } + } } /** @@ -89,7 +102,7 @@ public NerdConfig getConfig() { /** * Returns the root Category from which all other categories can be browsed. 
- * + * */ public com.scienceminer.nerd.kb.model.Category getRootCategory() { return new com.scienceminer.nerd.kb.model.Category(env, env.retrieveStatistic(StatisticName.rootCategoryId).intValue()); @@ -103,11 +116,11 @@ public Page getPageById(int id) { } /** - * Returns the Page referenced by the given Wikidata id for the language of the Wikipedia. - * The page can be cast into the appropriate type for more specific functionality. - * + * Returns the Page referenced by the given Wikidata id for the language of the Wikipedia. + * The page can be cast into the appropriate type for more specific functionality. + * * @param id the Wikidata id of the Page to retrieve. - * @return the Page referenced by the given id, or null if one does not exist. + * @return the Page referenced by the given id, or null if one does not exist. */ /*public Page getPageByWikidataId(String wikidataId) { return Page.createPage(env, wikidataId); @@ -128,7 +141,7 @@ public Page getPageById(int id) { /** * Returns the Article referenced by the given (case sensitive) title. If the title * matches a redirect, this will be resolved to return the final target. - * + * */ public Article getArticleByTitle(String title) { if (title == null || title.length() == 0) @@ -151,7 +164,7 @@ public Article getArticleByTitle(String title) { } /** - * Returns the Category referenced by the given (case sensitive) title. + * Returns the Category referenced by the given (case sensitive) title. * */ public com.scienceminer.nerd.kb.model.Category getCategoryByTitle(String title) { @@ -169,8 +182,8 @@ public com.scienceminer.nerd.kb.model.Category getCategoryByTitle(String title) } /** - * Returns the Template referenced by the given (case sensitive) title. - * + * Returns the Template referenced by the given (case sensitive) title. 
+ * */ public Template getTemplateByTitle(String title) { title = title.substring(0,1).toUpperCase() + title.substring(1); @@ -188,11 +201,11 @@ public Template getTemplateByTitle(String title) { /** - * Returns the most probable article for a given term. + * Returns the most probable article for a given term. */ public Article getMostProbableArticle(String term) { Label label = new Label(env, term); - if (!label.exists()) + if (!label.exists()) return null; return label.getSenses()[0]; @@ -200,8 +213,8 @@ public Article getMostProbableArticle(String term) { /** * A convenience method for quickly finding out if the given text is ever used as a label - * in Wikipedia. If this returns false, then all of the getArticle methods will return null or empty sets. - * + * in Wikipedia. If this returns false, then all of the getArticle methods will return null or empty sets. + * */ /*public boolean isLabel(String text) { DbLabel lbl = env.getDbLabel().retrieve(text); @@ -214,7 +227,7 @@ public Label getLabel(String text) { /** * Returns an iterator for all pages in the database, in order of ascending ids. - * + * */ public PageIterator getPageIterator() { return new PageIterator(env); @@ -222,15 +235,15 @@ public PageIterator getPageIterator() { /** * Returns an iterator for all pages in the database of the given type, in order of ascending ids. - * + * */ public PageIterator getPageIterator(PageType type) { - return new PageIterator(env, type); + return new PageIterator(env, type); } /** * Returns an iterator for all labels in the database, processed according to the given text processor (may be null), in alphabetical order. - * + * */ public LabelIterator getLabelIterator() { return new LabelIterator(env); @@ -238,7 +251,7 @@ public LabelIterator getLabelIterator() { /** * Returns the list of links in relation to artId with the specified direction (in or out). 
- * + * */ public List getLinks(int artId, Direction dir) { DbIntList ids = null; @@ -247,7 +260,7 @@ public List getLinks(int artId, Direction dir) { else ids = env.getDbPageLinkOutNoSentences().retrieve(artId); - if (ids == null || ids.getValues() == null) + if (ids == null || ids.getValues() == null) return new ArrayList(); return ids.getValues(); @@ -293,7 +306,7 @@ public double getWordFrequency(String word) { return 0.0; else return Utilities.cBToFrequency(cB.intValue()); - } + } /** * @return frequency of word for the language @@ -304,7 +317,7 @@ public double getWordZipf(String word) { return 0.0; else return Utilities.cBToZipf(cB.intValue()); - } + } public void close() { env.close(); diff --git a/src/main/java/com/scienceminer/nerd/kb/UpperKnowledgeBase.java b/src/main/java/com/scienceminer/nerd/kb/UpperKnowledgeBase.java index db8498df..1f75f827 100644 --- a/src/main/java/com/scienceminer/nerd/kb/UpperKnowledgeBase.java +++ b/src/main/java/com/scienceminer/nerd/kb/UpperKnowledgeBase.java @@ -38,6 +38,8 @@ public class UpperKnowledgeBase { private Map wikipediaDomainMaps = null; private long conceptCount = -1; + private long statementCount = -1; + private long labelCount = -1; // this is the list of supported languages public static final List TARGET_LANGUAGES = Arrays.asList( @@ -230,6 +232,18 @@ public long getEntityCount() { return conceptCount; } + public long getStatementCount() { + if (statementCount == -1) + statementCount = env.getDbStatements().getDatabaseSize(); + return statementCount; + } + + public long getLabelCount() { + if (labelCount == -1) + labelCount = env.getDbLabels().getDatabaseSize(); + return labelCount; + } + /** * Return the concept object corresponding to a given wikidata ID */ diff --git a/src/main/java/com/scienceminer/nerd/kb/db/KBEnvironment.java b/src/main/java/com/scienceminer/nerd/kb/db/KBEnvironment.java index 40cb9577..05bcdebe 100644 --- a/src/main/java/com/scienceminer/nerd/kb/db/KBEnvironment.java +++ 
b/src/main/java/com/scienceminer/nerd/kb/db/KBEnvironment.java @@ -101,7 +101,7 @@ protected static File getDataFile(File dataDirectory, String fileName) { */ public enum StatisticName { /** - * number of articles (not disambiguations or redirects) + * number of articles (not disambiguation or redirects) */ articleCount, diff --git a/src/main/java/com/scienceminer/nerd/kb/db/KBUpperEnvironment.java b/src/main/java/com/scienceminer/nerd/kb/db/KBUpperEnvironment.java index 1b50ff68..7b1d7293 100644 --- a/src/main/java/com/scienceminer/nerd/kb/db/KBUpperEnvironment.java +++ b/src/main/java/com/scienceminer/nerd/kb/db/KBUpperEnvironment.java @@ -102,7 +102,7 @@ public TaxonDatabase getDbTaxonParent() { } /** - * Returns the {@link DatabaseType#labels} database + * Returns the {@link DatabaseType#label} database */ public ConceptLabelDatabase getDbLabels() { return dbLabels; diff --git a/src/main/java/com/scienceminer/nerd/kb/db/StatementDatabase.java b/src/main/java/com/scienceminer/nerd/kb/db/StatementDatabase.java index 6249ed6e..83d1350a 100644 --- a/src/main/java/com/scienceminer/nerd/kb/db/StatementDatabase.java +++ b/src/main/java/com/scienceminer/nerd/kb/db/StatementDatabase.java @@ -1,32 +1,21 @@ package com.scienceminer.nerd.kb.db; -import java.io.*; -import java.util.List; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.Map; -import java.util.TreeMap; - +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.scienceminer.nerd.exceptions.NerdResourceException; +import com.scienceminer.nerd.kb.Property; +import com.scienceminer.nerd.kb.Statement; +import com.scienceminer.nerd.kb.UpperKnowledgeBase; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.hadoop.record.CsvRecordInput; +import org.fusesource.lmdbjni.Entry; +import org.fusesource.lmdbjni.Transaction; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.commons.compress.compressors.*; -import org.apache.hadoop.record.CsvRecordInput; - -import com.scienceminer.nerd.kb.db.*; -import com.scienceminer.nerd.kb.db.KBDatabase.DatabaseType; -import com.scienceminer.nerd.utilities.*; -import com.scienceminer.nerd.kb.*; -import com.scienceminer.nerd.exceptions.NerdResourceException; - -import com.fasterxml.jackson.core.*; -import com.fasterxml.jackson.databind.*; -import com.fasterxml.jackson.databind.node.*; -import com.fasterxml.jackson.annotation.*; -import com.fasterxml.jackson.core.io.*; - -import org.fusesource.lmdbjni.*; -import static org.fusesource.lmdbjni.Constants.*; +import java.io.*; +import java.util.*; public class StatementDatabase extends StringRecordDatabase> { private static final Logger logger = LoggerFactory.getLogger(StatementDatabase.class); diff --git a/src/main/java/com/scienceminer/nerd/kb/model/KBStatistics.java b/src/main/java/com/scienceminer/nerd/kb/model/KBStatistics.java new file mode 100644 index 00000000..ccf70e21 --- /dev/null +++ b/src/main/java/com/scienceminer/nerd/kb/model/KBStatistics.java @@ -0,0 +1,25 @@ +package com.scienceminer.nerd.kb.model; + + +import java.util.HashMap; +import java.util.Map; + +public class KBStatistics { + + public static final String CONCEPTS ="Concepts"; + public static final String LABELS ="Labels"; + public static final String STATEMENTS ="Statements"; + public static final String ARTICLES = "Articles"; + public static final String PAGES = "Pages"; + + private final Map upperKnowledgeBaseStatisticsCount = new HashMap<>(); + private final Map> lowerKnowledgeBaseStatisticsCount = new HashMap<>(); + + public Map getUpperKnowledgeBaseStatisticsCount() { + return upperKnowledgeBaseStatisticsCount; + } + + public Map> getLowerKnowledgeBaseStatisticsCount() { + return lowerKnowledgeBaseStatisticsCount; + } +} diff --git a/src/main/java/com/scienceminer/nerd/service/NerdRestKB.java 
b/src/main/java/com/scienceminer/nerd/service/NerdRestKB.java index e9fc47d7..27136622 100644 --- a/src/main/java/com/scienceminer/nerd/service/NerdRestKB.java +++ b/src/main/java/com/scienceminer/nerd/service/NerdRestKB.java @@ -7,11 +7,13 @@ import com.scienceminer.nerd.kb.*; import com.scienceminer.nerd.kb.db.WikipediaDomainMap; import com.scienceminer.nerd.kb.model.Article; +import com.scienceminer.nerd.kb.model.KBStatistics; import com.scienceminer.nerd.kb.model.Label; import com.scienceminer.nerd.kb.model.Page; import com.scienceminer.nerd.kb.model.Page.PageType; import com.scienceminer.nerd.utilities.mediaWiki.MediaWikiParser; import org.apache.commons.lang3.ArrayUtils; +import org.apache.commons.lang3.tuple.Pair; import org.grobid.core.lang.Language; import static com.scienceminer.nerd.kb.UpperKnowledgeBase.TARGET_LANGUAGES; @@ -23,9 +25,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; +import static com.scienceminer.nerd.kb.model.KBStatistics.*; import static org.apache.commons.lang3.StringUtils.isBlank; import static org.apache.commons.lang3.StringUtils.isEmpty; @@ -373,34 +378,28 @@ public String getWikidataIDByDOI(String doi) { return sb.toString(); } - public String getKbStatistics() { - StringBuilder sb = new StringBuilder(); - + public KBStatistics getKbStatistics() { + KBStatistics statistics = new KBStatistics(); UpperKnowledgeBase upperKb = UpperKnowledgeBase.getInstance(); - long entityCount = upperKb.getEntityCount(); - sb.append("{"); - sb.append("\"") - .append("wikidata_concepts") - .append("\"") - .append(":") - .append("\"") - .append(entityCount) - .append("\"") - .append(","); - - Map lowerKbWikipedias = upperKb.getWikipediaConfs(); - - for (Map.Entry entry : lowerKbWikipedias.entrySet()) { + + statistics.getUpperKnowledgeBaseStatisticsCount().put(CONCEPTS, upperKb.getEntityCount()); + 
statistics.getUpperKnowledgeBaseStatisticsCount().put(LABELS, upperKb.getLabelCount()); + statistics.getUpperKnowledgeBaseStatisticsCount().put(STATEMENTS, upperKb.getStatementCount()); + + Map lowerKbsByLang = upperKb.getWikipediaConfs(); + + for (Map.Entry entry : lowerKbsByLang.entrySet()) { + Map wikipediaCounter= new HashMap<>(); String wikipediaName = entry.getKey(); LowerKnowledgeBase kb = entry.getValue(); int articleCount = kb.getArticleCount(); + wikipediaCounter.put(ARTICLES, articleCount); + int pageCount = kb.getPageCount(); + wikipediaCounter.put(PAGES, pageCount); - sb.append("\"").append(wikipediaName).append("\"").append(":").append("\"").append(articleCount).append("\"").append(","); + statistics.getLowerKnowledgeBaseStatisticsCount().put(wikipediaName, wikipediaCounter); } - sb.append("}"); - - return sb.toString().replace(",}", "}"); - + return statistics; } } \ No newline at end of file From 9b7eeff29b0232aeec75ba28b0916c7de750ec7d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 27 Dec 2024 20:20:46 +0100 Subject: [PATCH 7/8] typo --- src/main/java/com/scienceminer/nerd/kb/db/BiblioDatabase.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/scienceminer/nerd/kb/db/BiblioDatabase.java b/src/main/java/com/scienceminer/nerd/kb/db/BiblioDatabase.java index c72d0bc1..9a5f1603 100644 --- a/src/main/java/com/scienceminer/nerd/kb/db/BiblioDatabase.java +++ b/src/main/java/com/scienceminer/nerd/kb/db/BiblioDatabase.java @@ -35,7 +35,7 @@ public KBEntry deserialiseCsvRecord( } /** - * Load the bilbiographical index + * Load the bibliographical index */ public void fillBiblioDb(ConceptDatabase conceptDb, StatementDatabase statementDb, boolean overwrite) throws Exception { if (isLoaded && !overwrite) From 2a849b9da9d9cc689b3c83af37aa8253624300aa Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 23 Dec 2024 17:50:41 +0100 Subject: [PATCH 8/8] add documentation --- doc/build.rst | 8 ++++- doc/restAPI.rst | 
82 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/doc/build.rst b/doc/build.rst index 826a025a..5645cf32 100644 --- a/doc/build.rst +++ b/doc/build.rst @@ -137,4 +137,10 @@ Prometheus metrics (e.g. for Graphana monitoring) are available at http://localh Creating a new Knowledge Base version ************************************* -The knowledge base used by *entity-fishing* can be updated with new versions of Wikidata and Wikipedia using the pre-processing from the library `GRISP `_, see `https://github.com/kermitt2/grisp `_. +The knowledge base used by *entity-fishing* can be updated with new versions of Wikidata and Wikipedia using the pre-processing from the library `GRISP <https://github.com/kermitt2/grisp>`_. + +The files generated by GRISP (see `listing all necessary files `_) should be used via the configuration: + + - ``dataDirectory`` in the files ``wikipedia-XY.yml`` (with XY equal to the language, e.g. ``en``, ``fr``) for the Wikipedia related knowledge base. Note: the ``XYwiki-latest-pages-articles-multistream.xml.bz2`` file can be left compressed. + + - ``dataDirectory`` in the file ``kb.yml`` for the Wikidata knowledge base (db-kb) diff --git a/doc/restAPI.rst b/doc/restAPI.rst index d3b263e0..25d6e7a9 100644 --- a/doc/restAPI.rst +++ b/doc/restAPI.rst @@ -1295,3 +1295,85 @@ Or in case of issues: "ok": "false", "message": "The customisation already exists." } + +Data and statistics API +*********************** + +GET /data +^^^^^^^^^ + +Retrieve information about the loaded data, showing the number of concepts per knowledge base. 
+ +(1) Example response + +Here is a sample of the response +:: + { + "upperKnowledgeBaseStatisticsCount": { + "Concepts": 113276007, + "Labels": 113331134, + "Statements": 112505569 + }, + "lowerKnowledgeBaseStatisticsCount": { + "de": { + "Pages": 0, + "Articles": 0 + }, + "hi": { + "Pages": 0, + "Articles": 0 + }, + "ru": { + "Pages": 0, + "Articles": 0 + }, + "sv": { + "Pages": 0, + "Articles": 0 + }, + "pt": { + "Pages": 0, + "Articles": 0 + }, + "en": { + "Pages": 20279663, + "Articles": 6649343 + }, + "it": { + "Pages": 0, + "Articles": 0 + }, + "fr": { + "Pages": 0, + "Articles": 0 + }, + "bn": { + "Pages": 0, + "Articles": 0 + }, + "es": { + "Pages": 0, + "Articles": 0 + }, + "zh": { + "Pages": 0, + "Articles": 0 + }, + "ar": { + "Pages": 0, + "Articles": 0 + }, + "uk": { + "Pages": 0, + "Articles": 0 + }, + "ja": { + "Pages": 0, + "Articles": 0 + }, + "fa": { + "Pages": 0, + "Articles": 0 + } + } + }