diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index 365e5bee56..ae728df75f 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -6,7 +6,6 @@ import org.grobid.core.data.util.AuthorEmailAssigner; import org.grobid.core.data.util.ClassicAuthorEmailAssigner; import org.grobid.core.data.util.EmailSanitizer; -import org.grobid.core.data.CopyrightsLicense; import org.grobid.core.document.*; import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.exceptions.GrobidException; @@ -22,7 +21,6 @@ import org.grobid.core.utilities.KeyGen; import org.grobid.core.utilities.LayoutTokensUtil; import org.grobid.core.GrobidModels; -import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.engines.label.TaggingLabels; import java.net.URLEncoder; @@ -382,6 +380,9 @@ public String toString() { // Copyrights/license information object CopyrightsLicense copyrightsLicense = null; + // All the tokens that are considered noise will be collected here + private List> discardedPiecesTokens = new ArrayList<>(); + public static final List confPrefixes = Arrays.asList("Proceedings of", "proceedings of", "In Proceedings of the", "In: Proceeding of", "In Proceedings, ", "In Proceedings of", "In Proceeding of", "in Proceeding of", "in Proceeding", "In Proceeding", "Proceedings", @@ -4522,4 +4523,16 @@ public void setCopyrightsLicense(CopyrightsLicense copyrightsLicense) { public CopyrightsLicense getCopyrightsLicense() { return this.copyrightsLicense; } + + public List> getDiscardedPiecesTokens() { + return discardedPiecesTokens; + } + + public void setDiscardedPiecesTokens(List> discardedPiecesTokens) { + this.discardedPiecesTokens = discardedPiecesTokens; + } + + public void addDiscardedPieceTokens(List pieceToken) { + this.discardedPiecesTokens.add(pieceToken); + } } 
diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index ef4117e93c..3b7666c1e9 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -88,6 +88,8 @@ public boolean apply(GraphicObject graphicObject) { private List textArea; private List layoutTokens; + private List> discardedPiecesTokens = new ArrayList<>(); + // coordinates private int page = -1; private double y = 0.0; @@ -568,4 +570,16 @@ public void setLabel(StringBuilder label) { public void setUri(URI uri) { this.uri = uri; } + + public List> getDiscardedPiecesTokens() { + return discardedPiecesTokens; + } + + public void setDiscardedPiecesTokens(List> discardedPiecesTokens) { + this.discardedPiecesTokens = discardedPiecesTokens; + } + + public void addDiscardedPieceTokens(List pieceToken) { + this.discardedPiecesTokens.add(pieceToken); + } } diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 14d468418c..db0f5e9446 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -49,6 +49,8 @@ public class Table extends Figure { private List noteLayoutTokens = null; private String labeledNote = null; + private List> discardedPiecesTokens = new ArrayList<>(); + public void setGoodTable(boolean goodTable) { this.goodTable = goodTable; @@ -423,4 +425,16 @@ public boolean isGoodTable() { public String getTeiId() { return "tab_" + this.id; } + + public List> getDiscardedPiecesTokens() { + return discardedPiecesTokens; + } + + public void setDiscardedPiecesTokens(List> discardedPiecesTokens) { + this.discardedPiecesTokens = discardedPiecesTokens; + } + + public void addDiscardedPieceTokens(List pieceToken) { + this.discardedPiecesTokens.add(pieceToken); + } } \ No newline at end of file diff 
--git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index d9f2c46006..d546ba7040 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -103,16 +103,25 @@ public StringBuilder toTEIHeader(BiblioItem biblio, List markerTypes, List fundings, GrobidAnalysisConfig config) { - return toTEIHeader(biblio, SchemaDeclaration.XSD, defaultPublicationStatement, bds, markerTypes, fundings, config); + return toTEIHeader( + biblio, + SchemaDeclaration.XSD, + defaultPublicationStatement, + bds, + markerTypes, + fundings, + config + ); } - public StringBuilder toTEIHeader(BiblioItem biblio, - SchemaDeclaration schemaDeclaration, - String defaultPublicationStatement, - List bds, - List markerTypes, - List fundings, - GrobidAnalysisConfig config) { + public StringBuilder toTEIHeader( + BiblioItem biblio, + SchemaDeclaration schemaDeclaration, + String defaultPublicationStatement, + List bds, + List markerTypes, + List fundings, + GrobidAnalysisConfig config) { StringBuilder tei = new StringBuilder(); tei.append("\n"); if (config.isWithXslStylesheet()) { @@ -153,11 +162,9 @@ public StringBuilder toTEIHeader(BiblioItem biblio, if (config.isGenerateTeiCoordinates("title")) { List titleTokens = biblio.getLayoutTokens(TaggingLabels.HEADER_TITLE); - if (titleTokens != null && titleTokens.size()>0) { + if (CollectionUtils.isNotEmpty(titleTokens)) { String coords = LayoutTokensUtil.getCoordsString(titleTokens); - if (coords != null) { - tei.append(" coords=\"" + coords + "\""); - } + tei.append(" coords=\"" + coords + "\""); } } @@ -174,8 +181,7 @@ public StringBuilder toTEIHeader(BiblioItem biblio, tei.append("\n"); - if (fundings != null && fundings.size()>0) { - + if (CollectionUtils.isNotEmpty(fundings)) { Map funderSignatures = new TreeMap<>(); for(Funding funding : fundings) { if 
(funding.getFunder() != null && funding.getFunder().getFullName() != null) { @@ -218,8 +224,9 @@ public StringBuilder toTEIHeader(BiblioItem biblio, for (Map.Entry> entry : fundingRelation.entrySet()) { String funderPiece = null; Funder consolidatedFunder = null; - if (consolidatedFunders != null) + if (consolidatedFunders != null) { consolidatedFunder = consolidatedFunders.get(n); + } if (consolidatedFunder != null && config.getConsolidateFunders() == 1) { funderPiece = consolidatedFunder.toTEI(4); @@ -227,19 +234,20 @@ public StringBuilder toTEIHeader(BiblioItem biblio, Funder localFunder = entry.getKey(); localFunder.setDoi(consolidatedFunder.getDoi()); funderPiece = localFunder.toTEI(4); - } else + } else { funderPiece = entry.getKey().toTEI(4); + } // inject funding ref in the funder entries - String referenceString = ""; + StringBuilder referenceString = new StringBuilder(); for(Funding funderFunding : entry.getValue()) { if (funderFunding.isNonEmptyFunding()) - referenceString += " #" + funderFunding.getIdentifier(); + referenceString.append(" #").append(funderFunding.getIdentifier()); } if (funderPiece != null) { if (referenceString.length()>0) - funderPiece = funderPiece.replace("", ""); + funderPiece = funderPiece.replace("", ""); tei.append(funderPiece); } n++; @@ -440,11 +448,9 @@ else if (biblio.getE_Year().length() == 4) if (config.isGenerateTeiCoordinates("title")) { List titleTokens = biblio.getLayoutTokens(TaggingLabels.HEADER_TITLE); - if (titleTokens != null && titleTokens.size()>0) { + if (CollectionUtils.isNotEmpty(titleTokens)) { String coords = LayoutTokensUtil.getCoordsString(titleTokens); - if (coords != null) { - tei.append(" coords=\"" + coords + "\""); - } + tei.append(" coords=\"" + coords + "\""); } } @@ -537,7 +543,7 @@ else if (biblio.getE_Year().length() == 4) // // TODO: // } - // in case the booktitle corresponds to a proceedings, we can try to indicate the meeting title + // in case the book title corresponds to a proceedings, we 
can try to indicate the meeting title String meeting = biblio.getBookTitle(); boolean meetLoc = false; if (biblio.getEvent() != null) @@ -592,11 +598,14 @@ else if (meeting != null) { String pageRange = biblio.getPageRange(); - if ((biblio.getVolumeBlock() != null) | (biblio.getPublicationDate() != null) | - (biblio.getNormalizedPublicationDate() != null) | - (pageRange != null) | (biblio.getIssue() != null) | - (biblio.getBeginPage() != -1) | - (biblio.getPublisher() != null)) { + if (biblio.getVolumeBlock() != null + || biblio.getPublicationDate() != null + || biblio.getNormalizedPublicationDate() != null + || pageRange != null + || biblio.getIssue() != null + || biblio.getBeginPage() != -1 + || biblio.getPublisher() != null) { + tei.append("\t\t\t\t\t\t\n"); if (biblio.getPublisher() != null) { @@ -814,6 +823,24 @@ else if (biblio.getE_Year().length() == 4) } tei.append("\t\t\t\n"); + + // We collect the discarded text from the header and add it as a + if(config.isIncludeDiscardedText()) { + tei.append("\t\t\t\n"); + for (List discardedPieceTokens : biblio.getDiscardedPiecesTokens()) { + LayoutToken first = Iterables.getFirst(discardedPieceTokens, null); + String place = first == null ? 
"unknown" : first.getLabels().get(0).getGrobidModel().getModelName(); + + tei.append("\t\t\t\t" + TextUtilities.HTMLEncode(normalizeText(LayoutTokensUtil.toText(discardedPieceTokens))) + "\n"); + } + tei.append("\t\t\t\n"); + } + tei.append("\t\t\n"); // encodingDesc gives info about the producer of the file @@ -923,24 +950,25 @@ else if (biblio.getE_Year().length() == 4) tei.append("\t\t\t\n"); } - if ((abstractText != null) && (abstractText.length() != 0)) { - if ( (biblio.getLabeledAbstract() != null) && (biblio.getLabeledAbstract().length() > 0) ) { + if (StringUtils.isNotBlank(abstractText)) { + if (StringUtils.isNotBlank (biblio.getLabeledAbstract()) ) { // we have available structured abstract, which can be serialized as a full text "piece" StringBuilder buffer = new StringBuilder(); try { - buffer = toTEITextPiece(buffer, - biblio.getLabeledAbstract(), - biblio, - bds, - false, - new LayoutTokenization(biblio.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT)), - null, - null, - null, - null, - markerTypes, - doc, - config); // no figure, no table, no equation + buffer = toTEITextPiece( + buffer, + biblio.getLabeledAbstract(), + biblio, + bds, + false, + new LayoutTokenization(biblio.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT)), + null, + null, + null, + null, + markerTypes, + doc, + config); // no figure, no table, no equation } catch(Exception e) { throw new GrobidException("An exception occurred while serializing TEI.", e); } @@ -1065,8 +1093,21 @@ public StringBuilder toTEIBody(StringBuilder buffer, List notes = getTeiNotes(doc); - buffer = toTEITextPiece(buffer, result, biblio, bds, true, - layoutTokenization, figures, tables, equations, notes, markerTypes, doc, config); + buffer = toTEITextPiece( + buffer, + result, + biblio, + bds, + true, + layoutTokenization, + figures, + tables, + equations, + notes, + markerTypes, + doc, + config + ); // notes are still in the body buffer = toTEINote(buffer, notes, doc, markerTypes, config); @@ -1396,19 +1437,20 
@@ public StringBuilder toTEIAnnex(StringBuilder buffer, return buffer; } - public StringBuilder toTEITextPiece(StringBuilder buffer, - String result, - BiblioItem biblio, - List bds, - boolean keepUnsolvedCallout, - LayoutTokenization layoutTokenization, - List
figures, - List tables, - List equations, - List notes, - List markerTypes, - Document doc, - GrobidAnalysisConfig config) throws Exception { + public StringBuilder toTEITextPiece( + StringBuilder buffer, + String result, + BiblioItem biblio, + List bds, + boolean keepUnsolvedCallout, + LayoutTokenization layoutTokenization, + List
figures, + List
tables, + List equations, + List notes, + List markerTypes, + Document doc, + GrobidAnalysisConfig config) throws Exception { TaggingLabel lastClusterLabel = null; int startPosition = buffer.length(); @@ -1472,7 +1514,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, clusterLabel.equals(TaggingLabels.EQUATION_LABEL)) { // get starting position of the cluster int start = -1; - if ( (cluster.concatTokens() != null) && (cluster.concatTokens().size() > 0) ) { + if (CollectionUtils.isNotEmpty (cluster.concatTokens()) ) { start = cluster.concatTokens().get(0).getOffset(); } // get the corresponding equation @@ -2505,7 +2547,7 @@ public List markReferencesEquationTEI(String text, String bestFormula = null; if (equations != null) { for (Equation equation : equations) { - if ((equation.getLabel() != null) && (equation.getLabel().length() > 0)) { + if (StringUtils.isNotBlank(equation.getLabel())) { String label = TextUtilities.cleanField(equation.getLabel(), false); Matcher m2 = patternNumber.matcher(label); String labelNumber = null; diff --git a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java index 296b685114..71bc477d1e 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java @@ -344,6 +344,7 @@ public String processHeader( int consolidate, boolean includeRawAffiliations, boolean includeRawCopyrights, + boolean includeDiscardedText, BiblioItem result ) { GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() @@ -352,6 +353,7 @@ public String processHeader( .consolidateHeader(consolidate) .includeRawAffiliations(includeRawAffiliations) .includeRawCopyrights(includeRawCopyrights) + .includeDiscardedText(includeDiscardedText) .build(); return processHeader(inputFile, null, config, result); } @@ -376,13 +378,15 @@ public String processHeaderFunding( int consolidateHeader, 
int consolidateFunders, boolean includeRawAffiliations, - boolean includeRawCopyrights + boolean includeRawCopyrights, + boolean includeDiscardedText ) throws Exception { GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() .consolidateHeader(consolidateHeader) .consolidateFunders(consolidateFunders) .includeRawAffiliations(includeRawAffiliations) .includeRawCopyrights(includeRawCopyrights) + .includeDiscardedText(includeDiscardedText) .build(); return processHeaderFunding(inputFile, null, config); } @@ -406,6 +410,7 @@ public String processHeader( int consolidate, boolean includeRawAffiliations, boolean includeRawCopyrights, + boolean includeDiscardedText, BiblioItem result ) { GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() @@ -414,6 +419,7 @@ public String processHeader( .consolidateHeader(consolidate) .includeRawAffiliations(includeRawAffiliations) .includeRawCopyrights(includeRawCopyrights) + .includeDiscardedText(includeDiscardedText) .build(); return processHeader(inputFile, md5Str, config, result); } @@ -440,13 +446,15 @@ public String processHeaderFunding( int consolidateHeader, int consolidateFunders, boolean includeRawAffiliations, - boolean includeRawCopyrights + boolean includeRawCopyrights, + boolean includeDiscardedText ) throws Exception { GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() .consolidateHeader(consolidateHeader) .consolidateFunders(consolidateFunders) .includeRawAffiliations(includeRawAffiliations) .includeRawCopyrights(includeRawCopyrights) + .includeDiscardedText(includeDiscardedText) .build(); return processHeaderFunding(inputFile, md5Str, config); } @@ -456,15 +464,12 @@ public String processHeaderFunding( * dynamic range of pages as header * * @param inputFile : the path of the PDF file to be processed - * @param consolidate the consolidation option allows GROBID to exploit Crossref web services for improving header - * 
information. 0 (no consolidation, default value), 1 (consolidate the citation and inject extra - * metadata) or 2 (consolidate the citation and inject DOI only) * @param result bib result * * @return the TEI representation of the extracted bibliographical * information */ - public String processHeader(String inputFile, int consolidate, BiblioItem result) { + public String processHeader(String inputFile, BiblioItem result) { return processHeader(inputFile, null, GrobidAnalysisConfig.defaultInstance(), result); } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FigureParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FigureParser.java index 49511969e7..5ff7f386b9 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FigureParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FigureParser.java @@ -74,7 +74,7 @@ private Figure getExtractionResult(List tokenizations, String resul //label should also go to head figure.appendHeader(" " + clusterContent + " "); } else if (clusterLabel.equals(FIG_OTHER)) { - + figure.addDiscardedPieceTokens(cluster.concatTokens()); } else if (clusterLabel.equals(FIG_CONTENT)) { figure.appendContent(clusterContent); } else { diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index b102b20795..28f0f00031 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -57,19 +57,14 @@ import java.io.OutputStreamWriter; import java.io.Writer; -import java.util.Arrays; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.HashMap; -import java.util.SortedSet; -import java.util.StringTokenizer; -import java.util.TreeSet; +import java.util.*; import java.util.regex.Matcher; +import java.util.stream.Collectors; import nu.xom.Element; import static 
org.apache.commons.lang3.StringUtils.*; +import static org.grobid.core.engines.label.TaggingLabels.PARAGRAPH_LABEL; public class FullTextParser extends AbstractParser { private static final Logger LOGGER = LoggerFactory.getLogger(FullTextParser.class); @@ -276,6 +271,9 @@ else if (config.getConsolidateCitations() == 2) figure.setLabeledCaption(captionProcess.getLeft()); figure.setCaptionLayoutTokens(captionProcess.getRight()); } + if (CollectionUtils.isNotEmpty(figure.getDiscardedPiecesTokens())) { + resHeader.getDiscardedPiecesTokens().addAll(figure.getDiscardedPiecesTokens()); + } } tables = processTables(resultBody, layoutTokenization.getTokenization(), doc); @@ -291,6 +289,9 @@ else if (config.getConsolidateCitations() == 2) table.setLabeledNote(noteProcess.getLeft()); table.setNoteLayoutTokens(noteProcess.getRight()); } + if (CollectionUtils.isNotEmpty(table.getDiscardedPiecesTokens())) { + resHeader.getDiscardedPiecesTokens().addAll(table.getDiscardedPiecesTokens()); + } } equations = processEquations(resultBody, layoutTokenization.getTokenization(), doc); @@ -316,15 +317,21 @@ else if (config.getConsolidateCitations() == 2) // callout in superscript is by error labeled as a numerical reference callout) List markerTypes = null; - if (resultBody != null) + if (resultBody != null) { markerTypes = postProcessCallout(resultBody, layoutTokenization); + } // final combination toTEI(doc, // document - resultBody, resultAnnex, // labeled data for body and annex - layoutTokenization, tokenizationsBody2, // tokenization for body and annex + resultBody, + resultAnnex, // labeled data for body and annex + layoutTokenization, + tokenizationsBody2, // tokenization for body and annex resHeader, // header - figures, tables, equations, markerTypes, + figures, + tables, + equations, + markerTypes, config); return doc; } catch (GrobidException e) { @@ -355,7 +362,6 @@ public Document processingHeaderFunding(DocumentSource documentSource, try { // general segmentation Document 
doc = parsers.getSegmentationParser().processing(documentSource, config); - SortedSet documentBodyParts = doc.getDocumentPart(SegmentationLabels.BODY); // header processing BiblioItem resHeader = new BiblioItem(); @@ -1571,9 +1577,6 @@ public Document createTraining(File inputFile, /** * Extract results from a labelled full text in the training format without any string modification. * - * @param result reult - * @param tokenizations toks - * @return extraction */ private StringBuilder trainingExtraction(String result, List tokenizations) { @@ -2687,7 +2690,14 @@ private void toTEI(Document doc, } } - tei.append(teiFormatter.toTEIHeader(resHeader, null, resCitations, markerTypes, fundings, config)); + tei.append(teiFormatter.toTEIHeader( + resHeader, + null, + resCitations, + markerTypes, + fundings, + config) + ); tei = teiFormatter.toTEIBody(tei, reseBody, resHeader, resCitations, layoutTokenization, figures, tables, equations, markerTypes, doc, config); @@ -2699,7 +2709,7 @@ private void toTEI(Document doc, tei.append(annexStatement); } - if (fundings != null && fundings.size() >0) { + if (CollectionUtils.isNotEmpty(fundings)) { tei.append("\n\t\t\t\n"); for(Funding funding : fundings) { if (funding.isNonEmptyFunding()) @@ -2708,7 +2718,7 @@ private void toTEI(Document doc, tei.append("\t\t\t\n"); } - if (affiliations != null && affiliations.size() >0) { + if (CollectionUtils.isNotEmpty(affiliations)) { // check if we have at least one acknowledged research infrastructure here List filteredInfrastructures = new ArrayList<>(); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 185f3714d5..4636a7459f 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -93,7 +93,12 @@ public Pair processing(File input, String md5Str, BiblioItem r /** * Header processing 
after application of the segmentation model */ - public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, BiblioItem resHeader, boolean serialize) { + public String processingHeaderSection( + GrobidAnalysisConfig config, + Document doc, + BiblioItem resHeader, + boolean serialize + ) { try { SortedSet documentHeaderParts = doc.getDocumentPart(SegmentationLabels.HEADER); List tokenizations = doc.getTokenizations(); @@ -1047,7 +1052,9 @@ else if (biblio.getPublicationDate() == null) }*/ if (biblio.getJournal() == null) biblio.setJournal(clusterContent); - } + } else if (clusterLabel.equals(TaggingLabels.HEADER_OTHER)) { + biblio.addDiscardedPieceTokens(cluster.concatTokens()); + } /*else if (clusterLabel.equals(TaggingLabels.HEADER_INTRO)) { return biblio; }*/ diff --git a/grobid-core/src/main/java/org/grobid/core/engines/ProcessEngine.java b/grobid-core/src/main/java/org/grobid/core/engines/ProcessEngine.java index 435948ee44..f7719262d7 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/ProcessEngine.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/ProcessEngine.java @@ -91,7 +91,7 @@ private void processHeaderDirectory(File[] files, final GrobidMainArgs pGbdArgs, for (final File currPdf : files) { try { if (currPdf.getName().toLowerCase().endsWith(".pdf")) { - result = getEngine().processHeader(currPdf.getAbsolutePath(), 0, null); + result = getEngine().processHeader(currPdf.getAbsolutePath(), null); File outputPathFile = new File(outputPath); if (!outputPathFile.exists()) { outputPathFile.mkdirs(); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java index d1841f5e86..c58b96efa4 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java @@ -103,7 +103,7 @@ private List
getExtractionResult(List tokenizations, String table.addAllNoteLayoutTokens(tokens); table.addLayoutTokens(tokens); } else if (clusterLabel.equals(TBL_OTHER)) { - table.addLayoutTokens(tokens); + table.addDiscardedPieceTokens(cluster.concatTokens()); } else if (clusterLabel.equals(TBL_CONTENT)) { table.appendContent(clusterContent); table.getContentTokens().addAll(tokens); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java b/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java index 52938eca81..72bd7856a2 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java @@ -59,6 +59,9 @@ private GrobidAnalysisConfig() { // if the raw copyrights/license string should be included in the parsed results private boolean includeRawCopyrights = false; + //if the text marked as in fulltext and header should be retained + private boolean includeDiscardedText = false; + /// === TEI-specific settings == // if true, generate random attribute id on the textual elements of @@ -90,6 +93,14 @@ private GrobidAnalysisConfig() { // if true, the TEI text will be segmented into sentences private boolean withSentenceSegmentation = false; + public boolean isIncludeDiscardedText() { + return includeDiscardedText; + } + + public void setIncludeDiscardedText(boolean includeDiscardedText) { + this.includeDiscardedText = includeDiscardedText; + } + // BUILDER public static class GrobidAnalysisConfigBuilder { @@ -139,6 +150,11 @@ public GrobidAnalysisConfigBuilder includeRawCopyrights(boolean rawCopyrights) { return this; } + public GrobidAnalysisConfigBuilder includeDiscardedText(boolean includeDiscardedText) { + config.includeDiscardedText = includeDiscardedText; + return this; + } + public GrobidAnalysisConfigBuilder startPage(int p) { config.startPage = p; return this; diff --git 
a/grobid-core/src/test/java/org/grobid/core/test/TestHeaderParser.java b/grobid-core/src/test/java/org/grobid/core/test/TestHeaderParser.java index 8414d5d4cc..6b38781eb7 100755 --- a/grobid-core/src/test/java/org/grobid/core/test/TestHeaderParser.java +++ b/grobid-core/src/test/java/org/grobid/core/test/TestHeaderParser.java @@ -39,7 +39,7 @@ public void testHeaderHeader() throws Exception { File pdfFile = new File(pdfPath); BiblioItem resHeader = new BiblioItem(); - String tei = engine.processHeader(pdfFile.getAbsolutePath(), 0, resHeader); + String tei = engine.processHeader(pdfFile.getAbsolutePath(), resHeader); assertNotNull(resHeader); assertThat(resHeader.getTitle(), is("Information Synthesis for Answer Validation")); @@ -50,21 +50,21 @@ public void testHeaderHeader() throws Exception { String absolutePath = FileSystems.getDefault().getPath(testPath).normalize().toAbsolutePath().toString(); pdfPath = absolutePath + File.separator + "ZFN-A-054-0304-0272.pdf"; resHeader = new BiblioItem(); - tei = engine.processHeader(pdfPath, 0, resHeader); + tei = engine.processHeader(pdfPath, resHeader); assertNotNull(resHeader); //System.out.println(tei); pdfPath = absolutePath + File.separator + "ZNC-1988-43c-0034.pdf"; resHeader = new BiblioItem(); - tei = engine.processHeader(pdfPath, 0, resHeader); + tei = engine.processHeader(pdfPath, resHeader); //System.out.println(tei); //assertNotNull(resHeader); pdfPath = absolutePath + File.separator + "ZNC-1988-43c-0065.pdf"; resHeader = new BiblioItem(); - tei = engine.processHeader(pdfPath, 0, resHeader); + tei = engine.processHeader(pdfPath, resHeader); assertNotNull(resHeader); //System.out.println(tei); diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java index 5659d02393..73fbdf34e1 100755 --- a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java +++ 
b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java @@ -54,6 +54,7 @@ public class GrobidRestService implements GrobidPaths { public static final String CONSOLIDATE_FUNDERS = "consolidateFunders"; public static final String INCLUDE_RAW_AFFILIATIONS = "includeRawAffiliations"; public static final String INCLUDE_RAW_CITATIONS = "includeRawCitations"; + public static final String INCLUDE_DISCARDED_TEXT = "includeDiscardedText"; public static final String INCLUDE_RAW_COPYRIGHTS = "includeRawCopyrights"; public static final String INCLUDE_FIGURES_TABLES = "includeFiguresTables"; @@ -136,12 +137,14 @@ public Response processHeaderDocumentReturnXml_post( @FormDataParam(INPUT) InputStream inputStream, @DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidate, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, - @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) { + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, + @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText) { int consol = validateConsolidationParam(consolidate); return restProcessFiles.processStatelessHeaderDocument( inputStream, consol, validateIncludeRawParam(includeRawAffiliations), validateIncludeRawParam(includeRawCopyrights), + validateIncludeRawParam(includeDiscardedText), ExpectedResponseType.XML ); } @@ -155,12 +158,15 @@ public Response processHeaderFundingDocumentReturnXml_post( @DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidateHeader, @DefaultValue("0") @FormDataParam(CONSOLIDATE_FUNDERS) String consolidateFunders, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, - @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) { + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, + 
@DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText) { int consolHeader = validateConsolidationParam(consolidateHeader); int consolFunders = validateConsolidationParam(consolidateFunders); return restProcessFiles.processStatelessHeaderFundingDocument( inputStream, consolHeader, consolFunders, - validateIncludeRawParam(includeRawAffiliations), validateIncludeRawParam(includeRawCopyrights) + validateIncludeRawParam(includeRawAffiliations), + validateIncludeRawParam(includeRawCopyrights), + validateIncludeRawParam(includeDiscardedText) ); } @@ -173,8 +179,9 @@ public Response processStatelessHeaderDocumentReturnXml( @FormDataParam(INPUT) InputStream inputStream, @DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidate, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, - @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) { - return processHeaderDocumentReturnXml_post(inputStream, consolidate, includeRawAffiliations, includeRawCopyrights); + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, + @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText) { + return processHeaderDocumentReturnXml_post(inputStream, consolidate, includeRawAffiliations, includeRawCopyrights, includeDiscardedText); } @Path(PATH_HEADER) @@ -188,7 +195,8 @@ public Response processHeaderDocumentReturnBibTeX_post( @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) { int consol = validateConsolidationParam(consolidate); return restProcessFiles.processStatelessHeaderDocument( - inputStream, consol, + inputStream, + consol, validateIncludeRawParam(includeRawAffiliations), validateIncludeRawParam(includeRawCopyrights), ExpectedResponseType.BIBTEX @@ -204,7 +212,11 @@ public Response processStatelessHeaderDocumentReturnBibTeX( @DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) 
String consolidate, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) { - return processHeaderDocumentReturnBibTeX_post(inputStream, consolidate, includeRawAffiliations, includeRawCopyrights); + return processHeaderDocumentReturnBibTeX_post( + inputStream, + consolidate, + includeRawAffiliations, + includeRawCopyrights); } @Path(PATH_FULL_TEXT) @@ -219,6 +231,7 @@ public Response processFulltextDocument_post( @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_CITATIONS) String includeRawCitations, + @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText, @DefaultValue("-1") @FormDataParam("start") int startPage, @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @@ -226,7 +239,7 @@ public Response processFulltextDocument_post( @FormDataParam("teiCoordinates") List coordinates) throws Exception { return processFulltext( inputStream, consolidateHeader, consolidateCitations, consolidateFunders, - includeRawAffiliations, includeRawCitations, includeRawCopyrights, + includeRawAffiliations, includeRawCitations, includeRawCopyrights, includeDiscardedText, startPage, endPage, generateIDs, segmentSentences, coordinates ); } @@ -243,6 +256,7 @@ public Response processFulltextDocument( @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_CITATIONS) String includeRawCitations, + @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText, @DefaultValue("-1") @FormDataParam("start") int startPage, 
@DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @@ -250,7 +264,7 @@ public Response processFulltextDocument( @FormDataParam("teiCoordinates") List coordinates) throws Exception { return processFulltext( inputStream, consolidateHeader, consolidateCitations, consolidateFunders, - includeRawAffiliations, includeRawCitations, includeRawCopyrights, + includeRawAffiliations, includeRawCitations, includeRawCopyrights, includeDiscardedText, startPage, endPage, generateIDs, segmentSentences, coordinates ); } @@ -262,6 +276,7 @@ private Response processFulltext(InputStream inputStream, String includeRawAffiliations, String includeRawCitations, String includeRawCopyrights, + String includeDiscardedText, int startPage, int endPage, String generateIDs, @@ -280,7 +295,9 @@ private Response processFulltext(InputStream inputStream, return restProcessFiles.processFulltextDocument( inputStream, consolHeader, consolCitations, consolFunders, validateIncludeRawParam(includeRawAffiliations), - includeRaw, validateIncludeRawParam(includeRawCopyrights), + includeRaw, + validateIncludeRawParam(includeRawCopyrights), + validateIncludeRawParam(includeDiscardedText), startPage, endPage, generate, segment, teiCoordinates ); } diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java index 4692fe3de9..142f93a43f 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java @@ -60,11 +60,29 @@ public GrobidRestProcessFiles() { * @param consolidate consolidation parameter for the header extraction * @return a response object which contains a TEI representation of the header part */ + public Response processStatelessHeaderDocument( + final InputStream inputStream, + final int consolidate, + final 
boolean includeRawAffiliations, + final boolean includeRawCopyrights, + ExpectedResponseType expectedResponseType + ) { + return processStatelessHeaderDocument( + inputStream, + consolidate, + includeRawAffiliations, + includeRawCopyrights, + false, + expectedResponseType + ); + } + public Response processStatelessHeaderDocument( final InputStream inputStream, final int consolidate, final boolean includeRawAffiliations, final boolean includeRawCopyrights, + final boolean includeDiscardedText, ExpectedResponseType expectedResponseType ) { LOGGER.debug(methodLogIn()); @@ -103,6 +121,7 @@ public Response processStatelessHeaderDocument( consolidate, includeRawAffiliations, includeRawCopyrights, + includeDiscardedText, result ); @@ -153,7 +172,8 @@ public Response processStatelessHeaderFundingDocument( final int consolidateHeader, final int consolidateFunders, final boolean includeRawAffiliations, - final boolean includeRawCopyrights + final boolean includeRawCopyrights, + final boolean includeDiscardedText ) { LOGGER.debug(methodLogIn()); String retVal = null; @@ -189,7 +209,8 @@ public Response processStatelessHeaderFundingDocument( consolidateHeader, consolidateFunders, includeRawAffiliations, - includeRawCopyrights + includeRawCopyrights, + includeDiscardedText ); if (GrobidRestUtils.isResultNullOrEmpty(retVal)) { @@ -245,6 +266,7 @@ public Response processFulltextDocument(final InputStream inputStream, final boolean includeRawAffiliations, final boolean includeRawCitations, final boolean includeRawCopyrights, + final boolean includeDiscardedText, final int startPage, final int endPage, final boolean generateIDs, @@ -286,6 +308,7 @@ public Response processFulltextDocument(final InputStream inputStream, .includeRawAffiliations(includeRawAffiliations) .includeRawCitations(includeRawCitations) .includeRawCopyrights(includeRawCopyrights) + .includeDiscardedText(includeDiscardedText) .startPage(startPage) .endPage(endPage) .generateTeiIds(generateIDs)