From f95a5be5dc85f23bcf12830a7ca95e6625e1a731 Mon Sep 17 00:00:00 2001
From: noureldin-eg
Date: Sat, 11 Feb 2023 16:50:28 +0200
Subject: [PATCH] PDFBOX-5487: Remove all space characters if contained within the adjacent letters

https://issues.apache.org/jira/browse/PDFBOX-5487
---
 .../apache/pdfbox/text/PDFTextStripper.java   | 37 ++++++++++++++
 .../org/apache/pdfbox/text/TextPosition.java  | 50 +++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java b/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
index 4be7defe333..7f2a8bf969e 100644
--- a/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
+++ b/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
@@ -508,6 +508,10 @@ protected void writePage() throws IOException
                 {
                     IterativeMergeSort.sort(textList, comparator);
                 }
+                finally
+                {
+                    removeContainedSpaces(textList);
+                }
             }
 
             startArticle();
@@ -709,6 +713,39 @@ private boolean overlap(float y1, float height1, float y2, float height2)
                 || y1 <= y2 && y1 >= y2 - height2;
     }
 
+    /**
+     * Remove all space characters if contained within the adjacent letters.
+     * <p>
+     * A position is removed when its text is exactly a single space and the closest preceding
+     * position that was kept {@link TextPosition#completelyContains(TextPosition) contains} it.
+     *
+     * @param textList the text positions to filter; contained spaces are removed in place.
+     */
+    private void removeContainedSpaces(List<TextPosition> textList)
+    {
+        TextPosition position, previousPosition;
+        Iterator<TextPosition> iterator = textList.iterator();
+
+        if (!iterator.hasNext())
+        {
+            return;
+        }
+        previousPosition = iterator.next();
+
+        while (iterator.hasNext())
+        {
+            position = iterator.next();
+            if (" ".equals(position.getUnicode()) && previousPosition.completelyContains(position))
+            {
+                iterator.remove();
+            }
+            else
+            {
+                previousPosition = position;
+            }
+        }
+    }
+
     /**
      * Write the line separator value to the output stream.
      *
diff --git a/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java b/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
index f4ba18d0344..227d82f50d9 100644
--- a/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
+++ b/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
@@ -570,6 +570,56 @@ else if (tp2Xstart < thisXstart && tp2Xend < thisXend)
         return true;
     }
 
+    /**
+     * Determine if this TextPosition perfectly contains another (i.e. the other TextPosition
+     * overlaps 100% with this one and fits entirely inside its bounding box when they are rendered
+     * on top of each other).
+     *
+     * @param tp2 The other TextPosition to compare against
+     * @return True if tp2 is contained completely inside the bounding box of this text.
+     */
+    public boolean completelyContains(TextPosition tp2)
+    {
+        // Note: (0, 0) is in the upper left and y-coordinate is top of TextPosition
+
+        //     +---thisTop------------+
+        //     |     +--tp2Top---+    |
+        //     |     |           |    |
+        // thisLeft  |       tp2Right |
+        //     |  tp2Left        | thisRight
+        //     |     |           |    |
+        //     |     +-tp2Bottom-+    |
+        //     +---------thisBottom---+
+
+        float thisLeft = getXDirAdj();
+        float thisWidth = getWidthDirAdj();
+        float thisRight = thisLeft + thisWidth;
+
+        float tp2Left = tp2.getXDirAdj();
+        float tp2Width = tp2.getWidthDirAdj();
+        float tp2Right = tp2Left + tp2Width;
+
+        if (thisLeft > tp2Left || tp2Right > thisRight)
+        {
+            return false;
+        }
+
+        float thisTop = getYDirAdj();
+        float thisHeight = getHeightDir();
+        float thisBottom = thisTop + thisHeight;
+
+        float tp2Top = tp2.getYDirAdj();
+        float tp2Height = tp2.getHeightDir();
+        float tp2Bottom = tp2Top + tp2Height;
+
+        if (thisTop > tp2Top || tp2Bottom > thisBottom)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
     /**
      * Merge a single character TextPosition into the current object. This is to be used only for
      * cases where we have a diacritic that overlaps an existing TextPosition. In a graphical