Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PDFBOX-5487: Remove all space characters if contained within the adjacent letters #155

Open
wants to merge 1 commit into
base: trunk
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,10 @@ protected void writePage() throws IOException
{
IterativeMergeSort.sort(textList, comparator);
}
finally
{
removeContainedSpaces(textList);
}
}

startArticle();
Expand Down Expand Up @@ -709,6 +713,34 @@ private boolean overlap(float y1, float height1, float y2, float height2)
|| y1 <= y2 && y1 >= y2 - height2;
}

/**
 * Drops every single-space TextPosition that lies completely inside the most recently
 * retained TextPosition that precedes it in the list.
 *
 * The first element of the list is always kept. The list is modified in place through
 * its iterator, so it must support {@link Iterator#remove()}.
 *
 * @param textList the text positions to filter; an empty list is left untouched.
 */
private void removeContainedSpaces(List<TextPosition> textList)
{
    Iterator<TextPosition> it = textList.iterator();
    if (!it.hasNext())
    {
        // nothing to compare against
        return;
    }

    // last element that was kept; removed spaces never become the reference
    TextPosition lastKept = it.next();
    while (it.hasNext())
    {
        TextPosition candidate = it.next();
        boolean isSingleSpace = " ".equals(candidate.getUnicode());
        if (isSingleSpace && lastKept.completelyContains(candidate))
        {
            it.remove();
            continue;
        }
        lastKept = candidate;
    }
}

/**
* Write the line separator value to the output stream.
*
Expand Down
50 changes: 50 additions & 0 deletions pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,56 @@ else if (tp2Xstart < thisXstart && tp2Xend < thisXend)
return true;
}

/**
 * Tests whether the bounding box of another TextPosition lies entirely within the bounding
 * box of this one, i.e. the other TextPosition would be fully covered by this one if both
 * were rendered on top of each other. Boxes that share an edge still count as contained.
 *
 * @param tp2 The other TextPosition to compare against
 * @return True if the bounding box of tp2 fits completely inside the bounding box of this
 * text, false if it sticks out on any side.
 */
public boolean completelyContains(TextPosition tp2)
{
    // Assumed convention: the origin (0, 0) is the upper left corner and the direction
    // adjusted y-coordinate marks the top edge of a TextPosition, so Y grows downwards.
    //
    //   outerLeft                     outerRight
    //       +------- outerTop ------------+
    //       |    +--- innerTop ---+       |
    //       |    |                |       |
    //       | innerLeft       innerRight  |
    //       |    |                |       |
    //       |    +-- innerBottom -+       |
    //       +------- outerBottom ---------+

    // horizontal extent first, so that we can bail out early
    float outerLeft = getXDirAdj();
    float outerRight = outerLeft + getWidthDirAdj();

    float innerLeft = tp2.getXDirAdj();
    float innerRight = innerLeft + tp2.getWidthDirAdj();

    boolean sticksOutHorizontally = innerLeft < outerLeft || innerRight > outerRight;
    if (sticksOutHorizontally)
    {
        return false;
    }

    // vertical extent
    float outerTop = getYDirAdj();
    float outerBottom = outerTop + getHeightDir();

    float innerTop = tp2.getYDirAdj();
    float innerBottom = innerTop + tp2.getHeightDir();

    return !(innerTop < outerTop || innerBottom > outerBottom);
}

/**
* Merge a single character TextPosition into the current object. This is to be used only for
* cases where we have a diacritic that overlaps an existing TextPosition. In a graphical
Expand Down