Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the isFarAway issue #12338

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -243,11 +243,13 @@ protected void writeString(String text, List<TextPosition> textPositions) {
}

/**
 * Decides whether {@code current} is positioned far away from {@code previous}.
 * The horizontal gap is measured from the end of {@code previous} (its X plus its width)
 * to the X of {@code current}; the vertical gap is measured between the two Y values.
 *
 * NOTE(review): this span is a rendered diff hunk with removed and added lines shown together,
 * so both thresholds, Ygap and the return statement each appear twice. The later occurrence of
 * each pair looks like the added version -- confirm against the merged file.
 *
 * @param previous the earlier text position; its font size is used to derive the thresholds
 * @param current  the text position being compared against {@code previous}
 * @return true only when both the X gap and the Y gap exceed their thresholds
 */
private boolean isFarAway(TextPosition previous, TextPosition current) {
float XspaceThreshold = 3.0F;
float YspaceThreshold = previous.getFontSizeInPt() * 1.5F;
float XspaceThreshold = previous.getFontSizeInPt() * 3.0F;
float YspaceThreshold = previous.getFontSizeInPt() * 3.0F;
// Horizontal distance from the right edge of the previous position to the start of the current one.
float Xgap = current.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj());
float Ygap = current.getYDirAdj() - (previous.getYDirAdj() - previous.getHeightDir());
return Xgap > XspaceThreshold && Ygap > YspaceThreshold;
float Ygap = current.getYDirAdj() - previous.getYDirAdj();
// For cases like paper titles spanning two or more lines, both X and Y gaps must exceed thresholds,
// so "&&" is used instead of "||".
// Math.abs makes the comparison independent of the sign of each gap.
return Math.abs(Xgap) > XspaceThreshold && Math.abs(Ygap) > YspaceThreshold;
}

private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition textPosition) {
Expand All @@ -258,28 +260,27 @@ private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition t
return true;
}
// The title is usually not in the bottom 10% of a page.
if ((textPosition.getPageHeight() - textPosition.getYDirAdj())
< (textPosition.getPageHeight() * 0.1)) {
return true;
}
// The title characters usually stay together.
return isFarAway(previousTextPosition, textPosition);
return (textPosition.getPageHeight() - textPosition.getYDirAdj())
< (textPosition.getPageHeight() * 0.1);
}

private Optional<String> findLargestFontText(List<TextPosition> textPositions) {
Map<Float, StringBuilder> fontSizeTextMap = new TreeMap<>(Collections.reverseOrder());
Map<Float, TextPosition> lastPositionMap = new TreeMap<>(Collections.reverseOrder());
TextPosition previousTextPosition = null;
for (TextPosition textPosition : textPositions) {
float fontSize = textPosition.getFontSizeInPt();
// Exclude unwanted text based on heuristics
if (isUnwantedText(previousTextPosition, textPosition)) {
if (isUnwantedText(previousTextPosition, textPosition) ||
(lastPositionMap.containsKey(fontSize) && isFarAway(lastPositionMap.get(fontSize), textPosition))) {
continue;
}
float fontSize = textPosition.getFontSizeInPt();
fontSizeTextMap.putIfAbsent(fontSize, new StringBuilder());
if (previousTextPosition != null && isThereSpace(previousTextPosition, textPosition)) {
fontSizeTextMap.get(fontSize).append(" ");
}
fontSizeTextMap.get(fontSize).append(textPosition.getUnicode());
lastPositionMap.put(fontSize, textPosition);
previousTextPosition = textPosition;
}
for (Map.Entry<Float, StringBuilder> entry : fontSizeTextMap.entrySet()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ private static Stream<Arguments> providePdfData() {
Arguments.of("On the impact of service-oriented patterns on software evolvability: a controlled experiment and metric-based analysis", "/pdfs/PdfContentImporter/Bogner2019.pdf"),
Arguments.of("Pandemic programming", "/pdfs/PdfContentImporter/Ralph2020.pdf"),
Arguments.of("Do RESTful API design rules have an impact on the understandability of Web APIs?", "/pdfs/PdfContentImporter/Bogner2023.pdf"),
Arguments.of("Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study", "/pdfs/PdfContentImporter/Fritzsch2022.pdf")
Arguments.of("Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study", "/pdfs/PdfContentImporter/Fritzsch2022.pdf"),
Arguments.of("OPIUM: Optimal Package Install/Uninstall Manager", "/pdfs/PdfContentImporter/opium.pdf")
);
}
}
Binary file not shown.
Loading