Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement line color filter #421

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion src/main/java/technology/tabula/CommandLineApp.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,15 @@ public class CommandLineApp {
private OutputFormat outputFormat;
private String password;
private TableExtractor tableExtractor;
private Integer lineColorFilter;

public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException {
this.defaultOutput = defaultOutput;
this.pageAreas = CommandLineApp.whichAreas(line);
this.pages = CommandLineApp.whichPages(line);
this.outputFormat = CommandLineApp.whichOutputFormat(line);
this.tableExtractor = CommandLineApp.createExtractor(line);
this.lineColorFilter = CommandLineApp.whichLineColorFilter(line);

if (line.hasOption('s')) {
this.password = line.getOptionValue('s');
Expand Down Expand Up @@ -195,7 +197,7 @@ private void extractFile(File pdfFile, Appendable outFile) throws ParseException
}

private PageIterator getPageIterator(PDDocument pdfDocument) throws IOException {
ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
ObjectExtractor extractor = new ObjectExtractor(pdfDocument, lineColorFilter);
return (pages == null) ?
extractor.extract() :
extractor.extract(pages);
Expand Down Expand Up @@ -260,6 +262,23 @@ private static ExtractionMethod whichExtractionMethod(CommandLine line) {
return ExtractionMethod.DECIDE;
}

/**
 * Reads the optional {@code --line-color-filter} argument as a 24-bit color value.
 *
 * @param line the parsed command line
 * @return the parsed value, in the range {@code 0x000000} to {@code 0xFFFFFF},
 *         or {@code null} when the option was not given
 * @throws ParseException if the value is not a hexadecimal number or lies outside that range
 */
private static Integer whichLineColorFilter(CommandLine line) throws ParseException {
    if (!line.hasOption("line-color-filter")) {
        return null;
    }

    int color;
    try {
        color = Integer.parseInt(line.getOptionValue("line-color-filter"), 16);
    } catch (NumberFormatException e) {
        throw new ParseException("line-color-filter parameter must be a hexadecimal number");
    }
    // Integer.parseInt accepts a leading '-', so negative values also end up here;
    // the message therefore names both bounds instead of only the upper one.
    if (color < 0 || color > 0xFFFFFF) {
        throw new ParseException("line-color-filter parameter must be between 000000 and FFFFFF");
    }
    return color;
}

private static TableExtractor createExtractor(CommandLine line) throws ParseException {
TableExtractor extractor = new TableExtractor();
extractor.setGuess(line.hasOption('g'));
Expand Down Expand Up @@ -358,6 +377,12 @@ public static Options buildOptions() {
.hasArg()
.argName("PAGES")
.build());
o.addOption(Option.builder(null)
.longOpt("line-color-filter")
.desc("Only consider lines of this color to be lattice lines. Example: --line-color-filter DEADBE .")
.hasArg()
.argName("COLOR")
.build());

return o;
}
Expand Down
8 changes: 7 additions & 1 deletion src/main/java/technology/tabula/ObjectExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,15 @@
public class ObjectExtractor implements java.io.Closeable {

private final PDDocument pdfDocument;
private final Integer lineColorFilter;

/**
 * Creates an extractor without a line color filter by delegating to the
 * two-argument constructor with a {@code null} filter.
 *
 * @param pdfDocument the document to extract from
 */
public ObjectExtractor(PDDocument pdfDocument) {
this(pdfDocument, null);
}

/**
 * Creates an extractor that stores the given document and line color filter.
 *
 * @param pdfDocument the document to extract from
 * @param lineColorFilter the line color filter value kept by this extractor; may be
 *                        {@code null} (the single-argument constructor passes {@code null})
 */
public ObjectExtractor(PDDocument pdfDocument, Integer lineColorFilter) {
this.pdfDocument = pdfDocument;
this.lineColorFilter = lineColorFilter;
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
Expand All @@ -20,7 +26,7 @@ protected Page extractPage(Integer pageNumber) throws IOException {
}
PDPage page = pdfDocument.getPage(pageNumber - 1);

ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page);
ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page, lineColorFilter);
streamEngine.processPage(page);

TextStripper textStripper = new TextStripper(pdfDocument, pageNumber);
Expand Down
40 changes: 33 additions & 7 deletions src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.awt.geom.PathIterator;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
Expand All @@ -15,14 +16,17 @@
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static java.awt.geom.PathIterator.*;

class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine {

private Integer lineColorFilter;
protected List<Ruling> rulings;
private AffineTransform pageTransform;
private boolean extractRulingLines = true;
Expand All @@ -32,8 +36,9 @@ class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine {

private static final float RULING_MINIMUM_LENGTH = 0.01f;

protected ObjectExtractorStreamEngine(PDPage page) {
protected ObjectExtractorStreamEngine(PDPage page, Integer lineColorFilter) {
super(page);
this.lineColorFilter = lineColorFilter;
logger = LoggerFactory.getLogger(ObjectExtractorStreamEngine.class);
rulings = new ArrayList<>();

Expand Down Expand Up @@ -130,16 +135,11 @@ public void strokePath() {
}

private void strokeOrFillPath(boolean isFill) {
if (!extractRulingLines) {
if (!extractRulingLines || filterPathByColor(isFill) || filterPathBySegmentType()) {
currentPath.reset();
return;
}

boolean didNotPassedTheFilter = filterPathBySegmentType();
if (didNotPassedTheFilter) return;

// TODO: how to implement color filter?

// Skip the first path operation and save it as the starting point.
PathIterator pathIterator = currentPath.getPathIterator(getPageTransform());

Expand Down Expand Up @@ -191,6 +191,32 @@ private void strokeOrFillPath(boolean isFill) {
currentPath.reset();
}

/**
 * Tells whether the current path must be discarded because its color does not
 * match the configured line color filter.
 *
 * <p>TODO: decide whether a path whose color cannot be converted to RGB should
 * pass the filter; for now such a path is kept (this method returns {@code false}).
 *
 * @param isFill {@code true} to compare the non-stroking color of the graphics state,
 *               {@code false} to compare its stroking color
 * @return {@code true} when a filter is set and the path color differs from it;
 *         {@code false} when no filter is set, when the colors are equal, or when
 *         the color conversion throws one of the handled exceptions
 */
private boolean filterPathByColor(boolean isFill) {
    if (lineColorFilter == null) {
        return false;
    }

    try {
        PDGraphicsState state = getGraphicsState();
        PDColor currentColor = isFill ? state.getNonStrokingColor() : state.getStrokingColor();
        // Unbox explicitly so the comparison is visibly numeric, not by reference.
        return currentColor.toRGB() != lineColorFilter.intValue();
    } catch (IOException e) {
        // Report through the class logger rather than writing to stderr directly.
        logger.warn("Color conversion failed:", e);
        return false;
    } catch (IllegalStateException e) {
        // NOTE(review): confirm which unchecked exception toRGB() raises for pattern colors.
        logger.warn("Cannot convert pattern color:", e);
        return false;
    }
}

private boolean filterPathBySegmentType() {
PathIterator pathIterator = currentPath.getPathIterator(pageTransform);
float[] coordinates = new float[6];
Expand Down
13 changes: 13 additions & 0 deletions src/test/java/technology/tabula/TestObjectExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,19 @@ public void testShouldDetectRulings() throws IOException {
}
}

/**
 * Extracts the first page of the sample document with a line color filter of
 * {@code 0x000000} and checks the number of rulings that are reported.
 */
@Test
public void testShouldFilterRulingsByColor() throws IOException {
    File sampleFile = new File("src/test/resources/technology/tabula/should_filter_rulings_by_color.pdf");
    PDDocument document = PDDocument.load(sampleFile);
    try (ObjectExtractor extractor = new ObjectExtractor(document, 0x000000)) {
        Page firstPage = extractor.extract().next();
        List<Ruling> detectedRulings = firstPage.getRulings();

        assertEquals(7, detectedRulings.size());
    }
}

@Test
public void testDontThrowNPEInShfill() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/labor.pdf"));
Expand Down
Binary file not shown.