From 2ae845f53bbfa3006d6744c2b605a0a117282a04 Mon Sep 17 00:00:00 2001 From: jevertz Date: Tue, 16 Jan 2018 14:56:18 +0100 Subject: [PATCH 01/11] Try to resolve conflict before overwrite from origin --- .../elasticsearch/crawler/fs/settings/Fs.java | 22 ++++++++++- .../crawler/fs/tika/TikaInstance.java | 37 ++++++++++++++++--- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index 07fa5a2eb..facfc016b 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -49,6 +49,7 @@ public class Fs { private boolean continueOnError = false; private boolean pdfOcr = true; private Ocr ocr = new Ocr(); + private String tikaConfigPath = ""; public static Builder builder() { return new Builder(); @@ -80,6 +81,7 @@ public static class Builder { private boolean continueOnError = false; private boolean pdfOcr = true; private Ocr ocr = new Ocr(); + private String tikaConfigPath = ""; public Builder setUrl(String url) { this.url = url; @@ -212,10 +214,15 @@ public Builder setOcr(Ocr ocr) { return this; } + public Builder setTikaConfigPath(String tikaConfigPath) { + this.tikaConfigPath = tikaConfigPath; + return this; + } + public Fs build() { return new Fs(url, updateRate, includes, excludes, jsonSupport, filenameAsId, addFilesize, removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata, - checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr); + checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr, tikaConfigPath); } } @@ -226,7 +233,7 @@ public Fs( ) { private Fs(String url, TimeValue updateRate, List includes, List excludes, boolean jsonSupport, boolean filenameAsId, boolean addFilesize, boolean removeDeleted, 
boolean addAsInnerObject, boolean storeSource, Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport, - boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr) { + boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr, String tikaConfigPath) { this.url = url; this.updateRate = updateRate; this.includes = includes; @@ -248,6 +255,7 @@ private Fs(String url, TimeValue updateRate, List includes, List this.continueOnError = continueOnError; this.pdfOcr = pdfOcr; this.ocr = ocr; + this.tikaConfigPath = tikaConfigPath; } public String getUrl() { @@ -418,6 +426,14 @@ public void setOcr(Ocr ocr) { this.ocr = ocr; } + public String getTikaConfigPath() { + return tikaConfigPath; + } + + public void setTikaConfigPath(String tikaConfigPath) { + this.tikaConfigPath = tikaConfigPath; + } + @Override public boolean equals(Object o) { if (this == o) return true; @@ -444,6 +460,7 @@ public boolean equals(Object o) { if (includes != null ? !includes.equals(fs.includes) : fs.includes != null) return false; if (excludes != null ? !excludes.equals(fs.excludes) : fs.excludes != null) return false; if (indexedChars != null ? !indexedChars.equals(fs.indexedChars) : fs.indexedChars != null) return false; + if (tikaConfigPath != null ? !tikaConfigPath.equals(fs.tikaConfigPath) : fs.tikaConfigPath != null) return false; return checksum != null ? checksum.equals(fs.checksum) : fs.checksum == null; } @@ -470,6 +487,7 @@ public int hashCode() { result = 31 * result + (langDetect ? 1 : 0); result = 31 * result + (continueOnError ? 1 : 0); result = 31 * result + (pdfOcr ? 1 : 0); + result = 31 * result + (tikaConfigPath != null ? 
tikaConfigPath.hashCode() : 0); return result; } } diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java index 524686b69..d96312c4e 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java @@ -25,6 +25,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.tika.config.ServiceLoader; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.metadata.Metadata; @@ -75,9 +76,26 @@ private static void initTika(Fs fs) { private static void initParser(Fs fs) { if (parser == null) { - PDFParser pdfParser = new PDFParser(); - DefaultParser defaultParser; - +// PDFParser pdfParser = new PDFParser(); +// DefaultParser defaultParser; + TikaConfig tika_config = null; + if (!fs.getTikaConfigPath().equals("")) { + try { + tika_config = new TikaConfig(fs.getTikaConfigPath()); + + } catch (IOException e) { + logger.error("Caught IOException:" + e.getMessage()); + } catch (TikaException te) { + logger.error("Caught TikaException:" + te.getMessage()); + } catch (SAXException se) { + logger.error("Caught SAXException:" + se.getMessage()); + } + } + parser = new AutoDetectParser(tika_config); +/* MediaTypeRegistry mymedreg = tika_config.getMediaTypeRegistry(); + Parser myparser = tika_config.getParser(); + Detector mydetector = tika_config.getDetector(); + Parser PARSERS[] = new Parser[2]; if (fs.isPdfOcr()) { logger.debug("OCR is activated for PDF documents"); if (ExternalParser.check("tesseract")) { @@ -85,20 +103,27 @@ private static void initParser(Fs fs) { } else { logger.debug("But Tesseract is not installed so we won't run OCR."); } - defaultParser = new DefaultParser(); + if 
(tika_config.equals(null)) { + PARSERS[0] = new DefaultParser(); + } else { + PARSERS[0] = tika_config.getParser(); + } } else { logger.debug("OCR is disabled. Even though it's detected, it must be disabled explicitly"); defaultParser = new DefaultParser( MediaTypeRegistry.getDefaultRegistry(), new ServiceLoader(), Collections.singletonList(TesseractOCRParser.class)); + PARSERS[0] = defaultParser; } - Parser PARSERS[] = new Parser[2]; - PARSERS[0] = defaultParser; + + PARSERS[1] = pdfParser; + PARSERS[2] = fontParser; parser = new AutoDetectParser(PARSERS); + */ } } From 28482c843f16381685ce2f536aa74e8b9efd8e3d Mon Sep 17 00:00:00 2001 From: jevertz Date: Tue, 16 Jan 2018 14:45:44 +0100 Subject: [PATCH 02/11] Example Tika configuration file. --- tika-config.xml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 tika-config.xml diff --git a/tika-config.xml b/tika-config.xml new file mode 100644 index 000000000..b4656e03d --- /dev/null +++ b/tika-config.xml @@ -0,0 +1,28 @@ + + + + + + image/jpeg + application/pdf + application/x-font-ttf + application/x-font-ttc + application/x-font-otf + + + + + application/x-font-ttf + + application/x-font-otf + false + + + application/pdf + no_ocr + + + + + \ No newline at end of file From f7e2b1a826dd7d04fad41f55da9759826587b49c Mon Sep 17 00:00:00 2001 From: jevertz Date: Mon, 22 Jan 2018 11:01:10 +0100 Subject: [PATCH 03/11] allow custom Tika parser --- .../fs/meta/settings/CustomTikaParser.java | 77 +++++++++++++++ .../elasticsearch/crawler/fs/settings/Fs.java | 39 +++++--- .../fs/settings/FsSettingsParserTest.java | 2 + .../crawler/fs/tika/TikaInstance.java | 97 +++++++++++++------ 4 files changed, 171 insertions(+), 44 deletions(-) create mode 100644 settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java 
b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java new file mode 100644 index 000000000..5703e962e --- /dev/null +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java @@ -0,0 +1,77 @@ +package fr.pilato.elasticsearch.crawler.fs.meta.settings; + +import java.util.ArrayList; +import java.util.List; + +public class CustomTikaParser { + + private String className = ""; + + private String pathToJar = ""; + + private List mimeTypes = new ArrayList(); + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + + private String className = ""; + + public CustomTikaParser.Builder setClassName(String className) { + this.className = className; + return this; + } + + private String pathToJar = ""; + + public CustomTikaParser.Builder setPathToJar(String pathToJar) { + this.pathToJar = pathToJar; + return this; + } + + private List mimeTypes = new ArrayList(); + + public CustomTikaParser.Builder setMimeTypes(ArrayList mimeTypes) { + this.mimeTypes = mimeTypes; + return this; + } + } + + public CustomTikaParser() { + + } + + private CustomTikaParser(String className, String pathToJar, ArrayList mimeTypes) { + + this.className = className; + this.pathToJar = pathToJar; + this.mimeTypes = mimeTypes; + } + + public String getClassName() { + return className; + } + + public void setClassName(String className) { + this.className = className; + } + + public String getPathToJar() { + return pathToJar; + } + + public void setPathToJar(String pathToJar) { + this.pathToJar = pathToJar; + } + + public List getMimeTypes() { + return mimeTypes; + } + + public void setMimeTypes(ArrayList mimeTypes) { + this.mimeTypes = mimeTypes; + } + +} \ No newline at end of file diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index facfc016b..de7f9cc1e 100644 
--- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -49,7 +49,7 @@ public class Fs { private boolean continueOnError = false; private boolean pdfOcr = true; private Ocr ocr = new Ocr(); - private String tikaConfigPath = ""; + private List customTikaParsers = new ArrayList<>(); public static Builder builder() { return new Builder(); @@ -81,7 +81,7 @@ public static class Builder { private boolean continueOnError = false; private boolean pdfOcr = true; private Ocr ocr = new Ocr(); - private String tikaConfigPath = ""; + private List customTikaParsers = new ArrayList<>(); public Builder setUrl(String url) { this.url = url; @@ -214,15 +214,28 @@ public Builder setOcr(Ocr ocr) { return this; } - public Builder setTikaConfigPath(String tikaConfigPath) { - this.tikaConfigPath = tikaConfigPath; + public Builder setTikaCustomParsers(List customTikaParsers) { + this.customTikaParsers = customTikaParsers; + return this; + } + + public Builder addTikaCustomParsers(CustomTikaParser customTikaParser) { + if (this.customTikaParsers == null) { + this.customTikaParsers = new ArrayList<>(); + } + + // We refuse to add duplicates + if (!this.customTikaParsers.contains(customTikaParser)) { + this.customTikaParsers.add(customTikaParser); + } + return this; } public Fs build() { return new Fs(url, updateRate, includes, excludes, jsonSupport, filenameAsId, addFilesize, removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata, - checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr, tikaConfigPath); + checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr, customTikaParsers); } } @@ -233,7 +246,7 @@ public Fs( ) { private Fs(String url, TimeValue updateRate, List includes, List excludes, boolean jsonSupport, boolean filenameAsId, boolean addFilesize, boolean removeDeleted, boolean 
addAsInnerObject, boolean storeSource, Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport, - boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr, String tikaConfigPath) { + boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr, List customTikaParsers) { this.url = url; this.updateRate = updateRate; this.includes = includes; @@ -255,7 +268,7 @@ private Fs(String url, TimeValue updateRate, List includes, List this.continueOnError = continueOnError; this.pdfOcr = pdfOcr; this.ocr = ocr; - this.tikaConfigPath = tikaConfigPath; + this.customTikaParsers = customTikaParsers; } public String getUrl() { @@ -426,12 +439,12 @@ public void setOcr(Ocr ocr) { this.ocr = ocr; } - public String getTikaConfigPath() { - return tikaConfigPath; + public List getCustomTikaParsers() { + return customTikaParsers; } - public void setTikaConfigPath(String tikaConfigPath) { - this.tikaConfigPath = tikaConfigPath; + public void setCustomTikaParsers(List customTikaParsers) { + this.customTikaParsers = customTikaParsers; } @Override @@ -460,7 +473,7 @@ public boolean equals(Object o) { if (includes != null ? !includes.equals(fs.includes) : fs.includes != null) return false; if (excludes != null ? !excludes.equals(fs.excludes) : fs.excludes != null) return false; if (indexedChars != null ? !indexedChars.equals(fs.indexedChars) : fs.indexedChars != null) return false; - if (tikaConfigPath != null ? !tikaConfigPath.equals(fs.tikaConfigPath) : fs.tikaConfigPath != null) return false; + if (customTikaParsers != null ? !customTikaParsers.equals(fs.customTikaParsers) : fs.customTikaParsers != null) return false; return checksum != null ? checksum.equals(fs.checksum) : fs.checksum == null; } @@ -487,7 +500,7 @@ public int hashCode() { result = 31 * result + (langDetect ? 1 : 0); result = 31 * result + (continueOnError ? 
1 : 0); result = 31 * result + (pdfOcr ? 1 : 0); - result = 31 * result + (tikaConfigPath != null ? tikaConfigPath.hashCode() : 0); + result = 31 * result + (customTikaParsers != null ? customTikaParsers.hashCode() : 0); return result; } } diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java index a90232d64..aa080f79b 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java @@ -25,6 +25,7 @@ import org.junit.Test; import java.io.IOException; +import java.util.ArrayList; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.contains; @@ -53,6 +54,7 @@ public class FsSettingsParserTest extends AbstractFSCrawlerTestCase { .setUpdateRate(TimeValue.timeValueMinutes(5)) .setIndexContent(true) .setOcr(OCR_FULL) + .setTikaCustomParsers(new ArrayList<>()) .build(); private static final Elasticsearch ELASTICSEARCH_EMPTY = Elasticsearch.builder().build(); private static final Elasticsearch ELASTICSEARCH_FULL = Elasticsearch.builder() diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java index d96312c4e..7240d6b5d 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java @@ -29,6 +29,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DefaultParser; @@ -42,9 +43,14 
@@ import org.apache.tika.sax.WriteOutContentHandler; import org.xml.sax.SAXException; +import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.util.Collections; +import java.net.URL; +import java.net.URLClassLoader; +import java.util.*; +import java.util.jar.JarEntry; +import java.util.jar.JarFile; import static org.apache.tika.langdetect.OptimaizeLangDetector.getDefaultLanguageDetector; @@ -76,26 +82,11 @@ private static void initTika(Fs fs) { private static void initParser(Fs fs) { if (parser == null) { -// PDFParser pdfParser = new PDFParser(); -// DefaultParser defaultParser; - TikaConfig tika_config = null; - if (!fs.getTikaConfigPath().equals("")) { - try { - tika_config = new TikaConfig(fs.getTikaConfigPath()); - - } catch (IOException e) { - logger.error("Caught IOException:" + e.getMessage()); - } catch (TikaException te) { - logger.error("Caught TikaException:" + te.getMessage()); - } catch (SAXException se) { - logger.error("Caught SAXException:" + se.getMessage()); - } - } - parser = new AutoDetectParser(tika_config); -/* MediaTypeRegistry mymedreg = tika_config.getMediaTypeRegistry(); - Parser myparser = tika_config.getParser(); - Detector mydetector = tika_config.getDetector(); - Parser PARSERS[] = new Parser[2]; + PDFParser pdfParser = new PDFParser(); + DefaultParser defaultParser; + + + if (fs.isPdfOcr()) { logger.debug("OCR is activated for PDF documents"); if (ExternalParser.check("tesseract")) { @@ -103,27 +94,71 @@ private static void initParser(Fs fs) { } else { logger.debug("But Tesseract is not installed so we won't run OCR."); } - if (tika_config.equals(null)) { - PARSERS[0] = new DefaultParser(); - } else { - PARSERS[0] = tika_config.getParser(); - } + defaultParser = new DefaultParser(); } else { logger.debug("OCR is disabled. 
Even though it's detected, it must be disabled explicitly"); defaultParser = new DefaultParser( MediaTypeRegistry.getDefaultRegistry(), new ServiceLoader(), Collections.singletonList(TesseractOCRParser.class)); - PARSERS[0] = defaultParser; } + // load custom parsers if defined in config + if (fs.getCustomTikaParsers().size() > 0) { + Integer counter = 0; + Parser PARSERS[] = new Parser[fs.getCustomTikaParsers().size()+2]; + + + + // to collect all Mediatypes handled by custom parser to exclude them form the DefaultParser + List excludeMediaTypes = new ArrayList(); - PARSERS[1] = pdfParser; - PARSERS[2] = fontParser; - parser = new AutoDetectParser(PARSERS); - */ + for (CustomTikaParser customTikaParser : fs.getCustomTikaParsers()) { + counter += 1; + try { + URL[] jarUrl = { new URL("jar:file:" + customTikaParser.getPathToJar()+"!/") }; + URLClassLoader urlClassLoader = URLClassLoader.newInstance(jarUrl); + Class customParserClass = urlClassLoader.loadClass(customTikaParser.getClassName()); + Parser customParser = (Parser) customParserClass.newInstance(); + String[] customMimeStrings = new String[customTikaParser.getMimeTypes().size()]; + Set customMediaTypes = MediaType.set(customTikaParser.getMimeTypes().toArray(customMimeStrings)); + excludeMediaTypes.addAll(customMediaTypes); + Parser customParserDecorated = ParserDecorator.withTypes(customParser, customMediaTypes); + PARSERS[counter] = customParserDecorated; + + + + } catch (IOException e) { + logger.error("Caught IOException:" + e.getMessage()); + } catch (ClassNotFoundException e) { + logger.error("Caught ClassNotFoundException:" + e.getMessage()); + } catch (InstantiationException e) { + logger.error("Caught InstantiationException:" + e.getMessage()); + } catch (IllegalAccessException e) { + logger.error("Caught IllegalAccessException:" + e.getMessage()); + }/*catch (TikaException te) { + logger.error("Caught TikaException:" + te.getMessage()); + } catch (SAXException se) { + logger.error("Caught 
SAXException:" + se.getMessage()); + }*/ + } + if (excludeMediaTypes.size() > 0) { + MediaType[] excludedMediaTypeSet = new MediaType[excludeMediaTypes.size()]; + PARSERS[0] = ParserDecorator.withoutTypes(defaultParser, new HashSet(excludeMediaTypes)); + } else { + PARSERS[0] = defaultParser; + } + PARSERS[counter+1] = pdfParser; + parser = new AutoDetectParser(PARSERS); + } else { + Parser PARSERS[] = new Parser[2]; + PARSERS[0] = defaultParser; + PARSERS[1] = pdfParser; + parser = new AutoDetectParser(PARSERS); + } + } } From 1a0f5f6e4b11ff628aa54b086d00218aa6b0dee1 Mon Sep 17 00:00:00 2001 From: jevertz Date: Wed, 21 Feb 2018 15:12:14 +0100 Subject: [PATCH 04/11] After rebase. Added support for a custom external Tika Parser. --- .../{meta => }/settings/CustomTikaParser.java | 8 +++--- tika-config.xml | 28 ------------------- .../crawler/fs/tika/TikaInstance.java | 2 ++ 3 files changed, 6 insertions(+), 32 deletions(-) rename settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/{meta => }/settings/CustomTikaParser.java (77%) delete mode 100644 tika-config.xml diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java similarity index 77% rename from settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java rename to settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java index 5703e962e..f0ea665de 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java @@ -1,4 +1,4 @@ -package fr.pilato.elasticsearch.crawler.fs.meta.settings; +package fr.pilato.elasticsearch.crawler.fs.settings; import java.util.ArrayList; import java.util.List; @@ -19,21 +19,21 @@ public static class Builder { private String className = ""; - 
public CustomTikaParser.Builder setClassName(String className) { + public fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser.Builder setClassName(String className) { this.className = className; return this; } private String pathToJar = ""; - public CustomTikaParser.Builder setPathToJar(String pathToJar) { + public fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser.Builder setPathToJar(String pathToJar) { this.pathToJar = pathToJar; return this; } private List mimeTypes = new ArrayList(); - public CustomTikaParser.Builder setMimeTypes(ArrayList mimeTypes) { + public fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser.Builder setMimeTypes(ArrayList mimeTypes) { this.mimeTypes = mimeTypes; return this; } diff --git a/tika-config.xml b/tika-config.xml deleted file mode 100644 index b4656e03d..000000000 --- a/tika-config.xml +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - image/jpeg - application/pdf - application/x-font-ttf - application/x-font-ttc - application/x-font-otf - - - - - application/x-font-ttf - - application/x-font-otf - false - - - application/pdf - no_ocr - - - - - \ No newline at end of file diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java index 7240d6b5d..8bf50a6fb 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java @@ -22,6 +22,7 @@ import fr.pilato.elasticsearch.crawler.fs.settings.Fs; import fr.pilato.elasticsearch.crawler.fs.settings.FsSettings; +import fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.tika.config.ServiceLoader; @@ -35,6 +36,7 @@ import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; 
+import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.ocr.TesseractOCRParser; From 224f22e5611acb7308ac2cbe25d6a22602894773 Mon Sep 17 00:00:00 2001 From: jevertz Date: Wed, 21 Feb 2018 16:47:54 +0100 Subject: [PATCH 05/11] Extended README to cover custom tika parsers setting --- README.md | 178 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 177 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 25154c751..e970b9b6d 100644 --- a/README.md +++ b/README.md @@ -620,7 +620,8 @@ Here is a list of Local FS settings (under `fs.` prefix)`: | `fs.continue_on_error` | `false` | [Continue on File Permission Error](#continue-on-error) (from 2.3) | | `fs.pdf_ocr` | `true` | [Run OCR on PDF documents](#ocr-integration) (from 2.3) | | `fs.indexed_chars` | `100000.0` | [Extracted characters](#extracted-characters) | -| `fs.checksum` | `null` | [File Checksum](#file-checksum) | +| `fs.checksum` | `null` | [File Checksum](#file-checksum) +| `fs.custom_tika_parsers` | `null` | [Custom Tika Parsers](#custom-tika-parsers) | #### Root directory @@ -1183,6 +1184,181 @@ to compute the checksum, such as `MD5` or `SHA-1`. } ``` +#### Custom Tika Parsers + +It might occur that one or more existing Tika parsers do not provide the intended information, or just do not exist. +This setting allows to use a custom parser instead. +The parsers must be provided as a .jar, but does not need to be on any classpath. +Note that this is an array. 
Here is an example with just one entry: + +```json +{ + "name": "test", + "fs": { + "custom_tika_parsers": [ + { + "class_name": "org.me.MyParser", + "path_to_jar": "/some/full/path/to/myParser-0.0.1-SNAPSHOT.jar", + "mime_types": ["application/dns", "or-another-mimetype-from-tika"] + } + ] + } +} +``` + +Some info about creating a custom parser is available [here](https://tika.apache.org/1.17/parser_guide.html) +Or use an existing parser as a blueprint. Make sure to choose the correct branch. +At the time of this writing fscrawler uses Tika 1.17, while on GitHub the main branch is 2.x. +The parsers from ["branch_1x"](https://github.com/apache/tika/tree/branch_1x/tika-parsers/src/main/java/org/apache/tika/parser) should work fine. + +To build the custom parser separately, a pom file can be derived from the tika-parsers [pom.xml](https://github.com/apache/tika/blob/branch_1x/tika-parsers/pom.xml). +Probably a lot can be left out. Here is an example which required fontbox (probably still too long, but it worked). + +
Example pom.xml +

+ + +``` + + 4.0.0 + + org.me + myParser + 0.0.1-SNAPSHOT + + + 1.7 + 1.7 + UTF-8 + 1.17 + 2.0.8 + + + + src + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.0.0-M1 + + all,-missing,-accessibility + true + + + + + + + + org.apache.tika + tika-parsers + ${tika.version} + + + + edu.ucar + netcdf + + + + edu.ucar + cdm + + + + edu.ucar + httpservices + + + + edu.ucar + grib + + + + edu.ucar + netcdf4 + + + + com.uwyn + jhighlight + + + + org.ow2.asm + asm-debug-all + + + commons-logging + commons-logging-api + + + + org.apache.cxf + cxf-rt-rs-client + + + + + org.apache.pdfbox + fontbox + ${fontbox.version} + + + + edu.ucar + netcdf + + + + edu.ucar + cdm + + + + edu.ucar + httpservices + + + + edu.ucar + grib + + + + edu.ucar + netcdf4 + + + + com.uwyn + jhighlight + + + + org.ow2.asm + asm-debug-all + + + commons-logging + commons-logging-api + + + + org.apache.cxf + cxf-rt-rs-client + + + + + +``` + +

+
### SSH settings From 5c4b88db16ac815c9e51a3ef0214079f5232fdca Mon Sep 17 00:00:00 2001 From: jevertz Date: Tue, 16 Jan 2018 14:56:18 +0100 Subject: [PATCH 06/11] Try to resolve conflict before overwrite from origin --- .../elasticsearch/crawler/fs/settings/Fs.java | 22 ++++++++++- .../crawler/fs/tika/TikaInstance.java | 37 ++++++++++++++++--- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index 07fa5a2eb..facfc016b 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -49,6 +49,7 @@ public class Fs { private boolean continueOnError = false; private boolean pdfOcr = true; private Ocr ocr = new Ocr(); + private String tikaConfigPath = ""; public static Builder builder() { return new Builder(); @@ -80,6 +81,7 @@ public static class Builder { private boolean continueOnError = false; private boolean pdfOcr = true; private Ocr ocr = new Ocr(); + private String tikaConfigPath = ""; public Builder setUrl(String url) { this.url = url; @@ -212,10 +214,15 @@ public Builder setOcr(Ocr ocr) { return this; } + public Builder setTikaConfigPath(String tikaConfigPath) { + this.tikaConfigPath = tikaConfigPath; + return this; + } + public Fs build() { return new Fs(url, updateRate, includes, excludes, jsonSupport, filenameAsId, addFilesize, removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata, - checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr); + checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr, tikaConfigPath); } } @@ -226,7 +233,7 @@ public Fs( ) { private Fs(String url, TimeValue updateRate, List includes, List excludes, boolean jsonSupport, boolean filenameAsId, boolean addFilesize, boolean 
removeDeleted, boolean addAsInnerObject, boolean storeSource, Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport, - boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr) { + boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr, String tikaConfigPath) { this.url = url; this.updateRate = updateRate; this.includes = includes; @@ -248,6 +255,7 @@ private Fs(String url, TimeValue updateRate, List includes, List this.continueOnError = continueOnError; this.pdfOcr = pdfOcr; this.ocr = ocr; + this.tikaConfigPath = tikaConfigPath; } public String getUrl() { @@ -418,6 +426,14 @@ public void setOcr(Ocr ocr) { this.ocr = ocr; } + public String getTikaConfigPath() { + return tikaConfigPath; + } + + public void setTikaConfigPath(String tikaConfigPath) { + this.tikaConfigPath = tikaConfigPath; + } + @Override public boolean equals(Object o) { if (this == o) return true; @@ -444,6 +460,7 @@ public boolean equals(Object o) { if (includes != null ? !includes.equals(fs.includes) : fs.includes != null) return false; if (excludes != null ? !excludes.equals(fs.excludes) : fs.excludes != null) return false; if (indexedChars != null ? !indexedChars.equals(fs.indexedChars) : fs.indexedChars != null) return false; + if (tikaConfigPath != null ? !tikaConfigPath.equals(fs.tikaConfigPath) : fs.tikaConfigPath != null) return false; return checksum != null ? checksum.equals(fs.checksum) : fs.checksum == null; } @@ -470,6 +487,7 @@ public int hashCode() { result = 31 * result + (langDetect ? 1 : 0); result = 31 * result + (continueOnError ? 1 : 0); result = 31 * result + (pdfOcr ? 1 : 0); + result = 31 * result + (tikaConfigPath != null ? 
tikaConfigPath.hashCode() : 0); return result; } } diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java index 524686b69..d96312c4e 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java @@ -25,6 +25,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.tika.config.ServiceLoader; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.metadata.Metadata; @@ -75,9 +76,26 @@ private static void initTika(Fs fs) { private static void initParser(Fs fs) { if (parser == null) { - PDFParser pdfParser = new PDFParser(); - DefaultParser defaultParser; - +// PDFParser pdfParser = new PDFParser(); +// DefaultParser defaultParser; + TikaConfig tika_config = null; + if (!fs.getTikaConfigPath().equals("")) { + try { + tika_config = new TikaConfig(fs.getTikaConfigPath()); + + } catch (IOException e) { + logger.error("Caught IOException:" + e.getMessage()); + } catch (TikaException te) { + logger.error("Caught TikaException:" + te.getMessage()); + } catch (SAXException se) { + logger.error("Caught SAXException:" + se.getMessage()); + } + } + parser = new AutoDetectParser(tika_config); +/* MediaTypeRegistry mymedreg = tika_config.getMediaTypeRegistry(); + Parser myparser = tika_config.getParser(); + Detector mydetector = tika_config.getDetector(); + Parser PARSERS[] = new Parser[2]; if (fs.isPdfOcr()) { logger.debug("OCR is activated for PDF documents"); if (ExternalParser.check("tesseract")) { @@ -85,20 +103,27 @@ private static void initParser(Fs fs) { } else { logger.debug("But Tesseract is not installed so we won't run OCR."); } - defaultParser = new DefaultParser(); + if 
(tika_config.equals(null)) { + PARSERS[0] = new DefaultParser(); + } else { + PARSERS[0] = tika_config.getParser(); + } } else { logger.debug("OCR is disabled. Even though it's detected, it must be disabled explicitly"); defaultParser = new DefaultParser( MediaTypeRegistry.getDefaultRegistry(), new ServiceLoader(), Collections.singletonList(TesseractOCRParser.class)); + PARSERS[0] = defaultParser; } - Parser PARSERS[] = new Parser[2]; - PARSERS[0] = defaultParser; + + PARSERS[1] = pdfParser; + PARSERS[2] = fontParser; parser = new AutoDetectParser(PARSERS); + */ } } From 43e42aaee62a4bbaaeec4b88806a17a8cc6d4af5 Mon Sep 17 00:00:00 2001 From: jevertz Date: Tue, 16 Jan 2018 14:45:44 +0100 Subject: [PATCH 07/11] Example Tika configuration file. --- tika-config.xml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 tika-config.xml diff --git a/tika-config.xml b/tika-config.xml new file mode 100644 index 000000000..b4656e03d --- /dev/null +++ b/tika-config.xml @@ -0,0 +1,28 @@ + + + + + + image/jpeg + application/pdf + application/x-font-ttf + application/x-font-ttc + application/x-font-otf + + + + + application/x-font-ttf + + application/x-font-otf + false + + + application/pdf + no_ocr + + + + + \ No newline at end of file From 4e9cd5b7baf89f4b7ab0eab3aeaa7cf1abd6a2e3 Mon Sep 17 00:00:00 2001 From: jevertz Date: Mon, 22 Jan 2018 11:01:10 +0100 Subject: [PATCH 08/11] allow custom Tika parser --- .../fs/meta/settings/CustomTikaParser.java | 77 +++++++++++++++ .../elasticsearch/crawler/fs/settings/Fs.java | 39 +++++--- .../fs/settings/FsSettingsParserTest.java | 2 + .../crawler/fs/tika/TikaInstance.java | 97 +++++++++++++------ 4 files changed, 171 insertions(+), 44 deletions(-) create mode 100644 settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java 
b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java new file mode 100644 index 000000000..5703e962e --- /dev/null +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java @@ -0,0 +1,77 @@ +package fr.pilato.elasticsearch.crawler.fs.meta.settings; + +import java.util.ArrayList; +import java.util.List; + +public class CustomTikaParser { + + private String className = ""; + + private String pathToJar = ""; + + private List mimeTypes = new ArrayList(); + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + + private String className = ""; + + public CustomTikaParser.Builder setClassName(String className) { + this.className = className; + return this; + } + + private String pathToJar = ""; + + public CustomTikaParser.Builder setPathToJar(String pathToJar) { + this.pathToJar = pathToJar; + return this; + } + + private List mimeTypes = new ArrayList(); + + public CustomTikaParser.Builder setMimeTypes(ArrayList mimeTypes) { + this.mimeTypes = mimeTypes; + return this; + } + } + + public CustomTikaParser() { + + } + + private CustomTikaParser(String className, String pathToJar, ArrayList mimeTypes) { + + this.className = className; + this.pathToJar = pathToJar; + this.mimeTypes = mimeTypes; + } + + public String getClassName() { + return className; + } + + public void setClassName(String className) { + this.className = className; + } + + public String getPathToJar() { + return pathToJar; + } + + public void setPathToJar(String pathToJar) { + this.pathToJar = pathToJar; + } + + public List getMimeTypes() { + return mimeTypes; + } + + public void setMimeTypes(ArrayList mimeTypes) { + this.mimeTypes = mimeTypes; + } + +} \ No newline at end of file diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index facfc016b..de7f9cc1e 100644 
--- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -49,7 +49,7 @@ public class Fs { private boolean continueOnError = false; private boolean pdfOcr = true; private Ocr ocr = new Ocr(); - private String tikaConfigPath = ""; + private List customTikaParsers = new ArrayList<>(); public static Builder builder() { return new Builder(); @@ -81,7 +81,7 @@ public static class Builder { private boolean continueOnError = false; private boolean pdfOcr = true; private Ocr ocr = new Ocr(); - private String tikaConfigPath = ""; + private List customTikaParsers = new ArrayList<>(); public Builder setUrl(String url) { this.url = url; @@ -214,15 +214,28 @@ public Builder setOcr(Ocr ocr) { return this; } - public Builder setTikaConfigPath(String tikaConfigPath) { - this.tikaConfigPath = tikaConfigPath; + public Builder setTikaCustomParsers(List customTikaParsers) { + this.customTikaParsers = customTikaParsers; + return this; + } + + public Builder addTikaCustomParsers(CustomTikaParser customTikaParser) { + if (this.customTikaParsers == null) { + this.customTikaParsers = new ArrayList<>(); + } + + // We refuse to add duplicates + if (!this.customTikaParsers.contains(customTikaParser)) { + this.customTikaParsers.add(customTikaParser); + } + return this; } public Fs build() { return new Fs(url, updateRate, includes, excludes, jsonSupport, filenameAsId, addFilesize, removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata, - checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr, tikaConfigPath); + checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr, customTikaParsers); } } @@ -233,7 +246,7 @@ public Fs( ) { private Fs(String url, TimeValue updateRate, List includes, List excludes, boolean jsonSupport, boolean filenameAsId, boolean addFilesize, boolean removeDeleted, boolean 
addAsInnerObject, boolean storeSource, Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport, - boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr, String tikaConfigPath) { + boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr, List customTikaParsers) { this.url = url; this.updateRate = updateRate; this.includes = includes; @@ -255,7 +268,7 @@ private Fs(String url, TimeValue updateRate, List includes, List this.continueOnError = continueOnError; this.pdfOcr = pdfOcr; this.ocr = ocr; - this.tikaConfigPath = tikaConfigPath; + this.customTikaParsers = customTikaParsers; } public String getUrl() { @@ -426,12 +439,12 @@ public void setOcr(Ocr ocr) { this.ocr = ocr; } - public String getTikaConfigPath() { - return tikaConfigPath; + public List getCustomTikaParsers() { + return customTikaParsers; } - public void setTikaConfigPath(String tikaConfigPath) { - this.tikaConfigPath = tikaConfigPath; + public void setCustomTikaParsers(List customTikaParsers) { + this.customTikaParsers = customTikaParsers; } @Override @@ -460,7 +473,7 @@ public boolean equals(Object o) { if (includes != null ? !includes.equals(fs.includes) : fs.includes != null) return false; if (excludes != null ? !excludes.equals(fs.excludes) : fs.excludes != null) return false; if (indexedChars != null ? !indexedChars.equals(fs.indexedChars) : fs.indexedChars != null) return false; - if (tikaConfigPath != null ? !tikaConfigPath.equals(fs.tikaConfigPath) : fs.tikaConfigPath != null) return false; + if (customTikaParsers != null ? !customTikaParsers.equals(fs.customTikaParsers) : fs.customTikaParsers != null) return false; return checksum != null ? checksum.equals(fs.checksum) : fs.checksum == null; } @@ -487,7 +500,7 @@ public int hashCode() { result = 31 * result + (langDetect ? 1 : 0); result = 31 * result + (continueOnError ? 
1 : 0); result = 31 * result + (pdfOcr ? 1 : 0); - result = 31 * result + (tikaConfigPath != null ? tikaConfigPath.hashCode() : 0); + result = 31 * result + (customTikaParsers != null ? customTikaParsers.hashCode() : 0); return result; } } diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java index a90232d64..aa080f79b 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java @@ -25,6 +25,7 @@ import org.junit.Test; import java.io.IOException; +import java.util.ArrayList; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.contains; @@ -53,6 +54,7 @@ public class FsSettingsParserTest extends AbstractFSCrawlerTestCase { .setUpdateRate(TimeValue.timeValueMinutes(5)) .setIndexContent(true) .setOcr(OCR_FULL) + .setTikaCustomParsers(new ArrayList<>()) .build(); private static final Elasticsearch ELASTICSEARCH_EMPTY = Elasticsearch.builder().build(); private static final Elasticsearch ELASTICSEARCH_FULL = Elasticsearch.builder() diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java index d96312c4e..7240d6b5d 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java @@ -29,6 +29,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DefaultParser; @@ -42,9 +43,14 
@@ import org.apache.tika.sax.WriteOutContentHandler; import org.xml.sax.SAXException; +import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.util.Collections; +import java.net.URL; +import java.net.URLClassLoader; +import java.util.*; +import java.util.jar.JarEntry; +import java.util.jar.JarFile; import static org.apache.tika.langdetect.OptimaizeLangDetector.getDefaultLanguageDetector; @@ -76,26 +82,11 @@ private static void initTika(Fs fs) { private static void initParser(Fs fs) { if (parser == null) { -// PDFParser pdfParser = new PDFParser(); -// DefaultParser defaultParser; - TikaConfig tika_config = null; - if (!fs.getTikaConfigPath().equals("")) { - try { - tika_config = new TikaConfig(fs.getTikaConfigPath()); - - } catch (IOException e) { - logger.error("Caught IOException:" + e.getMessage()); - } catch (TikaException te) { - logger.error("Caught TikaException:" + te.getMessage()); - } catch (SAXException se) { - logger.error("Caught SAXException:" + se.getMessage()); - } - } - parser = new AutoDetectParser(tika_config); -/* MediaTypeRegistry mymedreg = tika_config.getMediaTypeRegistry(); - Parser myparser = tika_config.getParser(); - Detector mydetector = tika_config.getDetector(); - Parser PARSERS[] = new Parser[2]; + PDFParser pdfParser = new PDFParser(); + DefaultParser defaultParser; + + + if (fs.isPdfOcr()) { logger.debug("OCR is activated for PDF documents"); if (ExternalParser.check("tesseract")) { @@ -103,27 +94,71 @@ private static void initParser(Fs fs) { } else { logger.debug("But Tesseract is not installed so we won't run OCR."); } - if (tika_config.equals(null)) { - PARSERS[0] = new DefaultParser(); - } else { - PARSERS[0] = tika_config.getParser(); - } + defaultParser = new DefaultParser(); } else { logger.debug("OCR is disabled. 
Even though it's detected, it must be disabled explicitly"); defaultParser = new DefaultParser( MediaTypeRegistry.getDefaultRegistry(), new ServiceLoader(), Collections.singletonList(TesseractOCRParser.class)); - PARSERS[0] = defaultParser; } + // load custom parsers if defined in config + if (fs.getCustomTikaParsers().size() > 0) { + Integer counter = 0; + Parser PARSERS[] = new Parser[fs.getCustomTikaParsers().size()+2]; + + + + // to collect all Mediatypes handled by custom parser to exclude them form the DefaultParser + List excludeMediaTypes = new ArrayList(); - PARSERS[1] = pdfParser; - PARSERS[2] = fontParser; - parser = new AutoDetectParser(PARSERS); - */ + for (CustomTikaParser customTikaParser : fs.getCustomTikaParsers()) { + counter += 1; + try { + URL[] jarUrl = { new URL("jar:file:" + customTikaParser.getPathToJar()+"!/") }; + URLClassLoader urlClassLoader = URLClassLoader.newInstance(jarUrl); + Class customParserClass = urlClassLoader.loadClass(customTikaParser.getClassName()); + Parser customParser = (Parser) customParserClass.newInstance(); + String[] customMimeStrings = new String[customTikaParser.getMimeTypes().size()]; + Set customMediaTypes = MediaType.set(customTikaParser.getMimeTypes().toArray(customMimeStrings)); + excludeMediaTypes.addAll(customMediaTypes); + Parser customParserDecorated = ParserDecorator.withTypes(customParser, customMediaTypes); + PARSERS[counter] = customParserDecorated; + + + + } catch (IOException e) { + logger.error("Caught IOException:" + e.getMessage()); + } catch (ClassNotFoundException e) { + logger.error("Caught ClassNotFoundException:" + e.getMessage()); + } catch (InstantiationException e) { + logger.error("Caught InstantiationException:" + e.getMessage()); + } catch (IllegalAccessException e) { + logger.error("Caught IllegalAccessException:" + e.getMessage()); + }/*catch (TikaException te) { + logger.error("Caught TikaException:" + te.getMessage()); + } catch (SAXException se) { + logger.error("Caught 
SAXException:" + se.getMessage()); + }*/ + } + if (excludeMediaTypes.size() > 0) { + MediaType[] excludedMediaTypeSet = new MediaType[excludeMediaTypes.size()]; + PARSERS[0] = ParserDecorator.withoutTypes(defaultParser, new HashSet(excludeMediaTypes)); + } else { + PARSERS[0] = defaultParser; + } + PARSERS[counter+1] = pdfParser; + parser = new AutoDetectParser(PARSERS); + } else { + Parser PARSERS[] = new Parser[2]; + PARSERS[0] = defaultParser; + PARSERS[1] = pdfParser; + parser = new AutoDetectParser(PARSERS); + } + } } From 456104c4e0a46f649e53f2a491bbb6fc2fc3b6d7 Mon Sep 17 00:00:00 2001 From: jevertz Date: Wed, 21 Feb 2018 15:12:14 +0100 Subject: [PATCH 09/11] After rebase. Added support for a custom external Tika Parser. --- .../{meta => }/settings/CustomTikaParser.java | 8 +++--- tika-config.xml | 28 ------------------- .../crawler/fs/tika/TikaInstance.java | 2 ++ 3 files changed, 6 insertions(+), 32 deletions(-) rename settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/{meta => }/settings/CustomTikaParser.java (77%) delete mode 100644 tika-config.xml diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java similarity index 77% rename from settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java rename to settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java index 5703e962e..f0ea665de 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/meta/settings/CustomTikaParser.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java @@ -1,4 +1,4 @@ -package fr.pilato.elasticsearch.crawler.fs.meta.settings; +package fr.pilato.elasticsearch.crawler.fs.settings; import java.util.ArrayList; import java.util.List; @@ -19,21 +19,21 @@ public static class Builder { private String className = ""; - 
public CustomTikaParser.Builder setClassName(String className) { + public fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser.Builder setClassName(String className) { this.className = className; return this; } private String pathToJar = ""; - public CustomTikaParser.Builder setPathToJar(String pathToJar) { + public fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser.Builder setPathToJar(String pathToJar) { this.pathToJar = pathToJar; return this; } private List mimeTypes = new ArrayList(); - public CustomTikaParser.Builder setMimeTypes(ArrayList mimeTypes) { + public fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser.Builder setMimeTypes(ArrayList mimeTypes) { this.mimeTypes = mimeTypes; return this; } diff --git a/tika-config.xml b/tika-config.xml deleted file mode 100644 index b4656e03d..000000000 --- a/tika-config.xml +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - image/jpeg - application/pdf - application/x-font-ttf - application/x-font-ttc - application/x-font-otf - - - - - application/x-font-ttf - - application/x-font-otf - false - - - application/pdf - no_ocr - - - - - \ No newline at end of file diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java index 7240d6b5d..8bf50a6fb 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java @@ -22,6 +22,7 @@ import fr.pilato.elasticsearch.crawler.fs.settings.Fs; import fr.pilato.elasticsearch.crawler.fs.settings.FsSettings; +import fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.tika.config.ServiceLoader; @@ -35,6 +36,7 @@ import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; 
+import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.ocr.TesseractOCRParser; From ed528b2d95d79558f66c191e5a4034615203d571 Mon Sep 17 00:00:00 2001 From: jevertz Date: Wed, 21 Feb 2018 16:47:54 +0100 Subject: [PATCH 10/11] Extended README to cover custom tika parsers setting --- README.md | 178 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 177 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 60c602596..13a85b441 100644 --- a/README.md +++ b/README.md @@ -622,7 +622,8 @@ Here is a list of Local FS settings (under `fs.` prefix)`: | `fs.continue_on_error` | `false` | [Continue on File Permission Error](#continue-on-error) (from 2.3) | | `fs.pdf_ocr` | `true` | [Run OCR on PDF documents](#ocr-integration) (from 2.3) | | `fs.indexed_chars` | `100000.0` | [Extracted characters](#extracted-characters) | -| `fs.checksum` | `null` | [File Checksum](#file-checksum) | +| `fs.checksum` | `null` | [File Checksum](#file-checksum) | +| `fs.custom_tika_parsers` | `null` | [Custom Tika Parsers](#custom-tika-parsers) | #### Root directory @@ -1198,6 +1199,181 @@ to compute the checksum, such as `MD5` or `SHA-1`. } ``` +#### Custom Tika Parsers + +It might occur that one or more existing Tika parsers do not provide the intended information, or just do not exist. +This setting allows to use a custom parser instead. +The parsers must be provided as a .jar, but does not need to be on any classpath. +Note that this is an array.
Here an example for just one + +```json +{ + "name": "test", + "fs": { + "custom_tika_parsers": [ + { + "class_name": "org.me.MyParser", + "path_to_jar": "/some/full/path/to/myParser-0.0.1-SNAPSHOT.jar", + "mime_types": ["application/dns", "or-another-mimetype-from-tika"] + } + ] + } +} +``` + +Some info about creating a custom parser is available [here](https://tika.apache.org/1.17/parser_guide.html) +Or use a existing parser as a blueprint. Make sure to choose the correct branch. +At the time of this writing fscrawler uses Tika 1.17, while on github the main branch is 2.x. +The parsers from ["branch_1x"](https://github.com/apache/tika/tree/branch_1x/tika-parsers/src/main/java/org/apache/tika/parser) should work fine. + +To build the custom parser separately, a pom file can be derived from the tika-parsers [pom.xml](https://github.com/apache/tika/blob/branch_1x/tika-parsers/pom.xml). +Probably a lot can be left out. Here is an example which required fontbox (guess still to long, but worked). + +
Example pom.xml +

+ + +``` + + 4.0.0 + + org.me + myParser + 0.0.1-SNAPSHOT + + + 1.7 + 1.7 + UTF-8 + 1.17 + 2.0.8 + + + + src + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.0.0-M1 + + all,-missing,-accessibility + true + + + + + + + + org.apache.tika + tika-parsers + ${tika.version} + + + + edu.ucar + netcdf + + + + edu.ucar + cdm + + + + edu.ucar + httpservices + + + + edu.ucar + grib + + + + edu.ucar + netcdf4 + + + + com.uwyn + jhighlight + + + + org.ow2.asm + asm-debug-all + + + commons-logging + commons-logging-api + + + + org.apache.cxf + cxf-rt-rs-client + + + + + org.apache.pdfbox + fontbox + ${fontbox.version} + + + + edu.ucar + netcdf + + + + edu.ucar + cdm + + + + edu.ucar + httpservices + + + + edu.ucar + grib + + + + edu.ucar + netcdf4 + + + + com.uwyn + jhighlight + + + + org.ow2.asm + asm-debug-all + + + commons-logging + commons-logging-api + + + + org.apache.cxf + cxf-rt-rs-client + + + + + +``` + +

+
### SSH settings From 05a609a74fb1000dffdb1aa4b1d971e2f5d3a1b0 Mon Sep 17 00:00:00 2001 From: jevertz Date: Wed, 28 Feb 2018 14:44:59 +0100 Subject: [PATCH 11/11] Implemented change requests https://github.com/dadoonet/fscrawler/pull/498/files/224f22e5611acb7308ac2cbe25d6a22602894773 --- README.md | 11 ++++--- .../crawler/fs/settings/CustomTikaParser.java | 33 +++++++++++++------ .../elasticsearch/crawler/fs/settings/Fs.java | 13 -------- .../fs/settings/FsSettingsParserTest.java | 16 +++++++-- .../crawler/fs/tika/TikaInstance.java | 28 +++------------- 5 files changed, 47 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 13a85b441..929befcc9 100644 --- a/README.md +++ b/README.md @@ -1204,7 +1204,7 @@ to compute the checksum, such as `MD5` or `SHA-1`. It might occur that one or more existing Tika parsers do not provide the intended information, or just do not exist. This setting allows to use a custom parser instead. The parsers must be provided as a .jar, but does not need to be on any classpath. -Note that this is an array. Here an example for just one +Note that this is an array. Here is an example for just one: ```json { @@ -1223,11 +1223,12 @@ Note that this is an array. Here an example for just one Some info about creating a custom parser is available [here](https://tika.apache.org/1.17/parser_guide.html) Or use a existing parser as a blueprint. Make sure to choose the correct branch. -At the time of this writing fscrawler uses Tika 1.17, while on github the main branch is 2.x. +At the time of this writing fscrawler uses Tika 1.17, while on GitHub the main Tika branch is 2.x. The parsers from ["branch_1x"](https://github.com/apache/tika/tree/branch_1x/tika-parsers/src/main/java/org/apache/tika/parser) should work fine. To build the custom parser separately, a pom file can be derived from the tika-parsers [pom.xml](https://github.com/apache/tika/blob/branch_1x/tika-parsers/pom.xml). -Probably a lot can be left out.
Here is an example which required fontbox (guess still to long, but worked). +Probably a lot can be left out. Here is an example which requires fontbox. +(The exclusions are copied 1:1 from fscrawler's pom.xml, to be on the safe side)
Example pom.xml

@@ -1242,8 +1243,8 @@ Probably a lot can be left out. Here is an example which required fontbox (guess 0.0.1-SNAPSHOT - 1.7 - 1.7 + 1.8 + 1.8 UTF-8 1.17 2.0.8 diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java index f0ea665de..20506f961 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java @@ -6,10 +6,8 @@ public class CustomTikaParser { private String className = ""; - private String pathToJar = ""; - - private List mimeTypes = new ArrayList(); + private ArrayList mimeTypes = new ArrayList(); public static Builder builder() { return new Builder(); @@ -18,25 +16,27 @@ public static Builder builder() { public static class Builder { private String className = ""; + private String pathToJar = ""; + private ArrayList mimeTypes = new ArrayList(); - public fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser.Builder setClassName(String className) { + public Builder setClassName(String className) { this.className = className; return this; } - private String pathToJar = ""; - - public fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser.Builder setPathToJar(String pathToJar) { + public Builder setPathToJar(String pathToJar) { this.pathToJar = pathToJar; return this; } - private List mimeTypes = new ArrayList(); - - public fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser.Builder setMimeTypes(ArrayList mimeTypes) { + public Builder setMimeTypes(ArrayList mimeTypes) { this.mimeTypes = mimeTypes; return this; } + + public CustomTikaParser build() { + return new CustomTikaParser(className, pathToJar, mimeTypes); + } } public CustomTikaParser() { @@ -74,4 +74,17 @@ public void setMimeTypes(ArrayList mimeTypes) { this.mimeTypes = mimeTypes; } + @Override + public boolean 
equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + CustomTikaParser ctp = (CustomTikaParser) o; + + if (className != null ? !className.equals(ctp.className) : ctp.className != null) return false; + if (pathToJar != null ? !pathToJar.equals(ctp.pathToJar) : ctp.pathToJar != null) return false; + return mimeTypes != null ? mimeTypes.equals(ctp.mimeTypes) : ctp.mimeTypes == null; + + } + } \ No newline at end of file diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index de7f9cc1e..ff1122a57 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -219,19 +219,6 @@ public Builder setTikaCustomParsers(List customTikaParsers) { return this; } - public Builder addTikaCustomParsers(CustomTikaParser customTikaParser) { - if (this.customTikaParsers == null) { - this.customTikaParsers = new ArrayList<>(); - } - - // We refuse to add duplicates - if (!this.customTikaParsers.contains(customTikaParser)) { - this.customTikaParsers.add(customTikaParser); - } - - return this; - } - public Fs build() { return new Fs(url, updateRate, includes, excludes, jsonSupport, filenameAsId, addFilesize, removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata, diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java index aa080f79b..05a74b83e 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java @@ -22,10 +22,13 @@ import 
fr.pilato.elasticsearch.crawler.fs.framework.Percentage; import fr.pilato.elasticsearch.crawler.fs.framework.TimeValue; import fr.pilato.elasticsearch.crawler.fs.test.framework.AbstractFSCrawlerTestCase; +import org.hamcrest.Matcher; import org.junit.Test; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.contains; @@ -41,6 +44,13 @@ public class FsSettingsParserTest extends AbstractFSCrawlerTestCase { private static final Ocr OCR_FULL = Ocr.builder().setLanguage("eng").build(); private static final Fs FS_EMPTY = Fs.builder().build(); + + private static final CustomTikaParser CTS = CustomTikaParser.builder() + .setClassName("org.test.aParser") + .setPathToJar("./a_parser.jar") + .setMimeTypes(new ArrayList<>(Arrays.asList("text/json"))) + .build(); + private static final Fs FS_FULL = Fs.builder() .setUrl("/path/to/docs") .setStoreSource(true) @@ -54,8 +64,9 @@ public class FsSettingsParserTest extends AbstractFSCrawlerTestCase { .setUpdateRate(TimeValue.timeValueMinutes(5)) .setIndexContent(true) .setOcr(OCR_FULL) - .setTikaCustomParsers(new ArrayList<>()) + .setTikaCustomParsers(new ArrayList<>(Arrays.asList(CTS))) .build(); + private static final Elasticsearch ELASTICSEARCH_EMPTY = Elasticsearch.builder().build(); private static final Elasticsearch ELASTICSEARCH_FULL = Elasticsearch.builder() .addNode(Elasticsearch.Node.builder() @@ -90,7 +101,8 @@ private void settingsTester(FsSettings source) throws IOException { logger.info("-> testing settings: [{}]", json); FsSettings generated = FsSettingsParser.fromJson(json); - assertThat(generated, is(source)); + Matcher mmatch = is(source); + assertThat(generated, mmatch); } @Test diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java index 
8bf50a6fb..a22b94a8c 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java @@ -26,7 +26,6 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.tika.config.ServiceLoader; -import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.metadata.Metadata; @@ -45,14 +44,12 @@ import org.apache.tika.sax.WriteOutContentHandler; import org.xml.sax.SAXException; -import java.io.File; + import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.net.URLClassLoader; import java.util.*; -import java.util.jar.JarEntry; -import java.util.jar.JarFile; import static org.apache.tika.langdetect.OptimaizeLangDetector.getDefaultLanguageDetector; @@ -87,8 +84,6 @@ private static void initParser(Fs fs) { PDFParser pdfParser = new PDFParser(); DefaultParser defaultParser; - - if (fs.isPdfOcr()) { logger.debug("OCR is activated for PDF documents"); if (ExternalParser.check("tesseract")) { @@ -105,14 +100,11 @@ private static void initParser(Fs fs) { Collections.singletonList(TesseractOCRParser.class)); } - // load custom parsers if defined in config if (fs.getCustomTikaParsers().size() > 0) { Integer counter = 0; Parser PARSERS[] = new Parser[fs.getCustomTikaParsers().size()+2]; - - // to collect all Mediatypes handled by custom parser to exclude them form the DefaultParser List excludeMediaTypes = new ArrayList(); @@ -130,21 +122,9 @@ private static void initParser(Fs fs) { Parser customParserDecorated = ParserDecorator.withTypes(customParser, customMediaTypes); PARSERS[counter] = customParserDecorated; - - - } catch (IOException e) { - logger.error("Caught IOException:" + e.getMessage()); - } catch (ClassNotFoundException e) { - logger.error("Caught ClassNotFoundException:" + 
e.getMessage()); - } catch (InstantiationException e) { - logger.error("Caught InstantiationException:" + e.getMessage()); - } catch (IllegalAccessException e) { - logger.error("Caught IllegalAccessException:" + e.getMessage()); - }/*catch (TikaException te) { - logger.error("Caught TikaException:" + te.getMessage()); - } catch (SAXException se) { - logger.error("Caught SAXException:" + se.getMessage()); - }*/ + } catch (IOException|ClassNotFoundException|InstantiationException|IllegalAccessException e) { + logger.error("Caught {}: {}", e.getClass().getSimpleName(), e.getMessage()); + } } if (excludeMediaTypes.size() > 0) { MediaType[] excludedMediaTypeSet = new MediaType[excludeMediaTypes.size()];