diff --git a/README.md b/README.md index 60c602596..929befcc9 100644 --- a/README.md +++ b/README.md @@ -622,7 +622,8 @@ Here is a list of Local FS settings (under `fs.` prefix)`: | `fs.continue_on_error` | `false` | [Continue on File Permission Error](#continue-on-error) (from 2.3) | | `fs.pdf_ocr` | `true` | [Run OCR on PDF documents](#ocr-integration) (from 2.3) | | `fs.indexed_chars` | `100000.0` | [Extracted characters](#extracted-characters) | -| `fs.checksum` | `null` | [File Checksum](#file-checksum) | +| `fs.checksum` | `null` | [File Checksum](#file-checksum) | +| `fs.custom_tika_parsers` | `null` | [Custom Tika Parsers](#custom-tika-parsers) | #### Root directory @@ -1198,6 +1199,182 @@ to compute the checksum, such as `MD5` or `SHA-1`. } ``` +#### Custom Tika Parsers + +It may happen that one or more existing Tika parsers do not provide the intended information, or that no suitable parser exists at all. +This setting allows you to use a custom parser instead. +Each parser must be provided as a .jar file, but it does not need to be on any classpath. +Note that this setting is an array. Here is an example with just one parser: + +```json +{ + "name": "test", + "fs": { + "custom_tika_parsers": [ + { + "class_name": "org.me.MyParser", + "path_to_jar": "/some/full/path/to/myParser-0.0.1-SNAPSHOT.jar", + "mime_types": ["application/dns", "or-another-mimetype-from-tika"] + } + ] + } +} +``` + +Some information about creating a custom parser is available [here](https://tika.apache.org/1.17/parser_guide.html). +Alternatively, use an existing parser as a blueprint. Make sure to choose the correct branch. +At the time of this writing, fscrawler uses Tika 1.17, while on GitHub the main Tika branch is 2.x. +The parsers from ["branch_1x"](https://github.com/apache/tika/tree/branch_1x/tika-parsers/src/main/java/org/apache/tika/parser) should work fine. + +To build the custom parser separately, a pom file can be derived from the tika-parsers [pom.xml](https://github.com/apache/tika/blob/branch_1x/tika-parsers/pom.xml). 
+Most of it can probably be left out. Here is an example that requires fontbox. +(The exclusions are copied 1:1 from fscrawler's pom.xml, to be on the safe side.) + 
Example pom.xml +

+ + +``` + + 4.0.0 + + org.me + myParser + 0.0.1-SNAPSHOT + + + 1.8 + 1.8 + UTF-8 + 1.17 + 2.0.8 + + + + src + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.0.0-M1 + + all,-missing,-accessibility + true + + + + + + + + org.apache.tika + tika-parsers + ${tika.version} + + + + edu.ucar + netcdf + + + + edu.ucar + cdm + + + + edu.ucar + httpservices + + + + edu.ucar + grib + + + + edu.ucar + netcdf4 + + + + com.uwyn + jhighlight + + + + org.ow2.asm + asm-debug-all + + + commons-logging + commons-logging-api + + + + org.apache.cxf + cxf-rt-rs-client + + + + + org.apache.pdfbox + fontbox + ${fontbox.version} + + + + edu.ucar + netcdf + + + + edu.ucar + cdm + + + + edu.ucar + httpservices + + + + edu.ucar + grib + + + + edu.ucar + netcdf4 + + + + com.uwyn + jhighlight + + + + org.ow2.asm + asm-debug-all + + + commons-logging + commons-logging-api + + + + org.apache.cxf + cxf-rt-rs-client + + + + + +``` + +

+
### SSH settings diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java new file mode 100644 index 000000000..20506f961 --- /dev/null +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/CustomTikaParser.java @@ -0,0 +1,90 @@ +package fr.pilato.elasticsearch.crawler.fs.settings; + +import java.util.ArrayList; +import java.util.List; + +public class CustomTikaParser { + + private String className = ""; + private String pathToJar = ""; + private ArrayList mimeTypes = new ArrayList(); + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + + private String className = ""; + private String pathToJar = ""; + private ArrayList mimeTypes = new ArrayList(); + + public Builder setClassName(String className) { + this.className = className; + return this; + } + + public Builder setPathToJar(String pathToJar) { + this.pathToJar = pathToJar; + return this; + } + + public Builder setMimeTypes(ArrayList mimeTypes) { + this.mimeTypes = mimeTypes; + return this; + } + + public CustomTikaParser build() { + return new CustomTikaParser(className, pathToJar, mimeTypes); + } + } + + public CustomTikaParser() { + + } + + private CustomTikaParser(String className, String pathToJar, ArrayList mimeTypes) { + + this.className = className; + this.pathToJar = pathToJar; + this.mimeTypes = mimeTypes; + } + + public String getClassName() { + return className; + } + + public void setClassName(String className) { + this.className = className; + } + + public String getPathToJar() { + return pathToJar; + } + + public void setPathToJar(String pathToJar) { + this.pathToJar = pathToJar; + } + + public List getMimeTypes() { + return mimeTypes; + } + + public void setMimeTypes(ArrayList mimeTypes) { + this.mimeTypes = mimeTypes; + } + + @Override + public boolean equals(Object o) { + if (this == o) 
return true; + if (o == null || getClass() != o.getClass()) return false; + + CustomTikaParser ctp = (CustomTikaParser) o; + + if (className != null ? !className.equals(ctp.className) : ctp.className != null) return false; + if (pathToJar != null ? !pathToJar.equals(ctp.pathToJar) : ctp.pathToJar != null) return false; + return mimeTypes != null ? mimeTypes.equals(ctp.mimeTypes) : ctp.mimeTypes == null; + + } + +} \ No newline at end of file diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index 07fa5a2eb..ff1122a57 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -49,6 +49,7 @@ public class Fs { private boolean continueOnError = false; private boolean pdfOcr = true; private Ocr ocr = new Ocr(); + private List customTikaParsers = new ArrayList<>(); public static Builder builder() { return new Builder(); @@ -80,6 +81,7 @@ public static class Builder { private boolean continueOnError = false; private boolean pdfOcr = true; private Ocr ocr = new Ocr(); + private List customTikaParsers = new ArrayList<>(); public Builder setUrl(String url) { this.url = url; @@ -212,10 +214,15 @@ public Builder setOcr(Ocr ocr) { return this; } + public Builder setTikaCustomParsers(List customTikaParsers) { + this.customTikaParsers = customTikaParsers; + return this; + } + public Fs build() { return new Fs(url, updateRate, includes, excludes, jsonSupport, filenameAsId, addFilesize, removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata, - checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr); + checksum, xmlSupport, indexFolders, langDetect, continueOnError, pdfOcr, ocr, customTikaParsers); } } @@ -226,7 +233,7 @@ public Fs( ) { private Fs(String url, TimeValue updateRate, List 
includes, List excludes, boolean jsonSupport, boolean filenameAsId, boolean addFilesize, boolean removeDeleted, boolean addAsInnerObject, boolean storeSource, Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport, - boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr) { + boolean indexFolders, boolean langDetect, boolean continueOnError, boolean pdfOcr, Ocr ocr, List customTikaParsers) { this.url = url; this.updateRate = updateRate; this.includes = includes; @@ -248,6 +255,7 @@ private Fs(String url, TimeValue updateRate, List includes, List this.continueOnError = continueOnError; this.pdfOcr = pdfOcr; this.ocr = ocr; + this.customTikaParsers = customTikaParsers; } public String getUrl() { @@ -418,6 +426,14 @@ public void setOcr(Ocr ocr) { this.ocr = ocr; } + public List getCustomTikaParsers() { + return customTikaParsers; + } + + public void setCustomTikaParsers(List customTikaParsers) { + this.customTikaParsers = customTikaParsers; + } + @Override public boolean equals(Object o) { if (this == o) return true; @@ -444,6 +460,7 @@ public boolean equals(Object o) { if (includes != null ? !includes.equals(fs.includes) : fs.includes != null) return false; if (excludes != null ? !excludes.equals(fs.excludes) : fs.excludes != null) return false; if (indexedChars != null ? !indexedChars.equals(fs.indexedChars) : fs.indexedChars != null) return false; + if (customTikaParsers != null ? !customTikaParsers.equals(fs.customTikaParsers) : fs.customTikaParsers != null) return false; return checksum != null ? checksum.equals(fs.checksum) : fs.checksum == null; } @@ -470,6 +487,7 @@ public int hashCode() { result = 31 * result + (langDetect ? 1 : 0); result = 31 * result + (continueOnError ? 1 : 0); result = 31 * result + (pdfOcr ? 1 : 0); + result = 31 * result + (customTikaParsers != null ? 
customTikaParsers.hashCode() : 0); return result; } } diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java index a90232d64..05a74b83e 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java @@ -22,9 +22,13 @@ import fr.pilato.elasticsearch.crawler.fs.framework.Percentage; import fr.pilato.elasticsearch.crawler.fs.framework.TimeValue; import fr.pilato.elasticsearch.crawler.fs.test.framework.AbstractFSCrawlerTestCase; +import org.hamcrest.Matcher; import org.junit.Test; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.contains; @@ -40,6 +44,13 @@ public class FsSettingsParserTest extends AbstractFSCrawlerTestCase { private static final Ocr OCR_FULL = Ocr.builder().setLanguage("eng").build(); private static final Fs FS_EMPTY = Fs.builder().build(); + + private static final CustomTikaParser CTS = CustomTikaParser.builder() + .setClassName("org.test.aParser") + .setPathToJar("./a_parser.jar") + .setMimeTypes(new ArrayList<>(Arrays.asList("text/json"))) + .build(); + private static final Fs FS_FULL = Fs.builder() .setUrl("/path/to/docs") .setStoreSource(true) @@ -53,7 +64,9 @@ public class FsSettingsParserTest extends AbstractFSCrawlerTestCase { .setUpdateRate(TimeValue.timeValueMinutes(5)) .setIndexContent(true) .setOcr(OCR_FULL) + .setTikaCustomParsers(new ArrayList<>(Arrays.asList(CTS))) .build(); + private static final Elasticsearch ELASTICSEARCH_EMPTY = Elasticsearch.builder().build(); private static final Elasticsearch ELASTICSEARCH_FULL = Elasticsearch.builder() .addNode(Elasticsearch.Node.builder() @@ -88,7 +101,8 @@ 
private void settingsTester(FsSettings source) throws IOException { logger.info("-> testing settings: [{}]", json); FsSettings generated = FsSettingsParser.fromJson(json); - assertThat(generated, is(source)); + Matcher mmatch = is(source); + assertThat(generated, mmatch); } @Test diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java index 524686b69..d326e8670 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java @@ -22,17 +22,20 @@ import fr.pilato.elasticsearch.crawler.fs.settings.Fs; import fr.pilato.elasticsearch.crawler.fs.settings.FsSettings; +import fr.pilato.elasticsearch.crawler.fs.settings.CustomTikaParser; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.ocr.TesseractOCRParser; @@ -43,7 +46,9 @@ import java.io.IOException; import java.io.InputStream; -import java.util.Collections; +import java.net.URL; +import java.net.URLClassLoader; +import java.util.*; import static org.apache.tika.langdetect.OptimaizeLangDetector.getDefaultLanguageDetector; @@ -94,11 +99,47 @@ private static void initParser(Fs fs) { Collections.singletonList(TesseractOCRParser.class)); } - 
Parser PARSERS[] = new Parser[2]; - PARSERS[0] = defaultParser; - PARSERS[1] = pdfParser; + // load custom parsers if defined in config + if (fs.getCustomTikaParsers().size() > 0) { + Integer counter = 0; + Parser PARSERS[] = new Parser[fs.getCustomTikaParsers().size()+2]; + + // to collect all Mediatypes handled by custom parser to exclude them form the DefaultParser + List excludeMediaTypes = new ArrayList(); + + + for (CustomTikaParser customTikaParser : fs.getCustomTikaParsers()) { + counter += 1; + try { + URL[] jarUrl = { new URL("jar:file:" + customTikaParser.getPathToJar()+"!/") }; + URLClassLoader urlClassLoader = URLClassLoader.newInstance(jarUrl); + Class customParserClass = urlClassLoader.loadClass(customTikaParser.getClassName()); + Parser customParser = (Parser) customParserClass.newInstance(); + String[] customMimeStrings = new String[customTikaParser.getMimeTypes().size()]; + Set customMediaTypes = MediaType.set(customTikaParser.getMimeTypes().toArray(customMimeStrings)); + excludeMediaTypes.addAll(customMediaTypes); + Parser customParserDecorated = ParserDecorator.withTypes(customParser, customMediaTypes); + PARSERS[counter] = customParserDecorated; + + } catch (IOException|ClassNotFoundException|InstantiationException|IllegalAccessException e) { + logger.error("Caught {}: {}", e.getClass().getSimpleName(), e.getMessage()); + } + } + if (excludeMediaTypes.size() > 0) { + MediaType[] excludedMediaTypeSet = new MediaType[excludeMediaTypes.size()]; + PARSERS[0] = ParserDecorator.withoutTypes(defaultParser, new HashSet(excludeMediaTypes)); + } else { + PARSERS[0] = defaultParser; + } + PARSERS[counter+1] = pdfParser; + parser = new AutoDetectParser(PARSERS); + } else { + Parser PARSERS[] = new Parser[2]; + PARSERS[0] = defaultParser; + PARSERS[1] = pdfParser; + parser = new AutoDetectParser(PARSERS); + } - parser = new AutoDetectParser(PARSERS); } }