[GECO-122] Formatted the file

diging · abhis1989kumar · Sep 13, 2018 · Oct 2, 2018 · Oct 12, 2018 · Oct 16, 2018
commit 3a0479f4add0b95cd85aae7551875411ee6fa535
diff --git a/cassiopeia/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/cassiopeia/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -86,297 +86,290 @@
  */
 public class TesseractOCRParser extends AbstractParser {
 
-    @Autowired
-    private ISystemMessageHandler messageHandler;
-
-    private static final long serialVersionUID = -8167538283213097265L;
-    private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
-    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
-            new HashSet<MediaType>(Arrays.asList(new MediaType[] {
-                    MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
-                    MediaType.image("x-ms-bmp"), MediaType.image("gif")
-            })));
-    private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();
-
-    private boolean createHOCR = false;
-
-    public TesseractOCRParser(boolean createHOCR) {
-        this.createHOCR = createHOCR;
-    }
-
-    @Override
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        // If Tesseract is installed, offer our supported image types
-        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
-        if (hasTesseract(config))
-            return SUPPORTED_TYPES;
-
-        // Otherwise don't advertise anything, so the other image parsers
-        //  can be selected instead
-        return Collections.emptySet();
-    }
-
-    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
-        String tessdataPrefix = "TESSDATA_PREFIX";
-        Map<String, String> env = pb.environment();
-
-        if (!config.getTessdataPath().isEmpty()) {
-            env.put(tessdataPrefix, config.getTessdataPath());
-        }
-        else if(!config.getTesseractPath().isEmpty()) {
-            env.put(tessdataPrefix, config.getTesseractPath());
-        }
-    }
-
-    private boolean hasTesseract(TesseractOCRConfig config) {
-        // Fetch where the config says to find Tesseract
-        String tesseract = config.getTesseractPath() + getTesseractProg();
-
-        // Have we already checked for a copy of Tesseract there?
-        if (TESSERACT_PRESENT.containsKey(tesseract)) {
-            return TESSERACT_PRESENT.get(tesseract);
-        }
-
-        // Try running Tesseract from there, and see if it exists + works
-        String[] checkCmd = { tesseract };
-        boolean hasTesseract = ExternalParser.check(checkCmd);
-        TESSERACT_PRESENT.put(tesseract, hasTesseract);
-        return hasTesseract;
-
-    }
-
-    public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
-            SAXException, TikaException {
-
-        TemporaryResources tmp = new TemporaryResources();
-        FileOutputStream fos = null;
-        TikaInputStream tis = null;
-        try {
-            int w = image.getWidth(null);
-            int h = image.getHeight(null);
-            BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
-            File file = tmp.createTemporaryFile();
-            fos = new FileOutputStream(file);
-            ImageIO.write(bImage, "png", fos);
-            tis = TikaInputStream.get(file);
-            parse(tis, handler, metadata, context);
-
-        } finally {
-            tmp.dispose();
-            if (tis != null)
-                tis.close();
-            if (fos != null)
-                fos.close();
-        }
-
-    }
-
-    @Override
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
-
-        // If Tesseract is not on the path with the current config, do not try to run OCR
-        // getSupportedTypes shouldn't have listed us as handling it, so this should only
-        //  occur if someone directly calls this parser, not via DefaultParser or similar
-        if (! hasTesseract(config))
-            return;
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
-        TemporaryResources tmp = new TemporaryResources();
-        File output = null;
-        try {
-            TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
-            File input = tikaStream.getFile();
-            long size = tikaStream.getLength();
-
-            if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
-
-                output = tmp.createTemporaryFile();
-                doOCR(input, output, config);
-
-                // determine file extension
-                String fileExtension = ".txt";
-                if (createHOCR) {
-                    fileExtension = ".hocr";
-                }
-                output = new File(output.getAbsolutePath() + fileExtension);
-
-                if (output.exists())
-                    extractOutput(new FileInputStream(output), xhtml);
-
-            }
-
-            // Temporary workaround for TIKA-1445 - until we can specify
-            //  composite parsers with strategies (eg Composite, Try In Turn),
-            //  always send the image onwards to the regular parser to have
-            //  the metadata for them extracted as well
-            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
-        } finally {
-            tmp.dispose();
-            if (output != null) {
-                output.delete();
-            }
-        }
-    }
-    // TIKA-1445 workaround parser
-    private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
-    private static class CompositeImageParser extends CompositeParser {
-        private static final long serialVersionUID = -2398203346206381382L;
-        private static List<Parser> imageParsers = Arrays.asList(new Parser[]{
-                new ImageParser(), new JpegParser(), new TiffParser()
-        });
-        CompositeImageParser() {
-            super(new MediaTypeRegistry(), imageParsers);
-        }
-    }
-
-    /**
-     * Run external tesseract-ocr process.
-     *
-     * @param input
-     *          File to be ocred
-     * @param output
-     *          File to collect ocr result
-     * @param config
-     *          Configuration of tesseract-ocr engine
-     * @throws TikaException
-     *           if the extraction timed out
-     * @throws IOException
-     *           if an input error occurred
-     */
-    private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
-
-        List<String> cmd = new ArrayList<>(Arrays.asList(config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
-            config.getLanguage(), "-psm", config.getPageSegMode()));
-
-        if (createHOCR) {
-            cmd.add("hocr");
-        }
-
-        ProcessBuilder pb = new ProcessBuilder(cmd.toArray(new String[cmd.size()]));
-        setEnv(config, pb);
-        final Process process = pb.start();
-
-        process.getOutputStream().close();
-        InputStream out = process.getInputStream();
-        InputStream err = process.getErrorStream();
-
-        logStream("OCR MSG", out, input);
-        logStream("OCR ERROR", err, input);
-
-        FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
-            public Integer call() throws Exception {
-                return process.waitFor();
-            }
-        });
-
-        Thread waitThread = new Thread(waitTask);
-        waitThread.start();
-
-        try {
-            waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
-
-        } catch (InterruptedException e) {
-            waitThread.interrupt();
-            process.destroy();
-            Thread.currentThread().interrupt();
-            throw new TikaException("TesseractOCRParser interrupted", e);
-
-        } catch (ExecutionException e) {
-            // should not be thrown
-            messageHandler.handleMessage("TesseractOCRParser attempting to retrive result of aborted task.", e, MessageType.ERROR);
-        } catch (TimeoutException e) {
-            waitThread.interrupt();
-            process.destroy();
-            throw new TikaException("TesseractOCRParser timeout", e);
-        }
-
-    }
-
-    /**
-     * Reads the contents of the given stream and write it to the given XHTML
-     * content handler. The stream is closed once fully processed.
-     *
-     * @param stream
-     *          Stream where is the result of ocr
-     * @param xhtml
-     *          XHTML content handler
-     * @throws SAXException
-     *           if the XHTML SAX events could not be handled
-     * @throws IOException
-     *           if an input error occurred
-     */
-    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
-
-        xhtml.startDocument();
-        xhtml.startElement("div");
-        try (Reader reader = new InputStreamReader(stream, UTF_8)) {
-            char[] buffer = new char[1024];
-            for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
-                if (n > 0)
-                    xhtml.characters(buffer, 0, n);
-            }
-        }
-        xhtml.endElement("div");
-        xhtml.endDocument();
-    }
-
-    /**
-     * Starts a thread that reads the contents of the standard output or error
-     * stream of the given process to not block the process. The stream is closed
-     * once fully processed.
-     */
-    private void logStream(final String logType, final InputStream stream, final File file) {
-        new Thread() {
-            public void run() {
-                Reader reader = new InputStreamReader(stream, UTF_8);
-                StringBuilder out = new StringBuilder();
-                char[] buffer = new char[1024];
-                try {
-                    for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
-                        out.append(buffer, 0, n);
-                } catch (IOException e) {
-                    messageHandler.handleMessage("Could not read input stream.", e, MessageType.ERROR);
-                } finally {
-                    IOUtils.closeQuietly(stream);
-                }
-
-                String msg = out.toString();
-                LogFactory.getLog(TesseractOCRParser.class).debug(msg);
-            }
-        }.start();
-    }
-
-    static String getTesseractProg() {
-        return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
-    }
-
-    public String[] getTessLangs(IPropertiesManager propertyManager) {
-    	String tesseractBin = propertyManager.getProperty(Properties.TESSERACT_BIN_FOLDER);
-    	String command = tesseractBin + "/tesseract --list-langs";
-        Process proc;
-        BufferedReader reader;
-        String output = "";
-        String[] lang_list;
-        String[] languages;
-        try {
+	@Autowired
+	private ISystemMessageHandler messageHandler; // sink for the error reports raised while running Tesseract
+
+	private static final long serialVersionUID = -8167538283213097265L;
+	private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig(); // fallback config for a ParseContext
+	private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+			new HashSet<MediaType>(Arrays.asList(new MediaType[] { MediaType.image("png"), MediaType.image("jpeg"),
+					MediaType.image("tiff"), MediaType.image("x-ms-bmp"), MediaType.image("gif") })));
+	private static Map<String, Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>(); // executable path -> probe result
+	// When true, Tesseract is asked for hOCR output and the ".hocr" result file is read instead of ".txt".
+	private boolean createHOCR = false;
+
+	public TesseractOCRParser(boolean createHOCR) { // createHOCR: request hOCR output instead of plain text
+		this.createHOCR = createHOCR;
+	}
+
+	@Override
+	public Set<MediaType> getSupportedTypes(ParseContext context) {
+		// Offer our image types only if Tesseract runs with the context's config (DEFAULT_CONFIG is the fallback)
+		TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+		if (hasTesseract(config))
+			return SUPPORTED_TYPES;
+
+		// Otherwise don't advertise anything, so the other image parsers
+		// can be selected instead
+		return Collections.emptySet();
+	}
+
+	private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) { // sets TESSDATA_PREFIX for the child process
+		String tessdataPrefix = "TESSDATA_PREFIX";
+		Map<String, String> env = pb.environment();
+		// Prefer the explicit tessdata path, else use the Tesseract path; if both are empty the variable is left as is.
+		if (!config.getTessdataPath().isEmpty()) {
+			env.put(tessdataPrefix, config.getTessdataPath());
+		} else if (!config.getTesseractPath().isEmpty()) {
+			env.put(tessdataPrefix, config.getTesseractPath());
+		}
+	}
+
+	private boolean hasTesseract(TesseractOCRConfig config) { // true if the configured executable passes the probe
+		// Fetch where the config says to find Tesseract
+		String tesseract = config.getTesseractPath() + getTesseractProg();
+		// NOTE(review): TESSERACT_PRESENT is a plain static HashMap with no locking here - confirm single-threaded use
+		// Have we already checked for a copy of Tesseract there?
+		if (TESSERACT_PRESENT.containsKey(tesseract)) {
+			return TESSERACT_PRESENT.get(tesseract);
+		}
+
+		// Try running Tesseract from there, and see if it exists + works; the outcome is cached per path
+		String[] checkCmd = { tesseract };
+		boolean hasTesseract = ExternalParser.check(checkCmd);
+		TESSERACT_PRESENT.put(tesseract, hasTesseract);
+		return hasTesseract;
+
+	}
+
+	public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
+			throws IOException, SAXException, TikaException { // OCRs an AWT image by way of a temporary PNG file
+		TemporaryResources tmp = new TemporaryResources();
+		FileOutputStream fos = null;
+		TikaInputStream tis = null;
+		try {
+			int w = image.getWidth(null);
+			int h = image.getHeight(null);
+			BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
+			java.awt.Graphics g = bImage.getGraphics(); // a new BufferedImage is blank: paint the source into it
+			g.drawImage(image, 0, 0, null); // null observer: assumes the image is already fully loaded
+			g.dispose();
+			File file = tmp.createTemporaryFile();
+			fos = new FileOutputStream(file);
+			ImageIO.write(bImage, "png", fos);
+			tis = TikaInputStream.get(file);
+			parse(tis, handler, metadata, context); // delegate to the stream-based parse on the PNG
+		} finally {
+			tmp.dispose();
+			if (tis != null)
+				tis.close();
+			if (fos != null)
+				fos.close();
+		}
+	}
+
+	@Override
+	public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+			throws IOException, SAXException, TikaException { // OCRs the stream, then runs the plain image parsers
+		TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+
+		// If Tesseract is not available with the current config, do not try to run
+		// OCR at all: return without emitting anything to the handler.
+		// getSupportedTypes shouldn't have listed us as handling the type, so this
+		// should only occur if someone calls this parser directly, not via
+		// DefaultParser or similar.
+		if (!hasTesseract(config))
+			return;
+
+		XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+		TemporaryResources tmp = new TemporaryResources();
+		File output = null;
+		try {
+			TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+			File input = tikaStream.getFile();
+			long size = tikaStream.getLength();
+			// Only OCR inputs whose size lies within the configured [min, max] bounds (both inclusive).
+			if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
+				// This temp file is handed to Tesseract as the output base name.
+				output = tmp.createTemporaryFile();
+				doOCR(input, output, config);
+
+				// determine file extension (presumably Tesseract appends it to the base name it was given)
+				String fileExtension = ".txt";
+				if (createHOCR) {
+					fileExtension = ".hocr";
+				}
+				output = new File(output.getAbsolutePath() + fileExtension);
+
+				if (output.exists())
+					extractOutput(new FileInputStream(output), xhtml);
+
+			}
+
+			// Temporary workaround for TIKA-1445 - until we can specify
+			// composite parsers with strategies (eg Composite, Try In Turn),
+			// always send the image onwards to the regular parser to have
+			// the metadata for them extracted as well
+			_TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
+		} finally {
+			tmp.dispose();
+			if (output != null) {
+				output.delete(); // the ".txt"/".hocr" file was not created through tmp, so remove it explicitly
+			}
+		}
+	}
+
+	// TIKA-1445 workaround parser: invoked after OCR so the image's own metadata is still extracted
+	private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
+
+	private static class CompositeImageParser extends CompositeParser { // wraps the three plain image parsers below
+		private static final long serialVersionUID = -2398203346206381382L;
+		private static List<Parser> imageParsers = Arrays
+				.asList(new Parser[] { new ImageParser(), new JpegParser(), new TiffParser() });
+
+		CompositeImageParser() {
+			super(new MediaTypeRegistry(), imageParsers);
+		}
+	}
+
+	/**
+	 * Runs the external tesseract-ocr process and waits for it, for at most {@code config.getTimeout()} seconds.
+	 *
+	 * @param input  File to be ocred
+	 * @param output Output base path handed to Tesseract (the caller appends ".txt" / ".hocr" to read the result)
+	 * @param config Configuration of tesseract-ocr engine
+	 * @throws TikaException if the wait was interrupted or the extraction timed out
+	 * @throws IOException   if the process could not be started or its stdin could not be closed
+	 */
+	private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
+		// Command line: <tesseract> <input> <output base> -l <language> -psm <page segmentation mode> [hocr]
+		List<String> cmd = new ArrayList<>(Arrays.asList(config.getTesseractPath() + getTesseractProg(),
+				input.getPath(), output.getPath(), "-l", config.getLanguage(), "-psm", config.getPageSegMode()));
+		// NOTE(review): newer Tesseract releases appear to spell this option "--psm" - confirm against the installed version
+		if (createHOCR) {
+			cmd.add("hocr");
+		}
+
+		ProcessBuilder pb = new ProcessBuilder(cmd.toArray(new String[cmd.size()]));
+		setEnv(config, pb);
+		final Process process = pb.start();
+		// Nothing is written to the child's stdin, so close it straight away.
+		process.getOutputStream().close();
+		InputStream out = process.getInputStream();
+		InputStream err = process.getErrorStream();
+		// Drain stdout and stderr on background threads so the child is not blocked by unread output.
+		logStream("OCR MSG", out, input);
+		logStream("OCR ERROR", err, input);
+		// Wait for the process on a helper thread so that the wait below can be bounded by the timeout.
+		FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
+			public Integer call() throws Exception {
+				return process.waitFor();
+			}
+		});
+
+		Thread waitThread = new Thread(waitTask);
+		waitThread.start();
+
+		try {
+			waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+
+		} catch (InterruptedException e) {
+			waitThread.interrupt();
+			process.destroy();
+			Thread.currentThread().interrupt();
+			throw new TikaException("TesseractOCRParser interrupted", e);
+
+		} catch (ExecutionException e) {
+			// should not be thrown; if it is, it is only reported and doOCR returns normally
+			messageHandler.handleMessage("TesseractOCRParser attempting to retrive result of aborted task.", e,
+					MessageType.ERROR);
+		} catch (TimeoutException e) {
+			waitThread.interrupt();
+			process.destroy();
+			throw new TikaException("TesseractOCRParser timeout", e);
+		}
+
+	}
+
+	/**
+	 * Reads the given stream as UTF-8 text and writes it to the given XHTML content
+	 * handler inside a single {@code div}. The stream is closed once fully processed.
+	 *
+	 * @param stream Stream holding the OCR result
+	 * @param xhtml  XHTML content handler
+	 * @throws SAXException if the XHTML SAX events could not be handled
+	 * @throws IOException  if an input error occurred
+	 */
+	private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
+		// The stream is closed by the try-with-resources below, i.e. only once the two opening events have succeeded.
+		xhtml.startDocument();
+		xhtml.startElement("div");
+		try (Reader reader = new InputStreamReader(stream, UTF_8)) {
+			char[] buffer = new char[1024];
+			for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+				if (n > 0)
+					xhtml.characters(buffer, 0, n);
+			}
+		}
+		xhtml.endElement("div");
+		xhtml.endDocument();
+	}
+
+	/**
+	 * Starts a thread that drains {@code stream} (a process's standard output or error) so the process is not
+	 * blocked by it, closes the stream, then logs the collected text at debug level. Read errors are reported
+	 * through {@code messageHandler}. {@code logType} and {@code file} are not used by the body at present.
+	 */
+	private void logStream(final String logType, final InputStream stream, final File file) {
+		new Thread() {
+			public void run() {
+				Reader reader = new InputStreamReader(stream, UTF_8);
+				StringBuilder out = new StringBuilder();
+				char[] buffer = new char[1024];
+				try {
+					for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
+						out.append(buffer, 0, n);
+				} catch (IOException e) {
+					messageHandler.handleMessage("Could not read input stream.", e, MessageType.ERROR);
+				} finally {
+					IOUtils.closeQuietly(stream);
+				}
+				// Whatever was read (possibly nothing) is logged as one debug message.
+				String msg = out.toString();
+				LogFactory.getLog(TesseractOCRParser.class).debug(msg);
+			}
+		}.start();
+	}
+
+	static String getTesseractProg() { // executable file name, chosen from the "os.name" system property
+		return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
+	}
+
+	public String[] getTessLangs(IPropertiesManager propertyManager) { // languages printed by "tesseract --list-langs"
+		String tesseractBin = propertyManager.getProperty(Properties.TESSERACT_BIN_FOLDER);
+		String[] command = { tesseractBin + "/tesseract", "--list-langs" }; // array form: exec(String) splits on spaces
+		Process proc;
+		BufferedReader reader;
+		String output = ""; // all stdout lines joined with single spaces
+		String[] lang_list;
+		String[] languages;
+		try {
 			proc = Runtime.getRuntime().exec(command);
-			reader =  new BufferedReader(new InputStreamReader(proc.getInputStream()));
+			reader = new BufferedReader(new InputStreamReader(proc.getInputStream()));
 			String line = "";
-	        while((line = reader.readLine()) != null) {
-	        	output = output + line + " ";
-	        }
-	        proc.waitFor();   
+			while ((line = reader.readLine()) != null) {
+				output = output + line + " ";
+			}
+			proc.waitFor();
 		} catch (IOException e) {
-            messageHandler.handleMessage("Error while getting Tesserract languages.", e, MessageType.ERROR);
+			messageHandler.handleMessage("Error while getting Tesserract languages.", e, MessageType.ERROR);
 		} catch (InterruptedException e) {
-            messageHandler.handleMessage("Error while getting Tesserract languages.", e, MessageType.ERROR);
+			messageHandler.handleMessage("Error while getting Tesserract languages.", e, MessageType.ERROR);
 		}
-        lang_list = output.split(":");
-        languages = lang_list[1].split(" ");
-        return languages;
-    }
+		lang_list = output.split(":"); // the names are expected after the ':' of the listing's header line
+		languages = lang_list.length > 1 ? lang_list[1].split(" ") : new String[0]; // no ':' (e.g. command failed): empty
+		return languages; // NOTE(review): element 0 is "" when the text after ':' starts with a space - confirm callers
+	}
 
 }