Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Story/geco 122 #23

Open
wants to merge 36 commits into
base: develop
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
7380633
[GECO-103] Check added for Null values in File Download URL.
Sep 13, 2018
5acc28e
[GECO-103] Formatting the code
Oct 2, 2018
2998f50
[GECO-103] removing the null check code which is not required.
Oct 12, 2018
ead6133
[GECO-122] Added functionality to get the tesseract languages and sho…
Oct 16, 2018
4184509
[GECO-122] Formatted the getTessLangs() Function
Oct 16, 2018
8cc293e
[GECO-122] Making the default language display as English
Oct 18, 2018
d60e1af
[GECO-122] Fixing the code formatting
Oct 18, 2018
7ecc269
[GECO-122] Removing unused variable
Oct 18, 2018
4b29304
[GECO-122] Reset the formatting style of OCRManager.java to original
Oct 18, 2018
a1724fb
[GECO-122] Removing unused imports
Oct 23, 2018
2264cd7
[GECO-122] Made the ocrTypeMap and langTypeMap private
Oct 23, 2018
a50169a
[GECO-122] Moved the function getTessLangs() to TesseractOCRParser.ja…
Oct 23, 2018
0892cc9
[GECO-122] Removing unnecessary spaces
Oct 23, 2018
999a32e
[GECO-122] Removed unnecessary spaces
Oct 23, 2018
3332dee
[GECO-122] adding newline at the end of the class
Oct 23, 2018
3f3811f
[GECO-122] Removed unnecessary spaces
Oct 23, 2018
3eebf3f
[GECO-122] Removed unnecessary spaces
Oct 23, 2018
3a0479f
[GECO-122] Formatted the file
Oct 23, 2018
5944bdc
[GECO-122] Formatted the code
Oct 23, 2018
e3ab167
[GECO-122] Formatted the code
Oct 23, 2018
41e1441
[GECO-122] Made the defaultLang variable private
Oct 25, 2018
0dce4c8
[GECO-122] Formatted the files and made some canges in getConfigPage …
Oct 25, 2018
4ef85ed
[GECO-103] Handling the wrongly autowired annotation used in non -Spr…
Oct 25, 2018
da2eb3b
[GECO-122] BeanUtil class added to call SystemMessageHandler Bean in …
Oct 26, 2018
7c0c55a
[GECO-122] Formatted Indentation
Oct 26, 2018
3a9e2bc
[GECO-122] Removing the extra class Beanutils.java and passing the Sy…
Oct 30, 2018
d91addf
Delete BeanUtil.java
abhis1989kumar Oct 30, 2018
200876b
[GECO-122] changed the return type of getTessLangs() tp List of Strings
Nov 1, 2018
227feed
[GECO-122] Merged the declarationa nd initialisation in one line of t…
Nov 1, 2018
f556367
[GECO-122] Removed unnecessary SystemHandler variable
Nov 1, 2018
b5b2f59
[GECO-122] Formatting
Nov 1, 2018
40e6d31
Merge branch 'story/GECO-122' of https://github.com/diging/giles-eco-…
Nov 1, 2018
5dbf232
[GECO-122] finally block inserted
Nov 1, 2018
9151dbf
[GECO-122]Added comments
Nov 1, 2018
3cbd6a9
[GECO-122] Moved the code,for setting up of the default language in p…
Nov 2, 2018
ce0446d
[GECO-122] Using TesseractOCRConfig instead of propertyManager in get…
Nov 6, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[GECO-122] Formatted the file
Abhishek Kumar committed Oct 23, 2018

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 3a0479f4add0b95cd85aae7551875411ee6fa535
Original file line number Diff line number Diff line change
@@ -86,297 +86,290 @@
*/
public class TesseractOCRParser extends AbstractParser {

// Handler through which this class reports errors (see the handleMessage calls
// made with MessageType.ERROR elsewhere in this class).
// NOTE(review): @Autowired only takes effect when Spring injects this instance;
// the public constructor suggests instances may be created manually, in which
// case this field would stay null — confirm how instances are built.
@Autowired
private ISystemMessageHandler messageHandler;

private static final long serialVersionUID = -8167538283213097265L;
// Fallback configuration used when the ParseContext carries no TesseractOCRConfig.
private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
// Image media types reported by getSupportedTypes when Tesseract is available.
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<MediaType>(Arrays.asList(new MediaType[] {
MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
MediaType.image("x-ms-bmp"), MediaType.image("gif")
})));
// Cache of availability checks, keyed by the probed program path
// (configured Tesseract path + program name); filled by hasTesseract.
private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();

// When true, "hocr" is appended to the Tesseract command and the ".hocr"
// result file is read instead of the ".txt" one.
private boolean createHOCR = false;

/**
 * Creates a parser and selects the OCR output format it will use.
 *
 * @param createHOCR {@code true} to request hOCR output from Tesseract and
 *                   read the ".hocr" result file; {@code false} to read the
 *                   ".txt" result file
 */
public TesseractOCRParser(boolean createHOCR) {
this.createHOCR = createHOCR;
}

/**
 * Reports the media types this parser handles for the given context.
 *
 * @param context parse context; its {@link TesseractOCRConfig} is used when
 *                present, otherwise the default configuration
 * @return the supported image types when Tesseract could be run for that
 *         configuration, otherwise an empty set
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // Without Tesseract advertise nothing, so that the other image parsers
    // can be selected instead.
    if (!hasTesseract(ocrConfig)) {
        return Collections.emptySet();
    }

    // Tesseract is installed: offer our supported image types.
    return SUPPORTED_TYPES;
}

/**
 * Sets the {@code TESSDATA_PREFIX} environment variable of the process to be
 * started. The configured tessdata path is preferred; if it is empty the
 * Tesseract path is used; if both are empty the environment is left untouched.
 *
 * @param config configuration providing the tessdata and Tesseract paths
 * @param pb     builder of the process whose environment is adjusted
 */
private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
    Map<String, String> environment = pb.environment();

    String prefixValue = config.getTessdataPath();
    if (prefixValue.isEmpty()) {
        // No dedicated tessdata location: fall back to the Tesseract path.
        prefixValue = config.getTesseractPath();
    }

    if (!prefixValue.isEmpty()) {
        environment.put("TESSDATA_PREFIX", prefixValue);
    }
}

/**
 * Tells whether Tesseract can be run from the location named in the
 * configuration. The outcome of each probe is cached per program path, so the
 * external check runs at most once for a given path.
 *
 * @param config configuration providing the Tesseract path
 * @return {@code true} if the check of the Tesseract program succeeded
 */
private boolean hasTesseract(TesseractOCRConfig config) {
    // Where the config says Tesseract lives, including the program name.
    String programPath = config.getTesseractPath() + getTesseractProg();

    // Reuse the earlier answer if this path has been probed before.
    Boolean cachedResult = TESSERACT_PRESENT.get(programPath);
    if (cachedResult != null) {
        return cachedResult;
    }

    // Otherwise try running it and remember whether that worked.
    boolean available = ExternalParser.check(new String[] { programPath });
    TESSERACT_PRESENT.put(programPath, available);
    return available;
}

/**
 * Runs OCR on an in-memory image by rendering it to a temporary PNG file and
 * delegating to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}.
 * <p>
 * The image is drawn with a {@code null} observer, so it should already be
 * fully loaded when passed in.
 *
 * @param image    image to OCR
 * @param handler  handler receiving the extracted content
 * @param metadata metadata passed on to the stream-based parse
 * @param context  parse context passed on to the stream-based parse
 * @throws IOException   if the temporary file cannot be written or read
 * @throws SAXException  if the content handler fails
 * @throws TikaException if parsing or temporary-resource cleanup fails
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Copy the pixels of the source image into the buffer; without this
        // step the PNG written below would be an empty (all black) image.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        // Finish and close the PNG before it is read back.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }

        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Streams are already closed here, so the temporary file can be removed.
        tmp.dispose();
    }
}

/**
 * Runs Tesseract on the given stream and writes the recognised text to the
 * handler, then passes the stream on to the image metadata parser.
 * Does nothing at all when Tesseract is not available for the effective
 * configuration. OCR itself is skipped when the input size lies outside the
 * configured minimum/maximum file size.
 *
 * @param stream   image data to OCR
 * @param handler  handler receiving the OCR text and image metadata events
 * @param metadata metadata to populate
 * @param context  parse context; may carry a {@link TesseractOCRConfig}
 * @throws IOException   if reading the input or the OCR result fails
 * @throws SAXException  if the content handler fails
 * @throws TikaException if OCR times out or is interrupted
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // getSupportedTypes should not have listed us when Tesseract is missing,
    // so this only happens if someone calls this parser directly rather than
    // via DefaultParser or similar. In that case do not try to run OCR.
    if (!hasTesseract(ocrConfig)) {
        return;
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    TemporaryResources tmp = new TemporaryResources();
    File ocrResult = null;
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File imageFile = tikaStream.getFile();
        long length = tikaStream.getLength();

        if (length >= ocrConfig.getMinFileSizeToOcr() && length <= ocrConfig.getMaxFileSizeToOcr()) {
            ocrResult = tmp.createTemporaryFile();
            doOCR(imageFile, ocrResult, ocrConfig);

            // The result is looked for at the temporary path plus an
            // extension that depends on the requested output format.
            String fileExtension = createHOCR ? ".hocr" : ".txt";
            ocrResult = new File(ocrResult.getAbsolutePath() + fileExtension);

            if (ocrResult.exists()) {
                extractOutput(new FileInputStream(ocrResult), xhtml);
            }
        }

        // Temporary workaround for TIKA-1445 - until we can specify
        // composite parsers with strategies (eg Composite, Try In Turn),
        // always send the image onwards to the regular parser to have
        // the metadata for them extracted as well
        _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        tmp.dispose();
        if (ocrResult != null) {
            ocrResult.delete();
        }
    }
}
// TIKA-1445 workaround parser: receives the image after OCR so that its
// metadata is extracted as well.
private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();

/**
 * Composite of the plain image, JPEG and TIFF parsers, built on a fresh
 * {@link MediaTypeRegistry}.
 */
private static class CompositeImageParser extends CompositeParser {
    private static final long serialVersionUID = -2398203346206381382L;

    private static List<Parser> imageParsers = Arrays.<Parser>asList(
            new ImageParser(),
            new JpegParser(),
            new TiffParser());

    CompositeImageParser() {
        super(new MediaTypeRegistry(), imageParsers);
    }
}

/**
 * Runs the external tesseract-ocr process on one file and waits for it to
 * finish, for at most the configured timeout (in seconds).
 * <p>
 * The command consists of the Tesseract program, the input path, the output
 * path, {@code -l <language>}, {@code -psm <page segmentation mode>} and,
 * when hOCR output is enabled, a trailing {@code hocr}. Standard output and
 * standard error of the process are drained by background threads.
 *
 * @param input  file to be OCRed
 * @param output file to collect the OCR result
 * @param config configuration of the tesseract-ocr engine
 * @throws TikaException if the extraction timed out or the wait was interrupted
 * @throws IOException   if the process could not be started or its input
 *                       stream could not be closed
 */
private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
    List<String> command = new ArrayList<>();
    command.add(config.getTesseractPath() + getTesseractProg());
    command.add(input.getPath());
    command.add(output.getPath());
    command.add("-l");
    command.add(config.getLanguage());
    command.add("-psm");
    command.add(config.getPageSegMode());
    if (createHOCR) {
        command.add("hocr");
    }

    ProcessBuilder builder = new ProcessBuilder(command);
    setEnv(config, builder);
    final Process ocrProcess = builder.start();

    // Nothing is sent to the process; close its stdin right away.
    ocrProcess.getOutputStream().close();
    InputStream stdout = ocrProcess.getInputStream();
    InputStream stderr = ocrProcess.getErrorStream();
    logStream("OCR MSG", stdout, input);
    logStream("OCR ERROR", stderr, input);

    // Wait for the process on a separate thread so the wait can be bounded.
    FutureTask<Integer> exitCodeTask = new FutureTask<Integer>(new Callable<Integer>() {
        @Override
        public Integer call() throws Exception {
            return ocrProcess.waitFor();
        }
    });
    Thread waiter = new Thread(exitCodeTask);
    waiter.start();

    try {
        exitCodeTask.get(config.getTimeout(), TimeUnit.SECONDS);
    } catch (TimeoutException e) {
        waiter.interrupt();
        ocrProcess.destroy();
        throw new TikaException("TesseractOCRParser timeout", e);
    } catch (InterruptedException e) {
        waiter.interrupt();
        ocrProcess.destroy();
        // Keep the interrupt visible to the caller's thread.
        Thread.currentThread().interrupt();
        throw new TikaException("TesseractOCRParser interrupted", e);
    } catch (ExecutionException e) {
        // should not be thrown; reported but not propagated
        messageHandler.handleMessage("TesseractOCRParser attempting to retrive result of aborted task.", e,
                MessageType.ERROR);
    }
}

/**
 * Copies the OCR result, decoded as UTF-8, into the XHTML content handler
 * as the character content of a single {@code div} element inside a
 * document. The stream is closed once the copy loop finishes.
 *
 * @param stream stream holding the OCR result
 * @param xhtml  XHTML content handler receiving the text
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException  if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
    xhtml.startDocument();
    xhtml.startElement("div");

    try (Reader ocrText = new InputStreamReader(stream, UTF_8)) {
        char[] chunk = new char[1024];
        int count;
        while ((count = ocrText.read(chunk)) != -1) {
            if (count > 0) {
                xhtml.characters(chunk, 0, count);
            }
        }
    }

    xhtml.endElement("div");
    xhtml.endDocument();
}

/**
 * Starts a thread that reads the given standard output or error stream of a
 * process to its end, so that the process is not blocked, and writes what
 * was read to the debug log. The stream is closed once fully processed.
 * Read failures are reported through the message handler.
 *
 * @param logType label of the stream (not used in the logged message)
 * @param stream  process stream to drain
 * @param file    file being processed (not used in the logged message)
 */
private void logStream(final String logType, final InputStream stream, final File file) {
    Runnable drain = new Runnable() {
        @Override
        public void run() {
            StringBuilder collected = new StringBuilder();
            Reader reader = new InputStreamReader(stream, UTF_8);
            char[] chunk = new char[1024];
            try {
                int count;
                while ((count = reader.read(chunk)) != -1) {
                    collected.append(chunk, 0, count);
                }
            } catch (IOException e) {
                messageHandler.handleMessage("Could not read input stream.", e, MessageType.ERROR);
            } finally {
                IOUtils.closeQuietly(stream);
            }

            LogFactory.getLog(TesseractOCRParser.class).debug(collected.toString());
        }
    };
    new Thread(drain).start();
}

/**
 * Returns the file name of the Tesseract executable for the current
 * operating system, as determined from the {@code os.name} system property.
 *
 * @return {@code "tesseract.exe"} when the OS name starts with "Windows",
 *         otherwise {@code "tesseract"}
 */
static String getTesseractProg() {
    boolean onWindows = System.getProperty("os.name").startsWith("Windows");
    if (onWindows) {
        return "tesseract.exe";
    }
    return "tesseract";
}

public String[] getTessLangs(IPropertiesManager propertyManager) {
String tesseractBin = propertyManager.getProperty(Properties.TESSERACT_BIN_FOLDER);
String command = tesseractBin + "/tesseract --list-langs";
Process proc;
BufferedReader reader;
String output = "";
String[] lang_list;
String[] languages;
try {
// Handler through which this class reports errors (see the handleMessage calls
// made with MessageType.ERROR elsewhere in this class).
// NOTE(review): @Autowired only takes effect when Spring injects this instance;
// the public constructor suggests instances may be created manually, in which
// case this field would stay null — confirm how instances are built.
@Autowired
private ISystemMessageHandler messageHandler;

private static final long serialVersionUID = -8167538283213097265L;
// Fallback configuration used when the ParseContext carries no TesseractOCRConfig.
private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
// Image media types reported by getSupportedTypes when Tesseract is available.
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<MediaType>(Arrays.asList(new MediaType[] { MediaType.image("png"), MediaType.image("jpeg"),
MediaType.image("tiff"), MediaType.image("x-ms-bmp"), MediaType.image("gif") })));
// Cache of availability checks, keyed by the probed program path
// (configured Tesseract path + program name); filled by hasTesseract.
private static Map<String, Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();

// When true, "hocr" is appended to the Tesseract command and the ".hocr"
// result file is read instead of the ".txt" one.
private boolean createHOCR = false;

/**
 * Creates a parser and selects the OCR output format it will use.
 *
 * @param createHOCR {@code true} to request hOCR output from Tesseract and
 *                   read the ".hocr" result file; {@code false} to read the
 *                   ".txt" result file
 */
public TesseractOCRParser(boolean createHOCR) {
this.createHOCR = createHOCR;
}

/**
 * Reports the media types this parser handles for the given context.
 *
 * @param context parse context; its {@link TesseractOCRConfig} is used when
 *                present, otherwise the default configuration
 * @return the supported image types when Tesseract could be run for that
 *         configuration, otherwise an empty set
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // Without Tesseract advertise nothing, so that the other image parsers
    // can be selected instead.
    if (!hasTesseract(ocrConfig)) {
        return Collections.emptySet();
    }

    // Tesseract is installed: offer our supported image types.
    return SUPPORTED_TYPES;
}

/**
 * Sets the {@code TESSDATA_PREFIX} environment variable of the process to be
 * started. The configured tessdata path is preferred; if it is empty the
 * Tesseract path is used; if both are empty the environment is left untouched.
 *
 * @param config configuration providing the tessdata and Tesseract paths
 * @param pb     builder of the process whose environment is adjusted
 */
private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
    Map<String, String> environment = pb.environment();

    String prefixValue = config.getTessdataPath();
    if (prefixValue.isEmpty()) {
        // No dedicated tessdata location: fall back to the Tesseract path.
        prefixValue = config.getTesseractPath();
    }

    if (!prefixValue.isEmpty()) {
        environment.put("TESSDATA_PREFIX", prefixValue);
    }
}

/**
 * Tells whether Tesseract can be run from the location named in the
 * configuration. The outcome of each probe is cached per program path, so the
 * external check runs at most once for a given path.
 *
 * @param config configuration providing the Tesseract path
 * @return {@code true} if the check of the Tesseract program succeeded
 */
private boolean hasTesseract(TesseractOCRConfig config) {
    // Where the config says Tesseract lives, including the program name.
    String programPath = config.getTesseractPath() + getTesseractProg();

    // Reuse the earlier answer if this path has been probed before.
    Boolean cachedResult = TESSERACT_PRESENT.get(programPath);
    if (cachedResult != null) {
        return cachedResult;
    }

    // Otherwise try running it and remember whether that worked.
    boolean available = ExternalParser.check(new String[] { programPath });
    TESSERACT_PRESENT.put(programPath, available);
    return available;
}

/**
 * Runs OCR on an in-memory image by rendering it to a temporary PNG file and
 * delegating to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}.
 * <p>
 * The image is drawn with a {@code null} observer, so it should already be
 * fully loaded when passed in.
 *
 * @param image    image to OCR
 * @param handler  handler receiving the extracted content
 * @param metadata metadata passed on to the stream-based parse
 * @param context  parse context passed on to the stream-based parse
 * @throws IOException   if the temporary file cannot be written or read
 * @throws SAXException  if the content handler fails
 * @throws TikaException if parsing or temporary-resource cleanup fails
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Copy the pixels of the source image into the buffer; without this
        // step the PNG written below would be an empty (all black) image.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        // Finish and close the PNG before it is read back.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }

        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Streams are already closed here, so the temporary file can be removed.
        tmp.dispose();
    }
}

/**
 * Runs Tesseract on the given stream and writes the recognised text to the
 * handler, then passes the stream on to the image metadata parser.
 * Does nothing at all when Tesseract is not available for the effective
 * configuration. OCR itself is skipped when the input size lies outside the
 * configured minimum/maximum file size.
 *
 * @param stream   image data to OCR
 * @param handler  handler receiving the OCR text and image metadata events
 * @param metadata metadata to populate
 * @param context  parse context; may carry a {@link TesseractOCRConfig}
 * @throws IOException   if reading the input or the OCR result fails
 * @throws SAXException  if the content handler fails
 * @throws TikaException if OCR times out or is interrupted
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // getSupportedTypes should not have listed us when Tesseract is missing,
    // so this only happens if someone calls this parser directly rather than
    // via DefaultParser or similar. In that case do not try to run OCR.
    if (!hasTesseract(ocrConfig)) {
        return;
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    TemporaryResources tmp = new TemporaryResources();
    File ocrResult = null;
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File imageFile = tikaStream.getFile();
        long length = tikaStream.getLength();

        if (length >= ocrConfig.getMinFileSizeToOcr() && length <= ocrConfig.getMaxFileSizeToOcr()) {
            ocrResult = tmp.createTemporaryFile();
            doOCR(imageFile, ocrResult, ocrConfig);

            // The result is looked for at the temporary path plus an
            // extension that depends on the requested output format.
            String fileExtension = createHOCR ? ".hocr" : ".txt";
            ocrResult = new File(ocrResult.getAbsolutePath() + fileExtension);

            if (ocrResult.exists()) {
                extractOutput(new FileInputStream(ocrResult), xhtml);
            }
        }

        // Temporary workaround for TIKA-1445 - until we can specify
        // composite parsers with strategies (eg Composite, Try In Turn),
        // always send the image onwards to the regular parser to have
        // the metadata for them extracted as well
        _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        tmp.dispose();
        if (ocrResult != null) {
            ocrResult.delete();
        }
    }
}

// TIKA-1445 workaround parser: receives the image after OCR so that its
// metadata is extracted as well.
private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();

/**
 * Composite of the plain image, JPEG and TIFF parsers, built on a fresh
 * {@link MediaTypeRegistry}.
 */
private static class CompositeImageParser extends CompositeParser {
    private static final long serialVersionUID = -2398203346206381382L;

    private static List<Parser> imageParsers = Arrays.<Parser>asList(
            new ImageParser(),
            new JpegParser(),
            new TiffParser());

    CompositeImageParser() {
        super(new MediaTypeRegistry(), imageParsers);
    }
}

/**
 * Runs the external tesseract-ocr process on one file and waits for it to
 * finish, for at most the configured timeout (in seconds).
 * <p>
 * The command consists of the Tesseract program, the input path, the output
 * path, {@code -l <language>}, {@code -psm <page segmentation mode>} and,
 * when hOCR output is enabled, a trailing {@code hocr}. Standard output and
 * standard error of the process are drained by background threads.
 *
 * @param input  file to be OCRed
 * @param output file to collect the OCR result
 * @param config configuration of the tesseract-ocr engine
 * @throws TikaException if the extraction timed out or the wait was interrupted
 * @throws IOException   if the process could not be started or its input
 *                       stream could not be closed
 */
private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
    List<String> command = new ArrayList<>();
    command.add(config.getTesseractPath() + getTesseractProg());
    command.add(input.getPath());
    command.add(output.getPath());
    command.add("-l");
    command.add(config.getLanguage());
    command.add("-psm");
    command.add(config.getPageSegMode());
    if (createHOCR) {
        command.add("hocr");
    }

    ProcessBuilder builder = new ProcessBuilder(command);
    setEnv(config, builder);
    final Process ocrProcess = builder.start();

    // Nothing is sent to the process; close its stdin right away.
    ocrProcess.getOutputStream().close();
    InputStream stdout = ocrProcess.getInputStream();
    InputStream stderr = ocrProcess.getErrorStream();
    logStream("OCR MSG", stdout, input);
    logStream("OCR ERROR", stderr, input);

    // Wait for the process on a separate thread so the wait can be bounded.
    FutureTask<Integer> exitCodeTask = new FutureTask<Integer>(new Callable<Integer>() {
        @Override
        public Integer call() throws Exception {
            return ocrProcess.waitFor();
        }
    });
    Thread waiter = new Thread(exitCodeTask);
    waiter.start();

    try {
        exitCodeTask.get(config.getTimeout(), TimeUnit.SECONDS);
    } catch (TimeoutException e) {
        waiter.interrupt();
        ocrProcess.destroy();
        throw new TikaException("TesseractOCRParser timeout", e);
    } catch (InterruptedException e) {
        waiter.interrupt();
        ocrProcess.destroy();
        // Keep the interrupt visible to the caller's thread.
        Thread.currentThread().interrupt();
        throw new TikaException("TesseractOCRParser interrupted", e);
    } catch (ExecutionException e) {
        // should not be thrown; reported but not propagated
        messageHandler.handleMessage("TesseractOCRParser attempting to retrive result of aborted task.", e,
                MessageType.ERROR);
    }
}

/**
 * Copies the OCR result, decoded as UTF-8, into the XHTML content handler
 * as the character content of a single {@code div} element inside a
 * document. The stream is closed once the copy loop finishes.
 *
 * @param stream stream holding the OCR result
 * @param xhtml  XHTML content handler receiving the text
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException  if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
    xhtml.startDocument();
    xhtml.startElement("div");

    try (Reader ocrText = new InputStreamReader(stream, UTF_8)) {
        char[] chunk = new char[1024];
        int count;
        while ((count = ocrText.read(chunk)) != -1) {
            if (count > 0) {
                xhtml.characters(chunk, 0, count);
            }
        }
    }

    xhtml.endElement("div");
    xhtml.endDocument();
}

/**
 * Starts a thread that reads the given standard output or error stream of a
 * process to its end, so that the process is not blocked, and writes what
 * was read to the debug log. The stream is closed once fully processed.
 * Read failures are reported through the message handler.
 *
 * @param logType label of the stream (not used in the logged message)
 * @param stream  process stream to drain
 * @param file    file being processed (not used in the logged message)
 */
private void logStream(final String logType, final InputStream stream, final File file) {
    Runnable drain = new Runnable() {
        @Override
        public void run() {
            StringBuilder collected = new StringBuilder();
            Reader reader = new InputStreamReader(stream, UTF_8);
            char[] chunk = new char[1024];
            try {
                int count;
                while ((count = reader.read(chunk)) != -1) {
                    collected.append(chunk, 0, count);
                }
            } catch (IOException e) {
                messageHandler.handleMessage("Could not read input stream.", e, MessageType.ERROR);
            } finally {
                IOUtils.closeQuietly(stream);
            }

            LogFactory.getLog(TesseractOCRParser.class).debug(collected.toString());
        }
    };
    new Thread(drain).start();
}

/**
 * Returns the file name of the Tesseract executable for the current
 * operating system, as determined from the {@code os.name} system property.
 *
 * @return {@code "tesseract.exe"} when the OS name starts with "Windows",
 *         otherwise {@code "tesseract"}
 */
static String getTesseractProg() {
    boolean onWindows = System.getProperty("os.name").startsWith("Windows");
    if (onWindows) {
        return "tesseract.exe";
    }
    return "tesseract";
}

public String[] getTessLangs(IPropertiesManager propertyManager) {
String tesseractBin = propertyManager.getProperty(Properties.TESSERACT_BIN_FOLDER);
String command = tesseractBin + "/tesseract --list-langs";
Process proc;
BufferedReader reader;
String output = "";
String[] lang_list;
String[] languages;
try {
proc = Runtime.getRuntime().exec(command);
reader = new BufferedReader(new InputStreamReader(proc.getInputStream()));
reader = new BufferedReader(new InputStreamReader(proc.getInputStream()));
String line = "";
while((line = reader.readLine()) != null) {
output = output + line + " ";
}
proc.waitFor();
while ((line = reader.readLine()) != null) {
output = output + line + " ";
}
proc.waitFor();
} catch (IOException e) {
messageHandler.handleMessage("Error while getting Tesserract languages.", e, MessageType.ERROR);
messageHandler.handleMessage("Error while getting Tesserract languages.", e, MessageType.ERROR);
} catch (InterruptedException e) {
messageHandler.handleMessage("Error while getting Tesserract languages.", e, MessageType.ERROR);
messageHandler.handleMessage("Error while getting Tesserract languages.", e, MessageType.ERROR);
}
lang_list = output.split(":");
languages = lang_list[1].split(" ");
return languages;
}
lang_list = output.split(":");
languages = lang_list[1].split(" ");
return languages;
}

}