Skip to content

Commit

Permalink
tesseract: Replace ProcessPoolExecutor with list of Popened processes
Browse files Browse the repository at this point in the history
  • Loading branch information
jbaiter committed Dec 30, 2013
1 parent 7bccde7 commit 1cd6e70
Showing 1 changed file with 26 additions and 9 deletions.
35 changes: 26 additions & 9 deletions spreadsplug/tesseract.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import logging
import multiprocessing
import os
import re
import subprocess
import time
import xml.etree.cElementTree as ET

from concurrent import futures

from spreads.plugin import HookPlugin, PluginOption
from spreads.util import find_in_path, MissingDependencyException
from spreads.vendor.pathlib import Path
Expand Down Expand Up @@ -37,17 +38,33 @@ def configuration_template(cls):
return conf

def process(self, path):
def _clean_processes(procs):
    """Drop finished child processes from *procs*, in place.

    :param procs: list of objects exposing ``poll()`` (e.g.
                  ``subprocess.Popen`` instances). ``poll()`` returning
                  ``None`` means the process is still running; any other
                  value (including exit code ``0``) means it has exited.
    :returns: ``None``; the list passed in is mutated.
    """
    # Operate on the argument rather than on a variable captured from the
    # enclosing scope, so the helper works for whichever list it is given.
    # Slice assignment keeps the caller's list object identical while
    # avoiding removal from a list that is being iterated.
    procs[:] = [p for p in procs if p.poll() is None]

ocr_lang = self.config['language'].get()
logger.info("Performing OCR")
logger.info("Language is \"{0}\"".format(ocr_lang))
img_dir = path / 'done'
with futures.ProcessPoolExecutor() as executor:
for img in img_dir.glob('*.tif'):
executor.submit(
subprocess.check_output,
["tesseract", unicode(img), unicode(img_dir / img.stem),
"-l", ocr_lang, "hocr"], stderr=subprocess.STDOUT
)

processes = []
max_procs = multiprocessing.cpu_count()
FNULL = open(os.devnull, 'w')
for img in img_dir.glob('*.tif'):
# Wait until another process has finished
while len(processes) >= max_procs:
_clean_processes(processes)
time.sleep(0.01)
proc = subprocess.Popen(["tesseract", unicode(img),
unicode(img_dir / img.stem), "-l",
ocr_lang, "hocr"], stderr=FNULL,
stdout=FNULL)
processes.append(proc)
# Wait for remaining processes to finish
while processes:
_clean_processes(processes)

# NOTE: This modifies the hOCR files to make them compatible with
# pdfbeads.
# See the following bugreport for more information:
Expand Down

0 comments on commit 1cd6e70

Please sign in to comment.