Skip to content

Commit

Permalink
switching to pipe.communicate. fixes #33
Browse files Browse the repository at this point in the history
  • Loading branch information
Dean Malmgren committed Aug 9, 2014
1 parent cd4cd57 commit 48b7b32
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 13 deletions.
6 changes: 6 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ latest changes in development

[will add changes here as they are made]

* several bug fixes, including:

* documentation fixes

* shell commands hanging on large files (`#33`_)

0.5.0
-----

Expand Down
4 changes: 2 additions & 2 deletions textract/parsers/doc_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
def extract(filename, **kwargs):
"""Extract text from doc files using antiword.
"""
pipe = run('antiword %(filename)s' % locals())
return pipe.stdout.read()
stdout, stderr = run('antiword %(filename)s' % locals())
return stdout
8 changes: 4 additions & 4 deletions textract/parsers/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ def extract(filename, method='', **kwargs):

def extract_pdftotext(filename):
"""Extract text from pdfs using the pdftotext command line utility."""
pipe = run('pdftotext %(filename)s -' % locals())
return pipe.stdout.read()
stdout, stderr = run('pdftotext %(filename)s -' % locals())
return stdout


def extract_pdfminer(filename):
"""Extract text from pdfs using pdfminer."""
pipe = run('pdf2txt.py %(filename)s' % locals())
return pipe.stdout.read()
stdout, stderr = run('pdf2txt.py %(filename)s' % locals())
return stdout
4 changes: 2 additions & 2 deletions textract/parsers/ps_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
def extract(filename, **kwargs):
"""Extract text from postscript files using pstotext command.
"""
pipe = run('pstotext %(filename)s' % locals())
return pipe.stdout.read()
stdout, stderr = run('pstotext %(filename)s' % locals())
return stdout
7 changes: 4 additions & 3 deletions textract/parsers/tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ def extract(filename, **kwargs):
"""Extract text from various image file formats using tesseract-ocr"""
# Tesseract can't output to console directly so you must first create
# a dummy file to write to, read, and then delete
pipe = run(
stdout, stderr = run(
'tesseract %(filename)s tmpout && cat tmpout.txt && rm -f tmpout.txt'
% locals())
return pipe.stdout.read()
% locals()
)
return stdout
7 changes: 5 additions & 2 deletions textract/shell.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@ def run(command):
command, shell=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
)
pipe.wait()

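# pipe.wait() can deadlock when stdout/stderr are pipes: on large files
# the child blocks once the OS pipe buffer fills and never exits.
# pipe.communicate() reads both pipes while waiting, avoiding the hang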
stdout, stderr = pipe.communicate()

# if the command exited with a non-zero return code, raise an error
# (unlike Fabric)
if pipe.returncode != 0:
raise exceptions.ShellError(command, pipe.returncode)

return pipe
return stdout, stderr

0 comments on commit 48b7b32

Please sign in to comment.