Merge pull request #19 from kermitt2/issue_#18
issue #18
kermitt2 authored Jan 14, 2021
2 parents 598edb5 + 983fb6f commit 4ae10a1
Showing 2 changed files with 40 additions and 21 deletions.
4 changes: 3 additions & 1 deletion Readme.md
@@ -27,7 +27,7 @@ usage: grobid_client.py [-h] [--input INPUT] [--output OUTPUT]
[--config CONFIG] [--n N] [--generateIDs]
[--consolidate_header] [--consolidate_citations]
[--include_raw_citations] [--include_raw_affiliations]
[--force] [--teiCoordinates]
[--force] [--teiCoordinates] [--verbose]
service
Client for GROBID services
@@ -59,6 +59,8 @@ optional arguments:
files already exist
--teiCoordinates add the original PDF coordinates (bounding boxes) to
the extracted elements
--verbose print information about processed files in the console
```

Examples:
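For illustration, a run of the client using the new `--verbose` flag might look like the following; the input and output paths are hypothetical, and the service name is one of those listed in the usage text above:

```
python3 grobid_client.py --input ./input_pdfs --output ./output_tei --verbose processFulltextDocument
```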
57 changes: 37 additions & 20 deletions grobid_client.py
@@ -11,12 +11,12 @@

'''
This version uses the standard ProcessPoolExecutor for parallelizing the concurrent calls to the GROBID services.
Given the limits of ThreadPoolExecutor (input stored in memory, blocking Executor.map until the whole input
is acquired), it works with batches of PDF of a size indicated in the config.json file (default is 1000 entries).
We are moving from first batch to the second one only when the first is entirely processed - which means it is
slightly sub-optimal, but should scale better. Working without batch would mean acquiring a list of million of
files in directories would require something scalable too (e.g. done in a separate thread), which is not
implemented for the moment.
Given the limits of ThreadPoolExecutor (the legendary GIL, input stored in memory, Executor.map blocking until
the whole input is acquired), ProcessPoolExecutor works with batches of PDFs of a size indicated in the config.json
file (default is 1000 entries). We move from the first batch to the second only when the first is entirely
processed - which is slightly sub-optimal, but should scale better. Working without batches would mean that
acquiring a list of millions of files from directories would itself need something scalable (e.g. done in a
separate thread), which is not implemented for the moment and possibly not implementable in Python as long as it uses the GIL.
'''
class grobid_client(ApiClient):
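As a rough, self-contained sketch of the batching pattern the docstring describes (the worker function and the batch-size constant are made up for illustration, not the client's own code):

```python
import concurrent.futures

BATCH_SIZE = 1000  # mirrors the config.json default mentioned above

def process_one(pdf_path):
    # stand-in for the real per-PDF call to a GROBID service
    return pdf_path

def run_batch(batch, n_workers):
    # one pool per batch: leaving the 'with' block waits for every
    # submitted task, so the next batch starts only when this one is done
    with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
        for pdf_path in batch:
            executor.submit(process_one, pdf_path)

def process_in_batches(pdf_paths, n_workers=10):
    batch = []
    for pdf_path in pdf_paths:
        batch.append(pdf_path)
        if len(batch) == BATCH_SIZE:
            run_batch(batch, n_workers)
            batch = []
    if batch:  # last, possibly partial batch
        run_batch(batch, n_workers)
```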

@@ -36,7 +36,12 @@ def _load_config(self, path='./config.json'):
if len(self.config['grobid_port'])>0:
the_url += ":"+self.config['grobid_port']
the_url += "/api/isalive"
r = requests.get(the_url)
try:
r = requests.get(the_url)
except requests.exceptions.RequestException:
print('GROBID server does not appear to be up and running; the connection to the server failed')
exit(1)

status = r.status_code

if status != 200:
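The guarded connectivity check above can be reproduced stand-alone; a minimal sketch, assuming a local GROBID instance on the default port 8070 (in the client itself the host and port come from config.json):

```python
import requests

def grobid_is_alive(base_url="http://localhost:8070"):
    # any connection-level failure counts as "server not up",
    # mirroring the fail-fast behaviour of _load_config above
    try:
        r = requests.get(base_url + "/api/isalive")
    except requests.exceptions.RequestException:
        return False
    return r.status_code == 200
```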
Expand All @@ -53,35 +58,39 @@ def process(self, service, input_path,
include_raw_citations=False,
include_raw_affiliations=False,
teiCoordinates=False,
force=True):
force=True,
verbose=False):
batch_size_pdf = self.config['batch_size']
pdf_files = []

print(input_path)

for (dirpath, dirnames, filenames) in os.walk(input_path):
print(dirpath, dirnames, filenames)
for filename in filenames:
if filename.endswith('.pdf') or filename.endswith('.PDF'):
print(filename)
if verbose:
try:
print(filename)
except Exception:
# may happen on linux see https://stackoverflow.com/questions/27366479/python-3-os-walk-file-paths-unicodeencodeerror-utf-8-codec-cant-encode-s
pass
pdf_files.append(os.sep.join([dirpath, filename]))

if len(pdf_files) == batch_size_pdf:
self.process_batch(service, pdf_files, input_path, output, n, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, teiCoordinates, force)
self.process_batch(service, pdf_files, input_path, output, n, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, teiCoordinates, force, verbose)
pdf_files = []

# last batch
if len(pdf_files) > 0:
self.process_batch(service, pdf_files, input_path, output, n, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, teiCoordinates, force)
self.process_batch(service, pdf_files, input_path, output, n, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, teiCoordinates, force, verbose)
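The verbose print inside the walk loop above is wrapped in a try/except because filenames yielded by os.walk can fail to encode for the console; a tiny standalone illustration of the same defensive pattern (the surrogate-laden name is contrived):

```python
def safe_print(name):
    # printing a path containing lone surrogates (as os.walk can yield
    # on Linux) may raise UnicodeEncodeError; skip it instead of crashing
    try:
        print(name)
    except UnicodeEncodeError:
        pass

safe_print("normal.pdf")
safe_print("weird-\udce9.pdf")  # undecodable byte surfaced as a surrogate
```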

def process_batch(self, service, pdf_files, input_path, output, n, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, teiCoordinates, force):
print(len(pdf_files), "PDF files to process")
def process_batch(self, service, pdf_files, input_path, output, n, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, teiCoordinates, force, verbose=False):
if verbose:
print(len(pdf_files), "PDF files to process in current batch")
#with concurrent.futures.ThreadPoolExecutor(max_workers=n) as executor:
with concurrent.futures.ProcessPoolExecutor(max_workers=n) as executor:
for pdf_file in pdf_files:
executor.submit(self.process_pdf, service, pdf_file, input_path, output, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, teiCoordinates, force)
executor.submit(self.process_pdf, service, pdf_file, input_path, output, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, teiCoordinates, force, verbose)

def process_pdf(self, service, pdf_file, input_path, output, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, teiCoordinates, force):
def process_pdf(self, service, pdf_file, input_path, output, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, teiCoordinates, force, verbose=False):
# check if TEI file is already produced
# we use ntpath here to be sure it will work on Windows too
if output is not None:
@@ -95,7 +104,6 @@ def process_pdf(self, service, pdf_file, input_path, output, generateIDs, consol
print(filename, "already exists, skipping... (use --force to reprocess pdf input files)")
return

print(pdf_file)
files = {
'input': (
pdf_file,
@@ -148,6 +156,8 @@ def process_pdf(self, service, pdf_file, input_path, output, generateIDs, consol
pass

if __name__ == "__main__":
valid_services = ["processFulltextDocument", "processHeaderDocument", "processReferences"]

parser = argparse.ArgumentParser(description = "Client for GROBID services")
parser.add_argument("service", help="one of [processFulltextDocument, processHeaderDocument, processReferences]")
parser.add_argument("--input", default=None, help="path to the directory containing PDF to process")
@@ -161,6 +171,7 @@ def process_pdf(self, service, pdf_file, input_path, output, generateIDs, consol
parser.add_argument("--include_raw_affiliations", action='store_true', help="call GROBID requestiong the extraciton of raw affiliations")
parser.add_argument("--force", action='store_true', help="force re-processing pdf input files when tei output files already exist")
parser.add_argument("--teiCoordinates", action='store_true', help="add the original PDF coordinates (bounding boxes) to the extracted elements")
parser.add_argument("--verbose", action='store_true', help="print information about processed files in the console")

args = parser.parse_args()

@@ -193,6 +204,11 @@ def process_pdf(self, service, pdf_file, input_path, output, generateIDs, consol
include_raw_affiliations = args.include_raw_affiliations
force = args.force
teiCoordinates = args.teiCoordinates
verbose = args.verbose

if service is None or service not in valid_services:
print("Missing or invalid service, must be one of", valid_services)
exit(1)

client = grobid_client(config_path=config_path)

@@ -207,7 +223,8 @@ def process_pdf(self, service, pdf_file, input_path, output, generateIDs, consol
include_raw_citations=include_raw_citations,
include_raw_affiliations=include_raw_affiliations,
teiCoordinates=teiCoordinates,
force=force)
force=force,
verbose=verbose)

runtime = round(time.time() - start_time, 3)
print("runtime: %s seconds " % (runtime))
