Created a download argument for running the client #16

Open · wants to merge 1 commit into base: master
51 changes: 35 additions & 16 deletions grobid-client.py
@@ -17,11 +17,13 @@
slightly sub-optimal, but should scale better. However, acquiring a list of millions of files in directories would
require something scalable too, which is not implemented for the moment.
'''

class grobid_client(ApiClient):

def __init__(self, config_path='./config.json'):
self.config = None
self._load_config(config_path)
self.cache = []

def _load_config(self, path='./config.json'):
"""
@@ -43,7 +45,7 @@ def _load_config(self, path='./config.json'):
else:
print("GROBID server is up and running")

def process(self, input, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates):
def process(self, input, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download):
batch_size_pdf = self.config['batch_size']
pdf_files = []

@@ -53,21 +55,29 @@ def process(self, input, output, n, service, generateIDs, consolidate_header, co
pdf_files.append(os.sep.join([dirpath, filename]))

if len(pdf_files) == batch_size_pdf:
self.process_batch(pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates)
self.process_batch(pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download)
pdf_files = []

# last batch
if len(pdf_files) > 0:
self.process_batch(pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates)
self.process_batch(pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download)


def factory_wrapper(self, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download):
return lambda item: self.process_pdf(item, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download)

def process_batch(self, pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates):

def process_batch(self, pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download):
print(len(pdf_files), "PDF files to process")
#with concurrent.futures.ThreadPoolExecutor(max_workers=n) as executor:
with concurrent.futures.ProcessPoolExecutor(max_workers=n) as executor:
futures = []
for pdf_file in pdf_files:
executor.submit(self.process_pdf, pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates)
futures.append(executor.submit(self.process_pdf, pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download))
for future in concurrent.futures.as_completed(futures):
self.cache.append(future.result())


def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates):
def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download):
# check if TEI file is already produced
# we use ntpath here to be sure it will work on Windows too
pdf_file_name = ntpath.basename(pdf_file)
@@ -109,6 +119,8 @@ def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header
the_data['includeRawAffiliations'] = '1'
if teiCoordinates:
the_data['teiCoordinates'] = self.config['coordinates']
if download:
the_data['download'] = '1'

res, status = self.post(
url=the_url,
@@ -119,17 +131,22 @@ def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header

if status == 503:
time.sleep(self.config['sleep_time'])
return self.process_pdf(pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates)
return self.process_pdf(pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download)
elif status != 200:
print('Processing failed with error ' + str(status))
else:
# writing TEI file
try:
with io.open(filename,'w',encoding='utf8') as tei_file:
tei_file.write(res.text)
except OSError:
print ("Writing resulting TEI XML file %s failed" % filename)
pass
if download:
# writing TEI file
try:
with io.open(filename,'w',encoding='utf8') as tei_file:
tei_file.write(res.text)
except OSError:
print ("Writing resulting TEI XML file %s failed" % filename)
pass
else:
print("Saving to cache")
return (pdf_file_name, pdf_file, res.text)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description = "Client for GROBID services")
@@ -145,6 +162,7 @@ def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header
parser.add_argument("--include_raw_affiliations", action='store_true', help="call GROBID requestiong the extraciton of raw affiliations")
parser.add_argument("--force", action='store_true', help="force re-processing pdf input files when tei output files already exist")
parser.add_argument("--teiCoordinates", action='store_true', help="add the original PDF coordinates (bounding boxes) to the extracted elements")
parser.add_argument("--download", action='store_true', help="1 to download the XML files, 0 to return them locally")

args = parser.parse_args()

@@ -178,12 +196,13 @@ def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header
include_raw_affiliations = args.include_raw_affiliations
force = args.force
teiCoordinates = args.teiCoordinates
download = args.download

client = grobid_client(config_path=config_path)

start_time = time.time()

client.process(input_path, output_path, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates)
client.process(input_path, output_path, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download)

runtime = round(time.time() - start_time, 3)
print("runtime: %s seconds " % (runtime))
27 changes: 27 additions & 0 deletions test-cache.py
@@ -0,0 +1,27 @@
'''
Recursively apply GROBID to the PDFs present in a file tree via the grobid client and save the output XMLs in a cache without downloading them locally.
'''

import os
import re
import json
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import subprocess
import xml.etree.ElementTree as ET

grobid = __import__('grobid-client')

if __name__ == '__main__':

client = grobid.grobid_client(config_path="./config.json")
input_path = "/mnt/data/covid/data/"
for root, _, _ in os.walk(input_path):
client.process(root, root, 10, "processFulltextDocument", False, 1, 0, True, True, True, False, False)
print(root)

# client.cache is a list of (file name, file path, XML output string) tuples
print(client.cache)
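
A minimal sketch of consuming the cached entries afterwards, assuming each entry is the (file name, file path, TEI XML string) tuple returned by process_pdf; the TEI namespace and element path below are the usual ones for GROBID output but are assumptions here, not part of this PR:

import xml.etree.ElementTree as ET

TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}  # assumed TEI namespace

def titles_from_cache(cache):
    # cache entries: (pdf_file_name, pdf_file_path, tei_xml_string)
    for name, _, xml_text in cache:
        root = ET.fromstring(xml_text)
        title = root.find('.//tei:titleStmt/tei:title', TEI_NS)
        yield name, title.text if title is not None else None

for name, title in titles_from_cache(client.cache):
    print(name, title)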

4 changes: 2 additions & 2 deletions test.py
@@ -1,5 +1,5 @@
'''
Recursively apply GROBID to the PDF present in a file tree via the grobid client.
Recursively apply GROBID to the PDFs present in a file tree via the grobid client and download the output XMLs locally.
'''

import os
@@ -10,6 +10,6 @@
client = grobid.grobid_client(config_path="./config.json")
input_path = "/mnt/data/covid/data/"
for root, _, _ in os.walk(input_path):
client.process(root, root, 10, "processFulltextDocument", False, 1, 0, True, True, True, False)
client.process(root, root, 10, "processFulltextDocument", False, 1, 0, True, True, True, False, True)
print(root)