From 802ffd415c4210d8d4775a31a0831172d4018a05 Mon Sep 17 00:00:00 2001
From: amenshikova
Date: Thu, 17 Dec 2020 15:07:45 -0800
Subject: [PATCH] Added a download argument

---
 grobid-client.py | 51 +++++++++++++++++++++++++++++++++---------------
 test-cache.py    | 27 +++++++++++++++++++++++++
 test.py          |  4 ++--
 3 files changed, 64 insertions(+), 18 deletions(-)
 create mode 100644 test-cache.py

diff --git a/grobid-client.py b/grobid-client.py
index 59d44ea..f9aa752 100644
--- a/grobid-client.py
+++ b/grobid-client.py
@@ -17,11 +17,13 @@ slightly sub-optimal, but should scale better. However acquiring a list of
 million of files in directories would require something scalable too, which
 is not implemented for the moment.
 '''
+
 class grobid_client(ApiClient):
 
     def __init__(self, config_path='./config.json'):
         self.config = None
         self._load_config(config_path)
+        self.cache = []
 
     def _load_config(self, path='./config.json'):
         """
@@ -43,7 +45,7 @@ def _load_config(self, path='./config.json'):
         else:
             print("GROBID server is up and running")
 
-    def process(self, input, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates):
+    def process(self, input, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download):
         batch_size_pdf = self.config['batch_size']
         pdf_files = []
 
@@ -53,21 +55,29 @@ def process(self, input, output, n, service, generateIDs, consolidate_header, co
                     pdf_files.append(os.sep.join([dirpath, filename]))
 
                     if len(pdf_files) == batch_size_pdf:
-                        self.process_batch(pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates)
+                        self.process_batch(pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download)
                         pdf_files = []
 
         # last batch
         if len(pdf_files) > 0:
-            self.process_batch(pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates)
+            self.process_batch(pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download)
+
+
+    def factory_wrapper(self, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download):
+        return lambda item: self.process_pdf(item, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download)
 
-    def process_batch(self, pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates):
+
+    def process_batch(self, pdf_files, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download):
         print(len(pdf_files), "PDF files to process")
 
-        #with concurrent.futures.ThreadPoolExecutor(max_workers=n) as executor:
         with concurrent.futures.ProcessPoolExecutor(max_workers=n) as executor:
+            futures = []
             for pdf_file in pdf_files:
-                executor.submit(self.process_pdf, pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates)
+                futures.append(executor.submit(self.process_pdf, pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download))
+            for future in concurrent.futures.as_completed(futures):
+                if (result := future.result()) is not None: self.cache.append(result)
+
 
-    def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates):
+    def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download):
         # check if TEI file is already produced
         # we use ntpath here to be sure it will work on Windows too
         pdf_file_name = ntpath.basename(pdf_file)
@@ -109,6 +119,8 @@ def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header
             the_data['includeRawAffiliations'] = '1'
         if teiCoordinates:
             the_data['teiCoordinates'] = self.config['coordinates']
+        if download:
+            the_data['download'] = '1'
 
         res, status = self.post(
             url=the_url,
@@ -119,17 +131,22 @@ def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header
 
         if status == 503:
             time.sleep(self.config['sleep_time'])
-            return self.process_pdf(pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates)
+            return self.process_pdf(pdf_file, output, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download)
         elif status != 200:
             print('Processing failed with error ' + str(status))
         else:
-            # writing TEI file
-            try:
-                with io.open(filename,'w',encoding='utf8') as tei_file:
-                    tei_file.write(res.text)
-            except OSError:
-                print ("Writing resulting TEI XML file %s failed" % filename)
-                pass
+            if download:
+                # writing TEI file
+                try:
+                    with io.open(filename,'w',encoding='utf8') as tei_file:
+                        tei_file.write(res.text)
+                except OSError:
+                    print ("Writing resulting TEI XML file %s failed" % filename)
+                    pass
+            else:
+                print("Saving to cache")
+                return (pdf_file_name, pdf_file, res.text)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description = "Client for GROBID services")
@@ -145,6 +162,7 @@ def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header
     parser.add_argument("--include_raw_affiliations", action='store_true', help="call GROBID requestiong the extraciton of raw affiliations")
     parser.add_argument("--force", action='store_true', help="force re-processing pdf input files when tei output files already exist")
     parser.add_argument("--teiCoordinates", action='store_true', help="add the original PDF coordinates (bounding boxes) to the extracted elements")
+    parser.add_argument("--download", action='store_true', help="write the resulting TEI XML files to the output directory; if omitted, results are kept in the client's in-memory cache")
 
     args = parser.parse_args()
 
@@ -178,12 +196,13 @@ def process_pdf(self, pdf_file, output, service, generateIDs, consolidate_header
     include_raw_affiliations = args.include_raw_affiliations
     force = args.force
     teiCoordinates = args.teiCoordinates
+    download = args.download
 
     client = grobid_client(config_path=config_path)
 
     start_time = time.time()
 
-    client.process(input_path, output_path, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates)
+    client.process(input_path, output_path, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download)
 
     runtime = round(time.time() - start_time, 3)
     print("runtime: %s seconds " % (runtime))
diff --git a/test-cache.py b/test-cache.py
new file mode 100644
index 0000000..c9d2bce
--- /dev/null
+++ b/test-cache.py
@@ -0,0 +1,27 @@
+'''
+Recursively apply GROBID to the PDFs present in a file tree via the grobid client and keep the output XML files in an in-memory cache instead of downloading them locally.
+'''
+
+import os
+import re
+import json
+import time
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import subprocess
+import xml.etree.ElementTree as ET
+
+grobid = __import__('grobid-client')
+
+if __name__ == '__main__':
+
+    client = grobid.grobid_client(config_path="./config.json")
+    input_path = "/mnt/data/covid/data/"
+    for root, _, _ in os.walk(input_path):
+        client.process(root, root, 10, "processFulltextDocument", False, 1, 0, True, True, True, False, False)
+        print(root)
+
+    # client.cache is a list of (file name, file path, XML string) tuples
+    print(client.cache)
+
diff --git a/test.py b/test.py
index 6272e58..763a2a7 100644
--- a/test.py
+++ b/test.py
@@ -1,5 +1,5 @@
 '''
-Recursively apply GROBID to the PDF present in a file tree via the grobid client.
+Recursively apply GROBID to the PDFs present in a file tree via the grobid client and download the output XML files locally.
 '''
 
 import os
@@ -10,6 +10,6 @@
 
 client = grobid.grobid_client(config_path="./config.json")
 input_path = "/mnt/data/covid/data/"
 for root, _, _ in os.walk(input_path):
-    client.process(root, root, 10, "processFulltextDocument", False, 1, 0, True, True, True, False)
+    client.process(root, root, 10, "processFulltextDocument", False, 1, 0, True, True, True, False, True)
     print(root)
\ No newline at end of file
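
Usage sketch (not part of the patch): a minimal example of how the new cache mode is meant to be consumed. "./pdfs" is a hypothetical input directory; the config file name and the positional argument order (input, output, n, service, generateIDs, consolidate_header, consolidate_citations, include_raw_citations, include_raw_affiliations, force, teiCoordinates, download) are taken from the patch itself.

    # Run the client in cache mode (download=False): results accumulate in
    # client.cache instead of being written next to the PDFs.
    grobid = __import__('grobid-client')  # the module name contains a dash

    client = grobid.grobid_client(config_path="./config.json")
    client.process("./pdfs", "./pdfs", 10, "processFulltextDocument",
                   False, 1, 0, True, True, True, False, False)  # last arg: download=False

    # Each entry is the (pdf_file_name, pdf_file, tei_xml) tuple returned by process_pdf.
    for pdf_file_name, pdf_file, tei_xml in client.cache:
        print(pdf_file_name, len(tei_xml))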
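
Continuing the sketch above, the cached TEI strings can be parsed in place with xml.etree.ElementTree, which test-cache.py already imports. The namespace below is the standard TEI namespace that GROBID's XML output declares; the exact location of the title element is an assumption about that output.

    import xml.etree.ElementTree as ET

    TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}  # standard TEI namespace

    for pdf_file_name, pdf_file, tei_xml in client.cache:
        root = ET.fromstring(tei_xml)
        # GROBID normally places the document title under teiHeader/fileDesc/titleStmt
        title = root.find(".//tei:titleStmt/tei:title", TEI_NS)
        print(pdf_file_name, "->", title.text if title is not None else "(no title)")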