-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf2xml.py
67 lines (55 loc) · 2.37 KB
/
pdf2xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import requests
from multiprocessing import Pool
import json
def send_request(pdf_path, url_base='http://127.0.0.2:8070/', timeout=None):
    """
    Send a PDF to a Grobid server and save the returned XML next to the PDF.

    The XML is written to the same directory as the PDF, using the same file
    name with the extension replaced by ``.xml``. If that XML file already
    exists, the PDF is considered processed and nothing is sent.

    A Grobid server must be deployed first. Please follow the instructions at:
    https://grobid.readthedocs.io/en/latest.
    If you want to use your own request method, refer to:
    https://grobid.readthedocs.io/en/latest/Grobid-service/#grobid-web-services

    Args:
        pdf_path (str): The path to the PDF file to be processed.
        url_base (str, optional): The base URL of the Grobid server, with or
            without a trailing slash. Defaults to 'http://127.0.0.2:8070/'.
        timeout (float or tuple, optional): Timeout passed to ``requests.post``.
            Defaults to None, which waits indefinitely.

    Raises:
        OSError: If the PDF cannot be opened or the XML file cannot be written.

    Returns:
        None. Request failures (connection errors, HTTP error statuses, or a
        non-200 reply) are reported on stdout and no XML file is written, so
        the PDF can be retried on a later run.
    """
    # Work out where the XML goes before touching the PDF or the network.
    pdf_dir, pdf_filename = os.path.split(pdf_path)
    print(pdf_filename)
    xml_filename = os.path.splitext(pdf_filename)[0] + '.xml'
    xml_path = os.path.join(pdf_dir, xml_filename)

    # Already converted: skip without opening the PDF or contacting the server.
    if os.path.exists(xml_path):
        return

    # Tolerate a base URL given with or without its trailing slash.
    url = url_base.rstrip('/') + '/api/processFulltextDocument'
    form_data = {
        'consolidateCitations': '1',
        'segmentSentences': '1'
    }

    with open(pdf_path, 'rb') as pdf_file:
        files = {
            'input': pdf_file
        }
        try:
            response = requests.post(url, data=form_data, files=files, timeout=timeout)
            # Turn 4xx/5xx replies into an exception so an error page is never
            # saved as if it were the conversion result.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print('Error for', pdf_path, ':', e)
            return

    # Any other non-200 success code (e.g. 204 with an empty body) carries no
    # usable document; writing it would make later runs skip this PDF.
    if response.status_code != 200:
        print('Error for', pdf_path, ':', 'unexpected HTTP status', response.status_code)
        return

    # Save the response as XML text with proper encoding
    with open(xml_path, 'w', encoding='utf-8') as xml_file:
        xml_file.write(response.text)
# Example: convert many PDFs in parallel by mapping send_request over a
# pool of worker processes.
if __name__ == '__main__':
    # The JSON file must hold an object; its values are used as the PDF paths.
    index_path = './papers/gcb2020.json'
    with open(index_path, 'r', encoding='utf-8') as index_file:
        pdf_paths = list(json.load(index_file).values())

    # Number of worker processes sending requests concurrently
    worker_count = 6
    with Pool(worker_count) as workers:
        # Each worker calls send_request with a single PDF path.
        workers.map(send_request, pdf_paths)