-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper.py
84 lines (68 loc) · 2.92 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Import necessary libraries
import os
from typing import Optional

import requests
# Retrieve the Sensible API key from the SENSIBLE_API_KEY environment variable.
# The placeholder is kept as the fallback so the value can still be filled in
# by hand, but reading the environment keeps the real key out of the source.
SENSIBLE_API_KEY = os.environ.get("SENSIBLE_API_KEY", "INSERT YOUR API KEY HERE")
# Define the Bearer token prefix
BEARER = "Bearer"
# Define the function to extract content from a PDF file
def extract_content(
    d_type: str, d_name: str, env: str, timeout: Optional[float] = 60.0
) -> Optional[dict]:
    """Upload a local PDF to the Sensible extraction endpoint.

    Args:
        d_type: Document type, inserted into the URL path.
        d_name: Path of the PDF file to upload.
        env: Value sent as the ``environment`` query parameter.
        timeout: Passed to ``requests.post`` as ``timeout``; ``None`` waits
            indefinitely (the behavior before this parameter existed).

    Returns:
        The decoded JSON body when the API answers with HTTP 200, otherwise
        ``None`` after printing the error body.
    """
    # Construct the URL for the API endpoint
    url = "https://api.sensible.so/v0/extract/{}?environment={}".format(d_type, env)
    # Define the headers for the API request
    headers = {
        "Authorization": "Bearer {}".format(SENSIBLE_API_KEY),
        "content-type": "application/pdf",
    }
    # Read the PDF as raw bytes; the handle is closed before the request is sent
    with open(d_name, "rb") as fp:
        pdf_file = fp.read()
    # Send a POST request to the API; the timeout keeps a stalled server from
    # blocking the script forever
    response = requests.post(url, headers=headers, data=pdf_file, timeout=timeout)
    print("Extraction Status code: {}".format(response.status_code))
    # If the request is successful, return the JSON response
    if response.status_code == 200:
        return response.json()
    # If the request fails, print the error body. It is not guaranteed to be
    # JSON (e.g. a gateway error page), so fall back to the raw text.
    try:
        print(response.json())
    except ValueError:
        print(response.text)
    return None
# Define the function to convert the extracted content to an Excel file
def convert_to_excel(ids: str, timeout: Optional[float] = 60.0) -> Optional[str]:
    """Ask the Sensible API for an Excel rendering of one or more extractions.

    Args:
        ids: Extraction id (or ids) inserted into the URL path as-is.
        timeout: Passed to ``requests.get`` as ``timeout``; ``None`` waits
            indefinitely (the behavior before this parameter existed).

    Returns:
        The value of the ``"url"`` key of the JSON response when the API
        answers with HTTP 200, otherwise ``None`` after printing the error
        body.
    """
    # Construct the URL for the API endpoint
    url = "https://api.sensible.so/v0/generate_excel/{}".format(ids)
    # Define the headers for the API request
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer {}".format(SENSIBLE_API_KEY),
    }
    # Send a GET request to the API; the timeout keeps a stalled server from
    # blocking the script forever
    response = requests.get(url, headers=headers, timeout=timeout)
    print("Conversion Status code: {}".format(response.status_code))
    # If the request is successful, return the URL for the Excel file
    if response.status_code == 200:
        data = response.json()
        return data["url"]
    # If the request fails, print the error body. It is not guaranteed to be
    # JSON, so fall back to the raw text.
    try:
        print(response.json())
    except ValueError:
        print(response.text)
    return None
# Define the function to download the Excel file
def download_xlsx(url: str, d_name: str, timeout: Optional[float] = 60.0) -> None:
    """Download the file at *url* and save it as ``<d_name>.xlsx``.

    Args:
        url: Location of the file to download.
        d_name: Output path without extension; ``.xlsx`` is appended.
        timeout: Passed to ``requests.get`` as ``timeout``; ``None`` waits
            indefinitely (the behavior before this parameter existed).

    Nothing is written when the server answers with an error status; the
    status code is printed instead, matching how the other helpers in this
    file report failures.
    """
    # Send a GET request to the URL and get the response
    response = requests.get(url, timeout=timeout)
    # Bail out on an error status so an error page is never saved as a
    # spreadsheet and reported as a successful download
    if not response.ok:
        print("Download Status code: {}".format(response.status_code))
        return
    # Define the filename for the downloaded file
    filename = "{}.xlsx".format(d_name)
    # Open the file in binary mode and write the response content to it
    with open(filename, "wb") as fp:
        fp.write(response.content)
    print("{} downloaded.".format(filename))
# Define the document type, environment, and document name
document_type = "covid_reports"
environment = "development"
document_name = "pdfs/20230301_Weekly_Epi_Update_132.pdf"

# Extract content from the PDF file (None when the extraction request fails)
pdf_content = extract_content(document_type, document_name, environment)

# Convert the extracted content to an Excel file. Skip the conversion when
# extraction failed: indexing None would raise TypeError.
url = convert_to_excel(pdf_content["id"]) if pdf_content is not None else None

# If the URL is not None, download the Excel file next to the source PDF.
# splitext drops the extension whatever its length, unlike a fixed [:-4] slice.
if url is not None:
    download_xlsx(url, os.path.splitext(document_name)[0])