-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathahs_extract_results.py
52 lines (45 loc) · 2.09 KB
/
ahs_extract_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python3
import os
import csv
from pdfminer.high_level import extract_text
import time
# Wall-clock timestamp captured when the module is loaded. NOTE(review): it is
# not read anywhere in the visible code, so no elapsed time is ever reported.
start_time = time.time()
# Disabled alternative kept for reference: build the results folder name from
# today's date instead of hard-coding it. NOTE(review): re-enabling it would
# also require `import datetime`, which this file does not import.
#results_foldername = 'ahs_jan_7'#''.join('ahs_'+datetime.date.today().strftime("%b-%d-%Y").lower().replace('-','_')[0:6])
def _is_phone(text):
    """Return True when *text* looks like a phone number.

    NOTE(review): the original code called an ``isphone`` helper that is not
    defined or imported anywhere in this file, so its exact rules are unknown.
    This stand-in accepts strings made only of digits and common phone
    punctuation that contain at least seven digits -- confirm against the
    real report layout.
    """
    allowed_punctuation = ' ()-+.'
    digit_count = sum(ch.isdigit() for ch in text)
    return digit_count >= 7 and all(
        ch.isdigit() or ch in allowed_punctuation for ch in text
    )


def extract_pdf(filename, *, text_extractor=None):
    """Take in a PDF covid result, return a dict of relevant information.

    The document text is split into lines; empty lines and lines of 75 or
    more characters are discarded. The remaining lines are scanned for
    labels, and the value is read from the line(s) that follow each label.

    Args:
        filename: Path of the PDF to read.
        text_extractor: Optional callable ``f(filename) -> str`` used to get
            the document text. Defaults to pdfminer's ``extract_text``.

    Returns:
        A dict with any of the keys ``'Accession #'``, ``'First Name'``,
        ``'Last Name'``, ``'DOB'`` and ``'Result'`` that could be found.
        Fields whose label or value is missing are simply left out instead
        of raising ``IndexError``.
    """
    if text_extractor is None:
        text_extractor = extract_text
    lines = [
        line for line in text_extractor(filename).split('\n')
        if 0 < len(line) < 75
    ]
    # Labels that may directly follow the name and must not be mistaken for
    # a continuation of the first name.
    name_stop_labels = ('DOB (Age) / Sex:', 'Ref Physician:')

    info = {}
    for i, line in enumerate(lines):
        # Value line for the current label, or None when the label is the
        # last line of the extracted text.
        following = lines[i + 1] if i + 1 < len(lines) else None

        if line == 'Accession #:' and following is not None:
            info['Accession #'] = following

        # In the extracted text the "Last, First" name is the line that
        # comes right after the 'Phone:' label.
        if line == 'Phone:' and following is not None:
            name_parts = following.split(',')
            info['Last Name'] = name_parts[0].strip()
            # A name without a comma has no separate first-name part.
            info['First Name'] = (
                name_parts[1].strip().replace(' Ref Physician:', '')
                if len(name_parts) > 1 else ''
            )
            if i + 2 < len(lines):
                extra = lines[i + 2]
                # A further line that is neither a phone number nor the next
                # label is treated as the rest of a wrapped first name.
                if not _is_phone(extra) and extra not in name_stop_labels:
                    info['First Name'] = ' '.join(
                        [info['First Name'], extra]
                    ).strip()

        if line.startswith('DOB') and following is not None:
            dob_tokens = following.split()
            if dob_tokens:
                # Only the first whitespace-separated token is kept.
                info['DOB'] = dob_tokens[0]

        if 'Result:' in line:
            info['Result'] = line.split('Result:', 1)[1].strip()
    return info
def extract_covid_results(results_foldername, base_dir=r'C:\Users\dangr\Downloads'):
    """Parse every PDF result in a folder and write the details to a CSV file.

    Each file in ``base_dir/results_foldername`` whose name ends in ``.pdf``
    (case-insensitive) is passed to ``extract_pdf``. The resulting rows are
    written to ``<folder name>.csv`` inside that same folder, one row per PDF,
    with a header row.

    Args:
        results_foldername: Name of the folder that contains the PDF results.
        base_dir: Directory that contains ``results_foldername``. Defaults to
            the location that was previously hard-coded.

    Raises:
        OSError: If the folder cannot be listed or the CSV cannot be written.
    """
    # Work with full paths instead of os.chdir() so the caller's working
    # directory is left untouched.
    folder = os.path.join(base_dir, results_foldername)

    rows = []
    # Sorted so the CSV row order does not depend on the OS listing order.
    for i, entry in enumerate(sorted(os.listdir(folder))):
        if entry.lower().endswith('.pdf'):
            rows.append(extract_pdf(os.path.join(folder, entry)))
            print(f'File {i} Completed')

    fields = ['Accession #', 'First Name', 'Last Name', 'DOB', 'Result']
    # normpath drops a trailing separator, which would otherwise make the
    # basename empty and produce a file called just '.csv'.
    csv_name = os.path.basename(os.path.normpath(folder)) + '.csv'
    # newline='' lets the csv module control line endings itself.
    with open(os.path.join(folder, csv_name), 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        writer.writerows(rows)
def main():
    """Run the extraction for the hard-coded ``ahs_11_3`` results folder."""
    results_foldername = 'ahs_11_3'
    extract_covid_results(results_foldername)
# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()