-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathphase1.py
92 lines (76 loc) · 3.12 KB
/
phase1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Revisions:
# 1) Separation of paragraphs
# 2) Periods (.) and commas (,) must directly follow the previous word, with no space in between
# 3) Highlighting areas for a phrase
from pathlib import Path
import pandas as pd
from bs4 import Comment, BeautifulSoup as Soup
import os
from os import listdir
def find_csv_filenames(path_to_dir, suffix=".csv"):
    """Return the names of the entries in *path_to_dir* ending with *suffix*.

    Only the bare entry names are returned (not joined with the directory),
    in the order ``listdir`` yields them.
    """
    matches = []
    for entry in listdir(path_to_dir):
        if entry.endswith(suffix):
            matches.append(entry)
    return matches
def _strip_trailing_space(pieces):
    """Remove trailing whitespace from the end of the accumulated pieces.

    Equivalent to ``"".join(pieces).rstrip()`` but done in place on the list,
    dropping pieces that become empty.
    """
    while pieces:
        pieces[-1] = pieces[-1].rstrip()
        if pieces[-1]:
            break
        pieces.pop()


def _render_tokens(words, tags):
    """Build the HTML fragment for parallel sequences of words and NER tags."""
    from html import escape  # local import keeps this block self-contained

    span_close = "</span>"
    # Opening <span> (background colour) for each highlighted NER tag.
    span_open = {
        "ORG": """<span style="background-color: Tomato">""",
        "PER": """<span style="background-color: SkyBlue">""",
        "LOC": """<span style="background-color: BurlyWood">""",
        "DATE": """<span style="background-color: MediumOrchid">""",
        "MISC": """<span style="background-color: Chartreuse">""",
    }

    pieces = []
    for word, tag in zip(words, tags):
        if tag == "PARA":
            # Empty CSV cell: paragraph separator.
            pieces.append("""<p></p>""")
            continue

        # Escape markup characters so a word such as "<b" or "&" is rendered
        # as text instead of being parsed as HTML.
        token = escape(str(word), quote=False)

        if tag in span_open:
            pieces.append(span_open[tag] + token + span_close + " ")
            continue

        # V2 - no space before punctuation: anything that is not purely
        # letters/digits is attached to the previous token.  isalnum() is
        # used (rather than isalpha()) so that plain numbers are treated as
        # words and are not glued to the word before them.
        if not str(word).isalnum():
            _strip_trailing_space(pieces)
        pieces.append(token + " ")

    return "".join(pieces)


def processcsv(file):
    """Render one NER-annotated CSV file into a highlighted HTML page.

    The CSV is read from the module-level ``input_folder``; its ``word`` and
    ``NER`` columns are turned into HTML where PER/ORG/LOC/DATE/MISC words get
    a coloured background and empty cells start a new paragraph.  The result
    is inserted after every ``<h2>`` of the module-level ``template`` whose
    text contains "Text", and the page is written to the module-level
    ``output_folder``.

    Args:
        file: Name of the CSV file inside ``input_folder``.  The output name
            is derived from it by replacing "input" with "output" and ".csv"
            with ".html".
    """
    # V2 - Modify variable for different files
    file_to_open = input_folder / file

    # V2 - To detect empty cells as new paragraph.  Only truly empty cells
    # are treated as missing: pandas' default NA list would also swallow
    # ordinary words such as "NA", "null" or "None".  dtype=str keeps every
    # cell a string.
    df = pd.read_csv(
        file_to_open,
        header=0,
        dtype=str,
        keep_default_na=False,
        na_values=[""],
    ).fillna(value="PARA")

    # prepare HTML code for the text based on the NER tag of each word
    text = _render_tokens(df["word"], df["NER"])

    # The context manager guarantees the template handle is closed.
    with open(template, "r", encoding="utf-8") as template_file:
        soup = Soup(template_file.read(), "html.parser")
    insert = Soup(text, "html.parser")

    # remove existing paragraphs
    for paragraph in soup.find_all("p"):
        paragraph.decompose()

    # insert the generated text after the matching header
    for heading in soup.find_all("h2"):
        if "Text" in heading.text:
            heading.insert_after(insert)

    # remove comments from HTML file (skipped if the template has no <body>)
    body = soup.find("body")
    if body is not None:
        for comment in body(text=lambda node: isinstance(node, Comment)):
            comment.extract()

    # V2 - save edits to HTML file
    output = file.replace("input", "output")
    output = output.replace(".csv", ".html")
    output = os.path.join(output_folder, output)
    with open(output, "w", encoding="utf-8") as html_file:
        html_file.write(str(soup))
# Folders and template are resolved against the current working directory,
# so the script must be started from the directory that contains them.
output_folder = Path.cwd() / "phase_1_output"
template = Path.cwd() / "template-v2.html"
input_folder = Path.cwd() / "phase_1_input"

# Create the output folder up front: processcsv() opens files inside it and
# would otherwise fail with FileNotFoundError when the folder is missing.
output_folder.mkdir(parents=True, exist_ok=True)

# Convert every CSV file found in the input folder.
filenames = find_csv_filenames(input_folder)
for file in filenames:
    processcsv(file)