data_extraction.py
import json
import re

from processing import engine


def check_changes_in_headers():
    """Report discrepancies between the header rows of tables that share the same headTable."""
    def normalize(s):
        # Strip <s>N</s> markers, bare <s>/</s> tags, and all non-alphanumeric characters.
        return re.sub(r"(<s>\d+</s>)|(</?s>)|([^a-z0-9])", "", s.lower())

    more_than_one_table_query = '''
        SELECT headTable
        FROM tables
        WHERE relevancy = 1
        GROUP BY headTable
        HAVING count(headTable) > 1
        ORDER BY count(headTable) DESC;
    '''
    query = '''
        SELECT level, accepted_text
        FROM tables t
        LEFT JOIN csvs c ON t.correct_csv = c.csvId
        LEFT JOIN tables_tags tt ON (t.tableId = tt.tableId AND tt.tagId = 1)
        WHERE relevancy = 1
          AND tt.tagId IS NULL
          AND headTable = %s
        ORDER BY headTable, level;
    '''
    with engine.connect() as conn:
        head_ids = [head_id[0] for head_id in conn.execute(more_than_one_table_query)]
        for head_id in head_ids:
            results = conn.execute(query, (head_id,))
            data = []
            for item in results:
                table = json.loads(item[1])
                if len(table) > 0:
                    data.append((item[0], table[0]))
            if not data:
                continue
            discrepancies = False
            first_headers = data[0][1]
            for (level, headers) in data:
                # zip() guards against header rows of different lengths.
                for first, header in zip(first_headers, headers):
                    if normalize(header) != normalize(first):
                        print(f'"{first}" vs')
                        print(f'"{header}" for "{head_id}" on level "{level}"')
                        print()
                        discrepancies = True
            if discrepancies:
                print("=======================================================\n")

def populate_headers_table():
    """Upsert one headers row per (tableId, header_idx) from each latest-PDF table's combinedConText."""
    get_jsons = '''
        SELECT t.tableId, t.combinedConText
        FROM tables t
        LEFT JOIN tables_tags tt
            ON tt.tableId = t.tableId AND tt.tagId = 4
        LEFT JOIN pdfs p ON t.pdfName = p.pdfName
        WHERE combinedConText IS NOT NULL
          AND tt.tableId IS NULL
          AND p.latest = 1
        GROUP BY t.tableId;
    '''
    get_query = "SELECT tableId, header_idx, header_title FROM headers WHERE tableId = %s AND header_idx = %s;"
    insert_query = "INSERT INTO headers (tableId, header_idx, header_title) VALUES (%s, %s, %s);"
    update_query = "UPDATE headers SET header_title = %s WHERE tableId = %s AND header_idx = %s;"
    with engine.connect() as conn:
        jsons = conn.execute(get_jsons)
        for j in jsons:
            table_id = j[0]
            table = json.loads(j[1])
            headers = table[0]  # first row of the table JSON is the header row
            for header_idx, header_title in enumerate(headers):
                existing = list(conn.execute(get_query, (table_id, header_idx)))
                if len(existing) > 0:
                    if existing[0][2] == header_title:
                        continue
                    conn.execute(update_query, (header_title, table_id, header_idx))
                    print(f"updated '{existing[0][2]}' to '{header_title}' at index {header_idx} in {table_id}")
                    continue
                conn.execute(insert_query, (table_id, header_idx, header_title))
    print("all done")

def populate_issues_table():
    """Insert one issues row per data row of every header-tagged table, then copy tagged cells into their columns."""
    get_tables = 'SELECT tableId, combinedConText FROM tables WHERE headers_tagged = 1;'
    get_tags = "SELECT header_idx, htag FROM headers_htags WHERE tableId = %s;"
    insert_query = "INSERT INTO issues (tableId, rowIndex, content) VALUES (%s, %s, %s);"

    def update_query(col):
        return f"UPDATE issues SET {col} = %s WHERE tableId = %s AND rowIndex = %s;"

    with engine.connect() as conn:
        tables = conn.execute(get_tables)
        for table in list(tables):
            table_id = table[0]
            full_table_content = json.loads(table[1])
            headers = full_table_content[0]
            tags = list(conn.execute(get_tags, (table_id,)))
            for row_index in range(1, len(full_table_content)):
                row = full_table_content[row_index]
                conn.execute(insert_query, (table_id, row_index, json.dumps([headers, row])))
                for tag in tags:
                    header_index = tag[0]
                    header_tag = tag[1].replace(' ', '_')  # 'loc kp' => 'loc_kp'
                    content = row[header_index]
                    conn.execute(update_query(header_tag), (content, table_id, row_index))
    print("all done")

def populate_latest_column():
    """Flag the PDF with the highest monitoring_year of each project as pdfs.latest = 1."""
    projects_query = 'SELECT application_id FROM projects;'
    highest_year_query = '''
        SELECT monitoring_year FROM pdfs WHERE application_id = %s ORDER BY monitoring_year DESC LIMIT 1;
    '''
    update_latest_query = 'UPDATE pdfs SET latest = 1 WHERE application_id = %s AND monitoring_year = %s;'
    with engine.connect() as conn:
        projects = [project[0] for project in conn.execute(projects_query)]
        for project in projects:
            highest_year = list(conn.execute(highest_year_query, (project,)))[0][0]
            conn.execute(update_latest_query, (project, highest_year))
    print("All done")

# def clear_issues():
#     with engine.connect() as conn:
#         conn.execute("DELETE FROM issues WHERE true;")

def get_tag_permutations():
    """Count and print how often each combination of header tags occurs across tagged tables."""
    get_tables = 'SELECT tableId, combinedConText FROM tables WHERE headers_tagged = 1;'
    get_index_tags = "SELECT header_idx, htag FROM headers_htags WHERE tableId = %s ORDER BY header_idx;"
    permutations_index_tag = {}
    permutations_index_tag_cols = {}
    permutations_tags = {}
    with engine.connect() as conn:
        tables = conn.execute(get_tables)
        for table in tables:
            table_id = table[0]
            col_count = len(json.loads(table[1])[0])
            index_tags = list(conn.execute(get_index_tags, (table_id,)))
            # Build three keys per table: tags only, index + tag, and index + tag prefixed with the column count.
            key_tag = ""
            key_index_tag = ""
            key_index_tag_col = f'({col_count} columns)'
            for i, _ in enumerate(index_tags):
                if i == 0:
                    key_index_tag_col += " "
                if i > 0:
                    key_tag += " + "
                    key_index_tag += " + "
                    key_index_tag_col += " + "
                idx = index_tags[i][0]
                tag_name = index_tags[i][1]
                key_tag += f'{tag_name}'
                key_index_tag += f'{idx} {tag_name}'
                key_index_tag_col += f'{idx} {tag_name}'
            if key_tag in permutations_tags:
                permutations_tags[key_tag][0] += 1
            else:
                permutations_tags[key_tag] = [1, table_id]
            if key_index_tag_col in permutations_index_tag_cols:
                permutations_index_tag_cols[key_index_tag_col][0] += 1
            else:
                permutations_index_tag_cols[key_index_tag_col] = [1, table_id]
            if key_index_tag in permutations_index_tag:
                permutations_index_tag[key_index_tag][0] += 1
            else:
                permutations_index_tag[key_index_tag] = [1, table_id]

    def print_results(dictionary, variant):
        # Sort by occurrence count, most frequent first.
        dictionary = sorted(dictionary.items(), key=lambda x: x[1][0], reverse=True)
        print(f"{len(dictionary)} {variant} permutations:\n")
        for tag, data in dictionary:
            count = data[0]
            example = data[1]
            print(f"{count}:\t{tag} (example tableId {example})")
        print()

    print_results(permutations_tags, "tag")
    print_results(permutations_index_tag, "tag+index")
    print_results(permutations_index_tag_cols, "tag+index+num_cols")

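# Sketch of the report format produced by print_results; the counts, tag names
# and tableIds below are invented purely for illustration:
#
#     2 tag permutations:
#
#     12:   loc + well (example tableId 123)
#     5:    loc + well + mer (example tableId 456)
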
def populate_meridians_from_headers():
    """Fill issues.loc_mer with a meridian-looking token (e.g. 'W4M') found in any of the row's headers."""
    meridian_regex = re.compile(r'w\dm?')
    get_tables = 'SELECT tableId, rowIndex, content FROM issues WHERE loc_mer IS NULL;'
    update_query = 'UPDATE issues SET loc_mer = %s WHERE tableId = %s AND rowIndex = %s;'
    with engine.connect() as conn:
        tables = conn.execute(get_tables)
        for table in tables:
            table_id = table[0]
            row_index = table[1]
            content = json.loads(table[2])
            headers = content[0]  # content is [headers, row]; only the headers are scanned
            for header in headers:
                found = meridian_regex.findall(header.lower())
                if found:
                    meridian = found[0].upper()
                    conn.execute(update_query, (meridian, table_id, row_index))

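# A quick sketch of the meridian_regex match, on a made-up header string:
#
#     re.compile(r'w\dm?').findall("location (nw 04-05-067-08 w4m)")
#     # -> ['w4m'], stored as 'W4M'
#
# Note the pattern also accepts a bare 'w4' without the trailing 'm'.
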
if __name__ == "__main__":
    # check_changes_in_headers()
    # populate_latest_column()
    # populate_headers_table()
    # populate_issues_table()
    # get_tag_permutations()
    populate_meridians_from_headers()