Skip to content

Commit fb36b76

Browse files
committed
Move update.py and check_update.py from parent directory into this repo.
1 parent a636e13 commit fb36b76

File tree

2 files changed

+197
-4
lines changed

2 files changed

+197
-4
lines changed

check_update.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1+
# Last Updated | 2019-11-10
2+
13
import os
24
from datetime import datetime
35

4-
5-
# change threshold to 7 minutes
66
def check_update():
77
now = datetime.strptime(str(datetime.now()).split(' ')[0], '%Y-%m-%d')
88
with open('./update.py', 'r') as f:
@@ -11,7 +11,7 @@ def check_update():
1111
timestamp = datetime.strptime(comment.split('|')[1].strip(), '%Y-%m-%d')
1212

1313
if timestamp < now:
14-
print(f'The repository has not been updated since {str(timestamp)}. Please run update.py on the day of merging the branch.')
14+
print('The repository has not been updated since ' + str(timestamp) + '. Please run update.py on the day of merging the branch.')
1515
assert False
1616

17-
check_update()
17+
check_update()

update.py

+193
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
# Last Updated | 2020-08-21
2+
# Python Modules
3+
import os
4+
import sys
5+
import re
6+
from typing import List
7+
8+
sys.path.insert(1, './manuscript-object/')
9+
10+
# Third Party Modules
11+
import pandas as pd
12+
from datetime import datetime
13+
14+
# Local Modules
15+
from digital_manuscript import BnF
16+
from recipe import Recipe
17+
18+
# Version codes that the update functions iterate over.
versions = ['tc', 'tcn', 'tl']

# Property name -> short tag. update_metadata uses the tag as the prefix of
# the per-version metadata columns (f'{tag}_{version}').
prop_dict = {
    'animal': 'al',
    'body_part': 'bp',
    'currency': 'cn',
    'definition': 'df',
    'environment': 'env',
    'material': 'm',
    'medical': 'md',
    'measurement': 'ms',
    'music': 'mu',
    'plant': 'pa',
    'place': 'pl',
    'personal_name': 'pn',
    'profession': 'pro',
    'sensory': 'sn',
    'tool': 'tl',
    'time': 'tmp',
    'weapon': 'wp',
    'german': 'de',
    'greek': 'el',
    'italian': 'it',
    'latin': 'la',
    'occitan': 'oc',
    'poitevin': 'po',
}

# Property names, in the same order as the keys of prop_dict.
properties = list(prop_dict)

# Base directory for everything that is read and written: the working
# directory at the moment this module is loaded.
m_path = os.getcwd()
29+
30+
def update_metadata(manuscript: BnF) -> None:
    """
    Regenerate metadata/entry_metadata.csv (under m_path) from the current manuscript.

    Builds a Pandas DataFrame with one row per value of manuscript.entries, derives the
    folio, identity, category and heading columns plus one '<tag>_<version>' column for
    every pair of prop_dict tag and version, drops the helper column holding the entry
    objects, and writes the result without the index.

    Input:
      manuscript -- Python object of the manuscript defined in digital_manuscript.py
    Output:
      None
    """
    # one row per entry; the 'entry' column temporarily holds the entry objects
    df = pd.DataFrame(data=manuscript.entries.values(), columns=['entry'])
    entries = df['entry']

    df['folio'] = entries.apply(lambda e: e.folio)
    df['folio_display'] = entries.apply(lambda e: e.folio.lstrip('0'))
    df['div_id'] = entries.apply(lambda e: e.identity)
    df['categories'] = entries.apply(lambda e: ';'.join(e.categories))

    # heading columns, in the fixed order tc, tcn, tl
    for heading_version in ('tc', 'tcn', 'tl'):
        df[f'heading_{heading_version}'] = entries.apply(
            lambda e, v=heading_version: e.find_title(e.versions[v], remove_del_text=True)
        )

    # one column per (property tag, version) pair, values joined with '; '
    for prop, tag in prop_dict.items():
        for version in versions:
            column = f'{tag}_{version}'
            df[column] = entries.apply(
                lambda e, p=prop, v=version: '; '.join(e.get_prop(prop=p, version=v))
            )

    # the entry objects would only be written out as memory addresses, so leave them out
    df = df.drop(columns=['entry'])

    df.to_csv(f'{m_path}/metadata/entry_metadata.csv', index=False)
57+
58+
def update_entries(manuscript: BnF) -> None:
    """
    Regenerate the per-entry files under {m_path}/entries/ from the current manuscript.

    For each version in `versions`, the directories entries/txt/<version> and
    entries/xml/<version> are created if missing and emptied otherwise. Then one
    '<version>_<identity>.txt' and one '<version>_<identity>.xml' file is written for
    every entry whose key in manuscript.entries is truthy.

    Input:
      manuscript -- Python object of the manuscript defined in digital_manuscript.py
    Output:
      None
    """
    # Parents first, so the per-version directories below can be created.
    for path in [f'{m_path}/entries', f'{m_path}/entries/txt', f'{m_path}/entries/xml']:
        os.makedirs(path, exist_ok=True)

    for version in versions:
        txt_path = f'{m_path}/entries/txt/{version}'
        xml_path = f'{m_path}/entries/xml/{version}'

        # Create the directory if needed, then remove whatever files it already holds.
        # A freshly created directory is empty, so the removal loop is a no-op for it.
        for path in (txt_path, xml_path):
            os.makedirs(path, exist_ok=True)
            for stale_name in os.listdir(path):
                os.remove(os.path.join(path, stale_name))

        # Write new files with manuscript object
        for identity, entry in manuscript.entries.items():
            if not identity:  # TODO: resolve issue of unidentified entries
                continue

            # TODO: ask for a naming convention
            filename_txt = f'{txt_path}/{version}_{entry.identity}.txt'
            filename_xml = f'{xml_path}/{version}_{entry.identity}.xml'

            # Render both variants before opening anything, so a failure while
            # rendering cannot leave a truncated file behind.
            content_txt = entry.text(version, xml=False)
            content_xml = entry.text(version, xml=True)

            # Context managers close the handles even if a write raises. The encoding
            # is pinned so the output does not depend on the platform's locale.
            with open(filename_txt, 'w', encoding='utf-8') as f_txt:
                f_txt.write(content_txt)

            with open(filename_xml, 'w', encoding='utf-8') as f_xml:
                f_xml.write(content_xml)
102+
103+
def update_all_folios(manuscript: BnF) -> None:
    """
    Regenerate {m_path}/allFolios/ from the current manuscript.

    For both output formats (xml and txt) and every version in `versions`, the text of
    all entries is concatenated, separated by two line breaks, and written to
    allFolios/<format>/all_<version>.<format>.

    Input:
      manuscript -- Python object of the manuscript defined in digital_manuscript.py
    Output:
      None
    """
    for as_xml in [True, False]:  # xml and txt respectively
        folder = 'xml' if as_xml else 'txt'

        for version in versions:
            # Collect the entry texts and join them once, instead of rebuilding an
            # ever-growing string for each entry.
            parts = []
            for entry in manuscript.entries.values():
                entry_text = entry.text(version, xml=as_xml)
                # Empty texts are skipped until the first non-empty one has been
                # collected, so the file never starts with a separator. Later empty
                # texts are kept and still contribute a separator.
                if parts or entry_text:
                    parts.append(entry_text)

            # write file; the context manager closes the handle even if the write
            # fails, and the encoding is pinned so output is platform-independent
            target = f'{m_path}/allFolios/{folder}/all_{version}.{folder}'
            with open(target, 'w', encoding='utf-8') as out:
                out.write('\n\n'.join(parts))
126+
127+
def update_ms(manuscript: BnF) -> None:
    """
    Regenerate {m_path}/ms-txt/ from the XML files in {m_path}/ms-xml/.

    For every version in `versions`, each file found under ms-xml/<version> is read,
    everything between '<' and the next '>' (tags, including ones spanning several
    lines) is removed, leading and trailing spaces and newlines are stripped, and the
    result is written to the matching location under ms-txt/<version>. A '.xml'
    extension becomes '.txt'; other file names are kept unchanged.

    Input:
      manuscript -- Python object of the manuscript defined in digital_manuscript.py
                    (not used by this function; kept for a uniform signature)
    Output:
      None
    """
    # DOTALL lets '.' cross line breaks, so multi-line tags are removed without
    # temporarily replacing newlines by a sentinel that the text itself might contain.
    tag_pattern = re.compile(r'<.*?>', re.DOTALL)

    for version in versions:
        xml_dir = f'{m_path}/ms-xml/{version}'
        txt_dir = f'{m_path}/ms-txt/{version}'

        for root, _dirs, filenames in os.walk(xml_dir):
            if not filenames:
                continue

            # os.walk yields paths that start with xml_dir; mirror the remainder under
            # txt_dir. Only this known prefix is swapped, so an "xml" elsewhere in the
            # working directory or in a file name is never rewritten.
            out_dir = txt_dir + root[len(xml_dir):]
            os.makedirs(out_dir, exist_ok=True)

            for filename in filenames:
                # read xml file
                with open(os.path.join(root, filename), encoding="utf-8",
                          errors="surrogateescape") as xml_file:
                    text = xml_file.read()

                # remove xml, normalize whitespace
                text = tag_pattern.sub('', text)
                text = text.strip(' \n')

                # write txt file
                stem, extension = os.path.splitext(filename)
                txt_name = f'{stem}.txt' if extension == '.xml' else filename
                # Same codec and error handler as the read, so bytes that could not be
                # decoded are written back unchanged instead of raising on output.
                with open(os.path.join(out_dir, txt_name), 'w', encoding="utf-8",
                          errors="surrogateescape") as txt_file:
                    txt_file.write(text)
157+
158+
def update_time():
    """
    Rewrite the first line of ./update.py as '# Last Updated | YYYY-MM-DD' using today's date.

    The path is relative to the current working directory. All other lines of the file
    are written back unchanged.
    """
    # Initialize date to write
    now_str = datetime.now().date().isoformat()

    # open file, extract text, and modify
    with open('./update.py', 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    lines[0] = f'# Last Updated | {now_str}'

    # write modified text; the context manager guarantees the file is flushed and
    # closed (the previous code referenced f.close without calling it)
    with open('./update.py', 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
173+
174+
def update():
    """
    Load the manuscript without corrections, run every regeneration step in order
    (metadata, entries, ms-txt, allFolios), and finally refresh the timestamp line.
    """
    manuscript = BnF(apply_corrections=False)

    # (progress message, step) pairs, executed in this order
    steps = (
        ('Updating metadata', update_metadata),
        ('Updating entries', update_entries),
        ('Updating ms-txt', update_ms),
        ('Updating allFolios', update_all_folios),
    )
    for message, run_step in steps:
        print(message)
        run_step(manuscript)

    update_time()
191+
192+
# Run the full update only when this file is executed as a script, not on import.
if __name__ == '__main__':
    update()

0 commit comments

Comments
 (0)