Skip to content

Commit 82ee572

Browse files
committed
Merge issue24 with master
2 parents 48557a3 + 36db60e commit 82ee572

9 files changed

+1178
-93
lines changed

.vscode/settings.json

-3
This file was deleted.

check_update.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Last Updated | 2019-11-10
2+
3+
import os
4+
from datetime import datetime
5+
6+
def check_update():
    """Fail if the date stamped in ./update.py is earlier than today.

    The first line of ./update.py is expected to look like
    ``# Last Updated | YYYY-MM-DD``; the text after the first ``|`` is
    parsed as the date of the last update.

    Raises:
        AssertionError: if the recorded date is before today's date.
        IndexError: if the first line contains no ``|`` separator.
        ValueError: if the text after ``|`` is not a ``YYYY-MM-DD`` date.
    """
    # Truncate to midnight so only the calendar date is compared.
    today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

    # Only the header line is needed, so avoid reading the whole file.
    with open('./update.py', 'r') as f:
        header = f.readline()
    timestamp = datetime.strptime(header.split('|')[1].strip(), '%Y-%m-%d')

    if timestamp < today:
        message = ('The repository has not been updated since ' + str(timestamp)
                   + '. Please run update.py on the day of merging the branch.')
        print(message)
        # Raise explicitly: a bare `assert False` is removed under `python -O`,
        # which would let a stale repository pass this check silently.
        raise AssertionError(message)
16+
17+
check_update()

jsonify.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def read_manuscript(manuscript: BnF, df_dict: Dict[str, pd.DataFrame]) -> Dict[s
5353
df_dict[prop] = df
5454
return df_dict
5555

56-
def df_to_dict(df = pd.DataFrame) -> Dict[str, Tuple[Reciple, str]]:
56+
def df_to_dict(df = pd.DataFrame) -> Dict[str, Tuple[Recipe, str]]:
5757
"""
5858
Convert dataframe into dict of the following format:
5959
'prefLabel_en1': [(entry1, verbatim_term1), (entry2, verbatim_term2), ...],

manuscript_helpers.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
'music', 'plant', 'place', 'personal_name',
1111
'profession', 'sensory', 'tool', 'time', 'weapon']
1212

13-
cwd = os.getcwd()
14-
m_path = cwd if 'manuscript-object' not in cwd else f'{cwd}/../'
15-
m_k_data_to_thesaurus = f'{m_path}/manuscript-object/thesaurus'
13+
manuscript_data_path = os.path.dirname(os.getcwd()) + "/m-k-manuscript-data"
14+
thesaurus_path = os.getcwd() + "/thesaurus"
15+
assert(os.path.exists(manuscript_data_path)), ("Could not find manuscript data directory: " + manuscript_data_path)
16+
print("Using manuscript data directory:", manuscript_data_path)
1617

1718
def use_thesaurus(entries: Dict[str, Recipe]) -> List[Recipe]:
1819
"""
@@ -27,7 +28,7 @@ def use_thesaurus(entries: Dict[str, Recipe]) -> List[Recipe]:
2728
Output:
2829
entries: List[Recipe] -- same as above, but with the thesaurus corrections applied.
2930
"""
30-
if not os.path.exists(m_k_data_to_thesaurus):
31+
if not os.path.exists(thesaurus_path):
3132
print('Thesaurus not found. Generating now.')
3233
os.system(f'python {cwd}/manuscript-object/thesaurus.py')
3334
print('Finished Generating Thesaurus')
@@ -36,7 +37,7 @@ def use_thesaurus(entries: Dict[str, Recipe]) -> List[Recipe]:
3637

3738
for prop in properties:
3839
dct = {} # {verbatim_term: prefLabel_en}
39-
df = pd.read_csv(f'{m_k_data_to_thesaurus}/{prop}.csv')
40+
df = pd.read_csv(f'{thesaurus_path}/{prop}.csv')
4041

4142
# manual_df = manual_corrections[manual_corrections['property'] == prop]
4243
# manual_dict = {} # verbatim_term, prefLabel_en pairs
@@ -108,13 +109,14 @@ def generate_complete_manuscript(apply_corrections=True) -> Dict[str, Recipe]:
108109
TODO: Instead of going version by version, consider going folio by folio.
109110
"""
110111
for version in versions:
111-
dir_path = f'{m_path}/ms-xml/{version}/'
112+
dir_path = f'{manuscript_data_path}/ms-xml/{version}/'
112113
entry_dict = OrderedDict()
113114

114115
for r, d, f in os.walk(dir_path):
115116
for filename in f: # iterate through /ms-xml/{version} folder
116117
# split folio by entry
117118
info = process_file(f'{dir_path}{filename}')
119+
print(f"Loading folio {filename}...")
118120
for identity, text in info.items(): # add each entry to dictionary
119121
entry_dict[identity] = text
120122

@@ -137,6 +139,7 @@ def generate_complete_manuscript(apply_corrections=True) -> Dict[str, Recipe]:
137139
old.versions['tl'] + '\n\n' + tl)
138140
else:
139141
entries[entry_id] = Recipe(entry_id, folio, tc, tcn, tl)
142+
print(f"Generating Recipe object for {entry_id}...")
140143

141144
# if specified, manually rewrite entry properties based on thesaurus.
142145
if apply_corrections:

projects/issue1588/property_count.csv

+5-77
Large diffs are not rendered by default.

projects/issue1588/property_count.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@
3636

3737
# %%
3838
# set up paths
39-
base = os.path.dirname(os.getcwd()) # .../m-k-manuscript-data/"
40-
entry_metadata_path = base + "/metadata/entry_metadata.csv"
39+
manuscript_data_path = os.path.dirname(os.getcwd()) + "/m-k-manuscript-data"
40+
entry_metadata_path = manuscript_data_path + "/metadata/entry_metadata.csv"
4141
output_path = "property_count.csv"
4242

4343
# %%

recipe.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@
1818
class Recipe:
1919

2020
def __init__(self, identity: str, folio: str, tc: str, tcn: str, tl: str) -> None:
21-
22-
print(f"Loading entry with folio {folio}, ID {identity}")
2321

2422
self.identity: str = identity # id of the entry
2523
self.folio: str = folio # folio of the entry
@@ -51,7 +49,7 @@ def find_categories(self, text: str) -> List[str]:
5149
return categories[0].split('"')[1].split(';')
5250
return []
5351

54-
def find_title(self, text: str) -> str:
52+
def find_title(self, text: str, remove_del_text=False) -> str:
5553
"""
5654
Use a regex to find text in between head tags. Specifying the version is not necessary since it is
5755
included in the dict comprehension statement where this function is called.
@@ -65,7 +63,10 @@ def find_title(self, text: str) -> str:
6563
text = re.sub(r'\s+', ' ', text.replace('\n', ' '))
6664

6765
titles = re_head.search(text)
68-
return '' if not titles else re_tags.sub('', titles[0])
66+
if remove_del_text:
67+
return '' if not titles else re.sub(r'\s+', ' ', re_tags.sub('', re.sub(r'<del>.*</del>', '', titles[0])))
68+
else:
69+
return '' if not titles else re_tags.sub('', titles[0])
6970

7071
def clean_length(self, text: str) -> int:
7172
# TODO: make it word count instead of character count.
@@ -170,6 +171,16 @@ def find_captions(self, version: str) -> List[str]:
170171

171172
def get_title(self, version: str = 'tl'):
    """ Getter method for the title stored under the given version key (default 'tl'). """
    titles_by_version = self.title
    return titles_by_version[version]
174+
175+
def get_head(self, text: str) -> str:
    """
    Return the first match of re_head in the text with everything matched by re_tags removed,
    or an empty string when re_head finds nothing.

    <sup> and </sup> are first rewritten to '[' and ']' so that editor supplied titles
    stay marked with square brackets in the result.
    """
    bracketed = text.replace('<sup>', '[').replace('</sup>', ']')

    match = re_head.search(bracketed)
    return re_tags.sub('', match[0]) if match else ''
173184

174185
def get_identity(self) -> str:
175186
""" Getter method for identity. """

0 commit comments

Comments
 (0)