|
| 1 | +# Last Updated | 2020-08-21 |
| 2 | +# Python Modules |
| 3 | +import os |
| 4 | +import sys |
| 5 | +import re |
| 6 | +from typing import List |
| 7 | + |
| 8 | +sys.path.insert(1, './manuscript-object/') |
| 9 | + |
| 10 | +# Third Party Modules |
| 11 | +import pandas as pd |
| 12 | +from datetime import datetime |
| 13 | + |
| 14 | +# Local Modules |
| 15 | +from digital_manuscript import BnF |
| 16 | +from recipe import Recipe |
| 17 | + |
# Version codes; used below as sub-directory names, file-name prefixes and
# metadata column suffixes.
versions = ['tc', 'tcn', 'tl']

# Maps each semantic property name to the short tag used as the column prefix
# in entry_metadata.csv (columns are named '<tag>_<version>').
prop_dict = {
    'animal': 'al', 'body_part': 'bp', 'currency': 'cn', 'definition': 'df',
    'environment': 'env', 'material': 'm', 'medical': 'md', 'measurement': 'ms',
    'music': 'mu', 'plant': 'pa', 'place': 'pl', 'personal_name': 'pn',
    'profession': 'pro', 'sensory': 'sn', 'tool': 'tl', 'time': 'tmp', 'weapon': 'wp',
    'german': 'de', 'greek': 'el', 'italian': 'it', 'latin': 'la', 'occitan': 'oc', 'poitevin': 'po',
}

# Property names, in the same order as prop_dict. Derived from prop_dict rather
# than spelled out a second time so the two can never drift apart.
properties = list(prop_dict)

# Root directory for all generated output: the directory the script is run from.
m_path = os.getcwd()
| 29 | + |
def update_metadata(manuscript: BnF) -> None:
    """
    Update <m_path>/metadata/entry_metadata.csv with the current manuscript. Build one row per entry and one
    column per metadata field, then save the table as CSV (without an index column).

    Columns written, in order: folio, folio_display, div_id, categories, heading_<version> for every version,
    then <tag>_<version> for every (property tag, version) pair from prop_dict and versions.

    Input:
      manuscript -- Python object of the manuscript defined in digital_manuscript.py
    Output:
      None
    """
    entries = list(manuscript.entries.values())

    # Column name -> list of per-entry values. Dict insertion order fixes the CSV column order.
    columns = {
        'folio': [entry.folio for entry in entries],
        # display form of the folio drops the zero padding
        'folio_display': [entry.folio.lstrip('0') for entry in entries],
        'div_id': [entry.identity for entry in entries],
        'categories': [';'.join(entry.categories) for entry in entries],
    }

    for version in versions:
        columns[f'heading_{version}'] = [
            entry.find_title(entry.versions[version], remove_del_text=True) for entry in entries
        ]

    for prop, tag in prop_dict.items():
        for version in versions:
            columns[f'{tag}_{version}'] = [
                '; '.join(entry.get_prop(prop=prop, version=version)) for entry in entries
            ]

    df = pd.DataFrame(columns)

    # to_csv does not create missing directories, so make sure the target exists first
    metadata_dir = f'{m_path}/metadata'
    os.makedirs(metadata_dir, exist_ok=True)
    df.to_csv(f'{metadata_dir}/entry_metadata.csv', index=False)
| 57 | + |
def update_entries(manuscript: BnF) -> None:
    """
    Update <m_path>/entries/ with the current manuscript. For each version, delete all existing
    entry files, then regenerate the text of every identified entry as both .txt and .xml and save it.

    Files are written as UTF-8 to <m_path>/entries/<txt|xml>/<version>/<version>_<identity>.<txt|xml>.

    Input:
      manuscript -- Python object of the manuscript defined in digital_manuscript.py
    Output:
      None
    """
    # The format directories are created even when there are no versions to write.
    for path in (f'{m_path}/entries/txt', f'{m_path}/entries/xml'):
        os.makedirs(path, exist_ok=True)

    for version in versions:
        txt_path = f'{m_path}/entries/txt/{version}'
        xml_path = f'{m_path}/entries/xml/{version}'

        # Create each version directory if needed, then clear out files from a previous run.
        for path in (txt_path, xml_path):
            os.makedirs(path, exist_ok=True)
            for stale_name in os.listdir(path):
                os.remove(os.path.join(path, stale_name))

        # Write new files with manuscript object
        for identity, entry in manuscript.entries.items():
            if not identity:  # TODO: resolve issue of unidentified entries
                continue

            # TODO: ask for a naming convention
            filename_txt = f'{txt_path}/{version}_{entry.identity}.txt'
            filename_xml = f'{xml_path}/{version}_{entry.identity}.xml'

            content_txt = entry.text(version, xml=False)
            content_xml = entry.text(version, xml=True)

            # Context managers close the files even if a write fails; the explicit
            # encoding keeps the output independent of the platform's locale.
            with open(filename_txt, 'w', encoding='utf-8') as txt_file:
                txt_file.write(content_txt)

            with open(filename_xml, 'w', encoding='utf-8') as xml_file:
                xml_file.write(content_xml)
| 102 | + |
def update_all_folios(manuscript: BnF) -> None:
    """
    Update <m_path>/allFolios/ with the current manuscript. For each format (xml, txt) and each version,
    concatenate the text of every entry, separated by a blank line, and write the result as UTF-8 to
    <m_path>/allFolios/<format>/all_<version>.<format>.

    Input:
      manuscript -- Python object of the manuscript defined in digital_manuscript.py
    Output:
      None
    """
    for as_xml in (True, False):  # xml and txt respectively
        folder = 'xml' if as_xml else 'txt'
        out_dir = f'{m_path}/allFolios/{folder}'
        os.makedirs(out_dir, exist_ok=True)

        for version in versions:
            # Collect the pieces and join once instead of re-copying a growing string.
            parts = []
            for entry in manuscript.entries.values():
                entry_text = entry.text(version, xml=as_xml)
                # Empty texts before the first non-empty one are skipped so the file
                # never starts with a separator; later empty texts are kept.
                if parts or entry_text:
                    parts.append(entry_text)

            # write file
            with open(f'{out_dir}/all_{version}.{folder}', 'w', encoding='utf-8') as out_file:
                out_file.write('\n\n'.join(parts))
| 126 | + |
def update_ms(manuscript: BnF) -> None:
    """
    Update <m_path>/ms-txt/ from <m_path>/ms-xml/. For each version, walk ms-xml/<version>, strip the
    markup from every file, and save the plain text under the same relative path in ms-txt/<version>,
    with a '.xml' extension replaced by '.txt'.

    Input:
      manuscript -- Python object of the manuscript defined in digital_manuscript.py
                    (not used by this function)
    Output:
      None
    """
    # DOTALL lets '.' cross line breaks, so tags that span several lines are removed too.
    tag_pattern = re.compile(r'<.*?>', flags=re.DOTALL)

    for version in versions:
        xml_root = f'{m_path}/ms-xml/{version}'
        txt_root = f'{m_path}/ms-txt/{version}'

        for dirpath, _dirnames, filenames in os.walk(xml_root):
            # Mirror the directory being walked under the ms-txt tree. Paths are built from
            # the walked directory so that files in sub-directories resolve correctly.
            relative_dir = os.path.relpath(dirpath, xml_root)
            out_dir = os.path.normpath(os.path.join(txt_root, relative_dir))
            os.makedirs(out_dir, exist_ok=True)

            for filename in filenames:
                # read xml file; surrogateescape keeps undecodable bytes instead of raising
                xml_filepath = os.path.join(dirpath, filename)
                with open(xml_filepath, encoding='utf-8', errors='surrogateescape') as xml_file:
                    text = xml_file.read()

                # remove xml, trim surrounding spaces and newlines
                text = tag_pattern.sub('', text).strip(' \n')

                # Only the extension is swapped; the rest of the name is left alone.
                stem, extension = os.path.splitext(filename)
                txt_name = f'{stem}.txt' if extension == '.xml' else filename

                # write txt file with the same codec settings so escaped bytes round-trip
                txt_filepath = os.path.join(out_dir, txt_name)
                with open(txt_filepath, 'w', encoding='utf-8', errors='surrogateescape') as txt_file:
                    txt_file.write(text)
| 157 | + |
def update_time() -> None:
    """
    Rewrite the first line of ./update.py (relative to the current working directory) as
    '# Last Updated | YYYY-MM-DD' using today's local date. All other lines are kept unchanged.

    The first line is overwritten unconditionally, whatever it contained before.

    Output:
      None
    """
    script_path = './update.py'
    now_str = datetime.now().date().isoformat()

    # read the script and replace its first line
    with open(script_path, 'r', encoding='utf-8') as script:
        lines = script.read().split('\n')
    lines[0] = f'# Last Updated | {now_str}'

    # write modified text; the context manager guarantees the file is flushed and closed
    with open(script_path, 'w', encoding='utf-8') as script:
        script.write('\n'.join(lines))
| 173 | + |
def update():
    """
    Load the manuscript (without corrections applied) and regenerate every derived output:
    metadata, entries, ms-txt and allFolios, in that order, printing a progress line before each.
    Finally refresh the timestamp via update_time().
    """
    manuscript = BnF(apply_corrections=False)

    # (progress message, function to run) pairs, executed top to bottom
    steps = (
        ('Updating metadata', update_metadata),
        ('Updating entries', update_entries),
        ('Updating ms-txt', update_ms),
        ('Updating allFolios', update_all_folios),
    )
    for message, run_step in steps:
        print(message)
        run_step(manuscript)

    update_time()


if __name__ == "__main__":
    update()
0 commit comments