cu-mkp
diff --git a/‎.vscode/settings.json
-3 b/‎.vscode/settings.json
-3
diff --git a/‎check_update.py
+17 b/‎check_update.py
+17
diff --git a/‎jsonify.py
+1-1 b/‎jsonify.py
+1-1
diff --git a/‎manuscript_helpers.py
+9-6 b/‎manuscript_helpers.py
+9-6
diff --git a/‎projects/issue1588/property_count.csv
+5-77 b/‎projects/issue1588/property_count.csv
+5-77
diff --git a/‎projects/issue1588/property_count.py
+2-2 b/‎projects/issue1588/property_count.py
+2-2
diff --git a/‎recipe.py
+15-4 b/‎recipe.py
+15-4
@@ -0,0 +1,17 @@
+# Last Updated | 2019-11-10
+
+import os
+from datetime import datetime
+
+def check_update():
+  """
+  Verify that update.py carries today's date in its header comment.
+
+  The first line of ./update.py is expected to have the form
+  '# Last Updated | YYYY-MM-DD'. If that date is earlier than today's date,
+  a reminder is printed and the check fails.
+
+  Raises:
+    AssertionError -- the recorded date is earlier than today.
+    IndexError -- the first line of update.py contains no '|' separator.
+    ValueError -- the text after '|' is not a YYYY-MM-DD date.
+  """
+  today = datetime.now().date()
+
+  # only the header line is needed, so read just that and release the file
+  with open('./update.py', 'r') as f:
+    comment = f.readline()
+
+  timestamp = datetime.strptime(comment.split('|')[1].strip(), '%Y-%m-%d')
+
+  if timestamp.date() < today:
+    message = 'The repository has not been updated since ' + str(timestamp) + '. Please run update.py on the day of merging the branch.'
+    print(message)
+    # raise explicitly: a bare `assert` is removed when Python runs with -O,
+    # which would let a stale repository pass this check silently
+    raise AssertionError(message)
+
+check_update()
@@ -53,7 +53,7 @@ def read_manuscript(manuscript: BnF, df_dict: Dict[str, pd.DataFrame]) -> Dict[s
           df_dict[prop] = df
   return df_dict
 
-def df_to_dict(df = pd.DataFrame) -> Dict[str, Tuple[Reciple, str]]:
+def df_to_dict(df = pd.DataFrame) -> Dict[str, Tuple[Recipe, str]]:
   """
   Convert dataframe into dict of the following format:
   'prefLabel_en1': [(entry1, verbatim_term1), (entry2, verbatim_term2), ...],
 
@@ -10,9 +10,10 @@
               'music', 'plant', 'place', 'personal_name',
               'profession', 'sensory', 'tool', 'time', 'weapon']
 
-cwd = os.getcwd()
-m_path = cwd if 'manuscript-object' not in cwd else f'{cwd}/../'
-m_k_data_to_thesaurus = f'{m_path}/manuscript-object/thesaurus'
+# `cwd` must stay defined: use_thesaurus() still interpolates it into the
+# os.system call that generates the thesaurus when it is missing.
+cwd = os.getcwd()
+# the manuscript data repository is looked up as a sibling of the working directory
+manuscript_data_path = os.path.dirname(cwd) + "/m-k-manuscript-data"
+thesaurus_path = cwd + "/thesaurus"
+# raise explicitly instead of using `assert`, which is stripped under `python -O`
+if not os.path.exists(manuscript_data_path):
+  raise AssertionError("Could not find manuscript data directory: " + manuscript_data_path)
+print("Using manuscript data directory:", manuscript_data_path)
 
 def use_thesaurus(entries: Dict[str, Recipe]) -> List[Recipe]:
   """
@@ -27,7 +28,7 @@ def use_thesaurus(entries: Dict[str, Recipe]) -> List[Recipe]:
   Output:
     entries: List[Recipe] -- same as above, but with the thesaurus corrections applied.
   """
-  if not os.path.exists(m_k_data_to_thesaurus):
+  if not os.path.exists(thesaurus_path):
     print('Thesaurus not found. Generating now.')
     os.system(f'python {cwd}/manuscript-object/thesaurus.py')
     print('Finished Generating Thesaurus')
@@ -36,7 +37,7 @@ def use_thesaurus(entries: Dict[str, Recipe]) -> List[Recipe]:
 
   for prop in properties:
     dct = {} # {verbatim_term: prefLabel_en}
-    df = pd.read_csv(f'{m_k_data_to_thesaurus}/{prop}.csv')
+    df = pd.read_csv(f'{thesaurus_path}/{prop}.csv')
 
     # manual_df = manual_corrections[manual_corrections['property'] == prop]
     # manual_dict = {} # verbatim_term, prefLabel_en pairs
@@ -108,13 +109,14 @@ def generate_complete_manuscript(apply_corrections=True) -> Dict[str, Recipe]:
   TODO: Instead of going version by version, consider going folio by folio. 
   """
   for version in versions: 
-    dir_path = f'{m_path}/ms-xml/{version}/'
+    dir_path = f'{manuscript_data_path}/ms-xml/{version}/'
     entry_dict = OrderedDict()
 
     for r, d, f in os.walk(dir_path):
       for filename in f: # iterate through /ms-xml/{version} folder
         # split folio by entry
         info = process_file(f'{dir_path}{filename}')
+        print(f"Loading folio {filename}...")
         for identity, text in info.items(): # add each entry to dictionary
           entry_dict[identity] = text
 
@@ -137,6 +139,7 @@ def generate_complete_manuscript(apply_corrections=True) -> Dict[str, Recipe]:
                                  old.versions['tl'] + '\n\n' + tl)
     else:
       entries[entry_id] = Recipe(entry_id, folio, tc, tcn, tl)
+    print(f"Generating Recipe object for {entry_id}...")
 
   # if specified, manually rewrite entry properties based on thesaurus.
   if apply_corrections:
 
@@ -36,8 +36,8 @@
 
 # %%
 # set up paths
-base = os.path.dirname(os.getcwd()) # .../m-k-manuscript-data/"
-entry_metadata_path = base + "/metadata/entry_metadata.csv"
+manuscript_data_path = os.path.dirname(os.getcwd()) + "/m-k-manuscript-data"
+entry_metadata_path = manuscript_data_path + "/metadata/entry_metadata.csv"
 output_path = "property_count.csv"
 
 # %%
 
@@ -18,8 +18,6 @@
 class Recipe:
 
     def __init__(self, identity: str, folio: str, tc: str, tcn: str, tl: str) -> None:
-
-        print(f"Loading entry with folio {folio}, ID {identity}")
 
         self.identity: str = identity # id of the entry
         self.folio: str = folio # folio of the entry
@@ -51,7 +49,7 @@ def find_categories(self, text: str) -> List[str]:
             return categories[0].split('"')[1].split(';')
         return []
 
-    def find_title(self, text: str) -> str:
+    def find_title(self, text: str, remove_del_text=False) -> str:
         """ 
         Use a regex to find text in between head tags. Specifying the version is not necessary since it is
         included in the dict comprehension statement where this function is called.
@@ -65,7 +63,10 @@ def find_title(self, text: str) -> str:
         text = re.sub(r'\s+', ' ', text.replace('\n', ' '))
 
         titles = re_head.search(text)
-        return '' if not titles else re_tags.sub('', titles[0])
+        if remove_del_text:
+            return '' if not titles else re.sub(r'\s+', ' ', re_tags.sub('', re.sub(r'<del>.*</del>', '', titles[0])))
+        else:
+            return '' if not titles else re_tags.sub('', titles[0])
 
     def clean_length(self, text: str) -> int:
         # TODO: make it word count instead of character count.
@@ -170,6 +171,16 @@ def find_captions(self, version: str) -> List[str]:
 
     def get_title(self, version: str = 'tl'):
+        """ Getter method for the title stored under the given version key ('tl' by default). """
         return self.title[version]
+        
+    def get_head(self, text: str) -> str:
+        """
+        Return the first re_head match found in text with all re_tags matches removed,
+        or an empty string when re_head does not match.
+        """
+        # editor supplied titles (<sup> tags) are marked with square brackets
+        bracketed = text.replace('<sup>', '[').replace('</sup>', ']')
+
+        match = re_head.search(bracketed)
+        return re_tags.sub('', match[0]) if match else ''
 
     def get_identity(self) -> str:
         """ Getter method for identity. """