From e301f32a77e6f605f258493df764ae89a0eb2cee Mon Sep 17 00:00:00 2001
From: Kevin Stadler <kevin.stadler@oeaw.ac.at>
Date: Wed, 13 Nov 2024 15:37:44 +0100
Subject: [PATCH] fix: improve openrefine to baserow export

---
 scripts/tsv-to-json.py | 67 ++++++++++++++++++++++++++++++++----------
 1 file changed, 52 insertions(+), 15 deletions(-)

diff --git a/scripts/tsv-to-json.py b/scripts/tsv-to-json.py
index 52e9af2..b5072d5 100755
--- a/scripts/tsv-to-json.py
+++ b/scripts/tsv-to-json.py
@@ -24,7 +24,7 @@
 )
 parser.add_argument(
     "-input",
-    default="thb-20241031.tsv",
+    default="thb-20241113.tsv",
     help="the tsv file exported from OpenRefine (default: %(default)s)",
 )
 
@@ -112,13 +112,13 @@ def getn(n, ss):
     return best
 
 
-def yes_no_maybe(val):
+def yes_no_whatever(val):
     if val == "":
-        return "no"
+        return "nein"
     elif val.lower() == "x":
-        return "yes"
+        return "ja"
     else:
-        return "maybe"
+        return val
 
 
 def origtitle(pub, i):
@@ -155,6 +155,11 @@ def workkey(pub, i):
         bwkey = workkey(pub, i)
         if bwkey:
             origt = origtitle(pub, i)
+            # TODO also count brackets in translated title
+            if origt.count("(") != origt.count(")"):
+                logger.warning(
+                    f'{pub["Signatur"]}: unmatched parentheses in original work title "{origt}"'
+                )
             # store for 2nd pass
             pub["origworks"].append(origt)
 
@@ -236,7 +241,25 @@ def workkey(pub, i):
 
 # infer unique categories
 for k, v in bernhardworks.items():
-    v["title"] = Counter(v["titles"]).most_common(1)[0][0]
+    title_counter = Counter(v["titles"])
+    # print(title_counter)
+    v["title"] = title_counter.most_common(1)[0][0]
+    if any(
+        (
+            v["title"].startswith(prefix)
+            for prefix in [
+                "Watten",
+                "Der Keller",
+                "Minetti",
+                "Der Atem",
+                "Die Kälte",
+                "Die Rosen",
+            ]
+        )
+    ):
+        # for these works, choose the 2nd most frequent title instead
+        v["title"] = title_counter.most_common(2)[1][0]
+
     v["short_title"] = ""
     if "(" in v["title"]:
         # cut off before '()' # FIXME bwkey should be the shortened thing
@@ -307,6 +330,7 @@ def workkey(pub, i):
             # 'work': work['id'],
             "translators": worktranslators,  # [ t['id'] for t in worktranslators ],
             "title": t.replace("\n", " "),
+            "work_display_title": "",
         }
         if pub[orig(i + 1)] != work["title"]:
             logger.info(
@@ -330,12 +354,14 @@ def workkey(pub, i):
 
     eltern = [el.strip() for el in pub["Eltern"].split(" \\ ")] if pub["Eltern"] else []
 
+    year_display = pub["year"]
     try:
-        int(pub["year"])
+        year = int(pub["year"])
+        # year parsing succeeded, no need to store string representation
+        year_display = ""
     except ValueError:
-        logger.warning(
-            f"{pub['Signatur']} does not have a numeric year ('{pub['year']}')"
-        )
+        logger.info(f"{pub['Signatur']} does not have a numeric year ('{pub['year']}')")
+        year = int(pub["year"][0:4])
 
     assets = (
         pub["Signatur"]
@@ -343,7 +369,10 @@ def workkey(pub, i):
         else ""
     )
     if len(pub["more"]):
-        assets += " " + " ".join([name for name in pub["more"].split(", ")])
+        if " \\ " in pub["more"]:
+            assets += " " + " ".join([name for name in pub["more"].split(" \\ ")])
+        else:
+            assets += " " + " ".join([name for name in pub["more"].split(", ")])
 
     publisher = pub["publisher / publication"]
     publication_details = ""
@@ -359,17 +388,17 @@ def workkey(pub, i):
         "erstpublikation": pub["EP?"].lower() == "x",
         "parents": eltern,
         "title": pub["title"],
-        "year": int(pub["year"][0:4]),
-        "year_display": pub["year"],
+        "year": year,
+        "year_display": year_display,
         "language": pub["language"],
         "contains": ts,
         "publisher": publisher,
         "publication_details": publication_details,
         "isbn": pub["ISBN"],
-        "exemplar_suhrkamp_berlin": yes_no_maybe(
+        "exemplar_suhrkamp_berlin": yes_no_whatever(
             pub["Exemplar Suhrkamp Berlin (03/2023)"]
         ),
-        "exemplar_oeaw": yes_no_maybe(pub["Exemplar ÖAW"]),
+        "exemplar_oeaw": yes_no_whatever(pub["Exemplar ÖAW"]),
         "original_publication": pub["rev. translation, originally published as"],
         "zusatzinfos": pub["zusatzinfos"],
         "images": assets,
@@ -398,3 +427,11 @@ def dump_dict(dct, name):
 dump_dict(translations, "Übersetzung")
 dump_dict(bernhardworks, "BernhardWerk")
 dump_dict(translators, "Übersetzer")
+
+# # dump publications where the order of contains does not follow the ascending ids
+# for p in publications.values():
+#     cur = p["contains"][0]
+#     for nxt in p["contains"]:
+#         if nxt < cur:
+#             print(p)
+#         cur = nxt