From da9011004634a57b254ee93fbaa0ae35b4223720 Mon Sep 17 00:00:00 2001 From: Eric Kidd Date: Sat, 20 Apr 2024 15:15:18 -0400 Subject: [PATCH] python: Upgrade experiment for text cards The big changes here: - Parsing Kindle "My Clippings.txt" - Adding more context to clippings - Shorter explanations from GPT-3.5 --- .../make-text-cards-with-context.py | 269 ++++++++++++++++++ python-experiments/parse-kindle-clippings.py | 128 +++++++++ 2 files changed, 397 insertions(+) create mode 100644 python-experiments/make-text-cards-with-context.py create mode 100644 python-experiments/parse-kindle-clippings.py diff --git a/python-experiments/make-text-cards-with-context.py b/python-experiments/make-text-cards-with-context.py new file mode 100644 index 0000000..13fd6ad --- /dev/null +++ b/python-experiments/make-text-cards-with-context.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python +# +# Usage: +# python make-text-cards-with-context.py + +import csv +from dataclasses import asdict, dataclass +import json +import re +from typing import Dict, List, Optional +from unicodedata import normalize + +from dotenv import load_dotenv +from markdown import markdown +from openai import OpenAI + + +# Load environment variables. Create a file named `.env` in the same directory as this file +# and add the following line to it: +# +# OPENAI_API_KEY="your-api-key" +load_dotenv() + +def strip_brackets(s: str) -> str: + """Remove all brackets from a string.""" + return s.replace("[[", "").replace("]]", "") + +@dataclass(kw_only=True) +class Alignment: + """A bilingual sentence alignment. Technically either side may contain + multiple sentences. 
+ + Foreign expressions to be explained may be marked with [[...]].""" + foreign: str + native: str + + @staticmethod + def from_jsonl(path: str) -> List["Alignment"]: + """Load alignments from a file in JSONL format, where each + line looks like `{ "f": "foreign text", "n": "native text" }`.""" + alignments = [] + with open(path, "r", encoding="utf-8") as f: + for line in f.readlines(): + record = json.loads(line) + alignments.append(Alignment( + foreign=record["f"], + native=record["n"], + )) + return alignments + +@dataclass(kw_only=True) +class Card: + """An Anki card with optional context. + + Text will be interpreted as Markdown. The "Foreign" text may include [[ ]] + marks around phrases that should be explained.""" + ForeignCurr: str + NativeCurr: str + ForeignPrev: Optional[str] + NativePrev: Optional[str] + ForeignNext: Optional[str] + NativeNext: Optional[str] + Source: Optional[str] + Hint: Optional[str] + Notes: Optional[str] + + def from_alignments(prev: Optional[Alignment], curr: Alignment, next: Optional[Alignment], *, source: Optional[str] = None) -> "Card": + """Create a card from the current alignment and optional context.""" + return Card( + ForeignCurr=curr.foreign, + NativeCurr=curr.native, + ForeignPrev=prev.foreign if prev else None, + NativePrev=prev.native if prev else None, + ForeignNext=next.foreign if next else None, + NativeNext=next.native if next else None, + Source=source, + Hint=None, + Notes=None, + ) + + def to_anki_dict(self) -> Dict[str, str]: + """Convert the card to a dictionary suitable for writing to an Anki CSV.""" + d = {} + for field, value in asdict(self).items(): + if value is not None: + d[field] = markdown(value.replace("[[", "**").replace("]]", "**")) + return d + + def expressions_to_explain(self) -> List[str]: + """Return a list of expressions in the foreign text that should be explained.""" + return re.findall(r"\[\[(.*?)\]\]", self.ForeignCurr) + + def generate_explanations_for_note(self, client: OpenAI): + 
"""Generate explanations for the expressions to be explained.""" + to_explain = self.expressions_to_explain() + if not to_explain: + return + + # Only keep [[...]] expressions in self.ForeignCurr. + context = [] + if self.ForeignPrev: + context.append(strip_brackets(self.ForeignPrev)) + context.append(self.ForeignCurr) + if self.ForeignNext: + context.append(strip_brackets(self.ForeignNext)) + + # Build a Markdown template for the explanations, to be filled in by the + # LLM. + explanation_template = [] + for expression in to_explain: + explanation_template.append(f"- **{expression}:**") + + # Prompts. + system_message = """\ +You are a skilled language tutor helping an experienced language learner prepare +an Anki card. Your goal is to explain the meaning of the expressions marked with +[[ ]], as a Markdown list. Prefer simple translations where they exist, but give +longer explanations where necessary. Consider whether a marked expression might be +part of a larger idiom, and if so, explain the whole idiom in this context.""" + + prompt_1 = "Los polis nunca lo hubiesen reconocido, pero [[a veces]] parecían casi reacios a perseguirlo.\n\nExplain:\n\n- **a veces:**" + response_1 = { + "thinking": "**a veces** means \"sometimes\" here, so explain it with a direct translation.", + "explanations": "- **a veces:** Sometimes.", + } + prompt_2 = """Ni [[siquiera]] hay una gramola.\n\nExplain:\n\n- **siquiera:**""" + response_2 = { + "thinking": "**ni siquiera** means \"not even\" here, but **siquiera** can also mean \"even\", \"if only\" or \"at least\". This might be confusing, so let's clarify.""", + "explanations": """- **(ni) siquiera:** Not even. Also: + - _Siquiera pudieras llamar para avisar_ "**If only** you could call to let know." + - _¿Puedes intentar siquiera hacer algo hoy?_ "Can you **at least** try to do something today?" 
+ - _Ni siquiera lo intentes._ "**Don't even** try it.\"""" + } + prompt_3 = f"""\ +{" ".join(context)} + +Explain: + +{" ".join(explanation_template)}""" + print(f"Prompt: {prompt_3}", file=sys.stderr) + + # Declare the function that the model should call. + tools = [{ + "type": "function", + "function": { + "name": "add_explanations_to_card", + "description": "Add the explanation to the current card.", + "parameters": { + "type": "object", + "properties": { + "thinking": { + "type": "string", + "description": "Explain your thoughts about how to prepare this card briefly." + }, + "explanations": { + "type": "string", + "description": "If and only if any phrases are marked with [[ ]], this should paramater should be passed, containing a Markdown-formatted list explaining each phrase marked with [[ ]]. It should not contain explanations for any phrases not marked with [[ ]]. If a marked phrase can be explained by a simple definition in English, just give that. If it's more complicated, use a longer explanation." + }, + }, + "required": ["explanations"] + } + } + }] + + # Generate the explanations using GPT-3.5. + response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": prompt_1}, + {"role": "function", "name": "add_explanations_to_card", "content": json.dumps(response_1)}, + {"role": "user", "content": prompt_2}, + {"role": "function", "name": "add_explanations_to_card", "content": json.dumps(response_2)}, + {"role": "user", "content": prompt_3}, + ], + tools=tools, + tool_choice={"type": "function", "function": {"name": "add_explanations_to_card"}}, + ) + + # Extract the tool call from the response. + tool_calls = response.choices[0].message.tool_calls + assert len(tool_calls) == 1 + args = json.loads(tool_calls[0].function.arguments) + print(f"{json.dumps(args, indent=4)}", file=sys.stderr) + + # Add the explanations to the card. 
+ self.Notes = args["explanations"] + +def highlights_to_cards(highlights: List[str], alignments: List[Alignment], *, +source: Optional[str] = None) -> List[Card]: + """Our input is: + + - A list of foreign-language highlights, typically a single sentence. + - A list of bilingual alignments, where each alignment is a pair of sentences. + """ + + def to_key(s: str) -> str: + """Normalize a string for comparison.""" + return normalize("NFC", re.sub(r"\s+", "", s).replace("—", "")) + + foreign_to_alignments: Dict[str, (Optional[Alignment], Alignment, Optional[Alignment])] = {} + for i, alignment in enumerate(alignments): + if alignment.foreign not in foreign_to_alignments: + prev = alignments[i - 1] if i > 0 else None + curr = alignment + next = alignments[i + 1] if i < len(alignments) - 1 else None + foreign_to_alignments[to_key(alignment.foreign)] = (prev, curr, next) + + cards = [] + for highlight in highlights: + highlight_key = to_key(strip_brackets(highlight)) + if highlight_key in foreign_to_alignments: + prev, curr, next = foreign_to_alignments[highlight_key] + curr_with_brackets = Alignment(foreign=highlight, native=curr.native) + cards.append(Card.from_alignments(prev, curr_with_brackets, next, source=source)) + else: + print(f"WARNING: Couldn't find: {repr(highlight)}", file=sys.stderr) + + return cards + + +def highlights_and_alignments_to_csv(highlights_path: str, alignments_path: str, out_csv_path: str, *, deck: str, source: Optional[str] = None) -> None: + """Read in a file of highlights and a file of bilingual alignments and write + the generated cards to a CSV file.""" + + # Get our highlights. + with open(highlights_path, "r", encoding="utf-8-sig") as f: + highlights = f.read().strip().split("\n--\n") + if not highlights[-1]: + highlights.pop() + if highlights and highlights[-1].endswith("\n--"): + highlights[-1] = highlights[-1][:-3] + + # Get our alignments and generate cards. 
+ alignments = Alignment.from_jsonl(alignments_path) + cards = highlights_to_cards(highlights, alignments, source=source) + + # Generate explanations for the cards. + client = OpenAI() + for card in cards: + card.generate_explanations_for_note(client) + + # Write CSV correctly using a library. Note that Anki imports work much + # better if we provide a header. + with open(out_csv_path, "w", newline="") as f: + f.write(f"""#separator:Semicolon +#html:true +#notetype:Aligned Text +#deck:{deck} +#columns:""") + writer = csv.DictWriter(f, fieldnames=["ForeignCurr", "NativeCurr", "ForeignPrev", "NativePrev", "ForeignNext", "NativeNext", "Source", "Hint", "Notes"], delimiter=";") + writer.writeheader() + writer.writerows(card.to_anki_dict() for card in cards) + +# Command line entry point. +if __name__ == "__main__": + import sys + + if len(sys.argv) != 6: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + + deck = sys.argv[1] + source = sys.argv[2] + highlights_path = sys.argv[3] + alignments_path = sys.argv[4] + out_csv_path = sys.argv[5] + + highlights_and_alignments_to_csv(highlights_path, alignments_path, out_csv_path, deck=deck, source=source) diff --git a/python-experiments/parse-kindle-clippings.py b/python-experiments/parse-kindle-clippings.py new file mode 100644 index 0000000..26f187c --- /dev/null +++ b/python-experiments/parse-kindle-clippings.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +# +# Usage: python3 parse-kindle-clippings.py +# +# This script parses the clippings file from a Kindle device and prints out the +# highlights in a more readable format. +# +# Clippings have the following format: +# Book (Series) (Spanish Edition) (Author) +# - Votre surlignement sur la page 9 | emplacement 52-53 | Ajouté le samedi 30 mars 2024 23:26:32 +# +# Text text text +# and more quoted text. +# ========== +# +# The second line is too dependent on the language of the device, so we'll +# ignore it. +# +# The out file is stored with in the format: +# +# Quote 1. 
#   --
#   Quote 2.
#   --
#
# Note that some text in the output file may be surrounded by [[ and ]]. This
# is added later by hand and should be ignored when we're deciding whether
# a highlight is already in the output file.

import os
import re
import sys
from typing import List


class Highlight:
    """A single Kindle highlight."""

    # Index of the highlight within the clippings file; used to restore the
    # original order after deduplication.
    position: int
    title: str
    author: str
    text: str

    def __init__(self, *, position: int, title: str, author: str, text: str):
        self.position = position
        self.title = title
        self.author = author
        self.text = text


def parse_clippings_file(path: str) -> List[Highlight]:
    """Parse a Kindle "My Clippings.txt" file into deduplicated highlights.

    Highlights whose text is a substring of another highlight (Kindle saves a
    new clipping every time you extend a selection) are dropped, and the
    survivors are returned in original file order."""
    with open(path, encoding='utf-8-sig') as f:
        lines = f.readlines()

    highlights: List[Highlight] = []

    # Consume one record at a time.
    i = 0
    position = 0
    while i < len(lines):
        # The author is the _last_ parenthesized expression on the title line.
        title_author = lines[i].strip()
        if '(' in title_author:
            title, author = title_author.rsplit('(', 1)
            title = title.strip()
            author = author[:-1].strip()
        else:
            # FIX: robustness — a title line without "(Author)" used to raise
            # ValueError on the tuple unpack. Treat the whole line as title.
            title, author = title_author, ''
        print(f'Title: {repr(title)}')
        # Skip the title line, the locale-dependent metadata line, and the
        # blank line that follows them.
        i += 3

        # The text is everything until the next line of equals signs. But strip
        # leading and trailing whitespace, and convert whitespace to a single
        # space.
        text_lines = []
        while i < len(lines) and lines[i].strip() != '==========':
            text_lines.append(lines[i].strip())
            i += 1
        i += 1
        text = re.sub(r'\s+', ' ', ' '.join(text_lines)).strip()

        highlights.append(Highlight(position=position, title=title, author=author, text=text))
        position += 1

    # Now we need to deal with highlights that are subsets of other highlights.
    # We'll do this by sorting by length, descending, and then iterating through
    # the highlights and removing any that are substrings of ones we've already
    # seen.
    highlights.sort(key=lambda h: len(h.text), reverse=True)
    seen: set[str] = set()
    deduped_highlights: List[Highlight] = []
    for h in highlights:
        # Check against all the highlights we've already seen.
        if any(h.text in s for s in seen):
            continue
        seen.add(h.text)
        deduped_highlights.append(h)

    # Now sort by position to restore the original file order.
    deduped_highlights.sort(key=lambda h: h.position)
    return deduped_highlights


def write_highlights(highlights: List[Highlight], out_file: str):
    """Append new highlights to `out_file`, skipping ones already present.

    [[ ]] markers (added later by hand) are ignored when deciding whether a
    highlight is already known."""
    # First, we need to keep track of known highlights.
    known_highlights: set[str] = set()
    try:
        # FIX: read and write the output file explicitly as UTF-8; the input
        # is decoded as utf-8-sig, so relying on the platform default here
        # could corrupt or fail on non-ASCII highlights.
        with open(out_file, encoding='utf-8') as f:
            known_highlights_iter = (
                kh.replace('[[', '').replace(']]', '').strip()
                for kh in f.read().split('\n--\n')
            )
            known_highlights = set(kh for kh in known_highlights_iter if kh)
    except FileNotFoundError:
        pass

    # Then append the new highlights.
    with open(out_file, 'a', encoding='utf-8') as f:
        for h in highlights:
            if h.text in known_highlights:
                continue
            f.write(f'{h.text}\n--\n')


if __name__ == '__main__':
    if len(sys.argv) != 4:
        # FIX: restored the argument placeholders, which had been stripped
        # from the usage string.
        print('Usage: python3 parse-kindle-clippings.py <clippings.txt> <book-title> <out.txt>')
        sys.exit(1)

    clippings_file = sys.argv[1]
    book_title = sys.argv[2]
    out_file = sys.argv[3]

    highlights = parse_clippings_file(clippings_file)
    highlights = [h for h in highlights if h.title == book_title]
    print(f'Found {len(highlights)} highlights for {repr(book_title)}.')
    write_highlights(highlights, out_file)