From da9011004634a57b254ee93fbaa0ae35b4223720 Mon Sep 17 00:00:00 2001 From: Eric Kidd Date: Sat, 20 Apr 2024 15:15:18 -0400 Subject: [PATCH] python: Upgrade experiment for text cards The big changes here: - Parsing Kindle "My Clippings.txt" - Adding more context to clippings - Shorter explanations from GPT-3.5 --- .../make-text-cards-with-context.py | 269 ++++++++++++++++++ python-experiments/parse-kindle-clippings.py | 128 +++++++++ 2 files changed, 397 insertions(+) create mode 100644 python-experiments/make-text-cards-with-context.py create mode 100644 python-experiments/parse-kindle-clippings.py diff --git a/python-experiments/make-text-cards-with-context.py b/python-experiments/make-text-cards-with-context.py new file mode 100644 index 0000000..13fd6ad --- /dev/null +++ b/python-experiments/make-text-cards-with-context.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python +# +# Usage: +# python make-text-cards-with-context.py + +import csv +from dataclasses import asdict, dataclass +import json +import re +from typing import Dict, List, Optional +from unicodedata import normalize + +from dotenv import load_dotenv +from markdown import markdown +from openai import OpenAI + + +# Load environment variables. Create a file named `.env` in the same directory as this file +# and add the following line to it: +# +# OPENAI_API_KEY="your-api-key" +load_dotenv() + +def strip_brackets(s: str) -> str: + """Remove all brackets from a string.""" + return s.replace("[[", "").replace("]]", "") + +@dataclass(kw_only=True) +class Alignment: + """A bilingual sentence alignment. Technically either side may contain + multiple sentences. 
+ + Foreign expressions to be explained may be marked with [[...]].""" + foreign: str + native: str + + @staticmethod + def from_jsonl(path: str) -> List["Alignment"]: + """Load alignments from a file in JSONL format, where each + line looks like `{ "f": "foreign text", "n": "native text" }`.""" + alignments = [] + with open(path, "r", encoding="utf-8") as f: + for line in f.readlines(): + record = json.loads(line) + alignments.append(Alignment( + foreign=record["f"], + native=record["n"], + )) + return alignments + +@dataclass(kw_only=True) +class Card: + """An Anki card with optional context. + + Text will be interpreted as Markdown. The "Foreign" text may include [[ ]] + marks around phrases that should be explained.""" + ForeignCurr: str + NativeCurr: str + ForeignPrev: Optional[str] + NativePrev: Optional[str] + ForeignNext: Optional[str] + NativeNext: Optional[str] + Source: Optional[str] + Hint: Optional[str] + Notes: Optional[str] + + def from_alignments(prev: Optional[Alignment], curr: Alignment, next: Optional[Alignment], *, source: Optional[str] = None) -> "Card": + """Create a card from the current alignment and optional context.""" + return Card( + ForeignCurr=curr.foreign, + NativeCurr=curr.native, + ForeignPrev=prev.foreign if prev else None, + NativePrev=prev.native if prev else None, + ForeignNext=next.foreign if next else None, + NativeNext=next.native if next else None, + Source=source, + Hint=None, + Notes=None, + ) + + def to_anki_dict(self) -> Dict[str, str]: + """Convert the card to a dictionary suitable for writing to an Anki CSV.""" + d = {} + for field, value in asdict(self).items(): + if value is not None: + d[field] = markdown(value.replace("[[", "**").replace("]]", "**")) + return d + + def expressions_to_explain(self) -> List[str]: + """Return a list of expressions in the foreign text that should be explained.""" + return re.findall(r"\[\[(.*?)\]\]", self.ForeignCurr) + + def generate_explanations_for_note(self, client: OpenAI): + 
"""Generate explanations for the expressions to be explained.""" + to_explain = self.expressions_to_explain() + if not to_explain: + return + + # Only keep [[...]] expressions in self.ForeignCurr. + context = [] + if self.ForeignPrev: + context.append(strip_brackets(self.ForeignPrev)) + context.append(self.ForeignCurr) + if self.ForeignNext: + context.append(strip_brackets(self.ForeignNext)) + + # Build a Markdown template for the explanations, to be filled in by the + # LLM. + explanation_template = [] + for expression in to_explain: + explanation_template.append(f"- **{expression}:**") + + # Prompts. + system_message = """\ +You are a skilled language tutor helping an experienced language learner prepare +an Anki card. Your goal is to explain the meaning of the expressions marked with +[[ ]], as a Markdown list. Prefer simple translations where they exist, but give +longer explanations where necessary. Consider whether a marked expression might be +part of a larger idiom, and if so, explain the whole idiom in this context.""" + + prompt_1 = "Los polis nunca lo hubiesen reconocido, pero [[a veces]] parecían casi reacios a perseguirlo.\n\nExplain:\n\n- **a veces:**" + response_1 = { + "thinking": "**a veces** means \"sometimes\" here, so explain it with a direct translation.", + "explanations": "- **a veces:** Sometimes.", + } + prompt_2 = """Ni [[siquiera]] hay una gramola.\n\nExplain:\n\n- **siquiera:**""" + response_2 = { + "thinking": "**ni siquiera** means \"not even\" here, but **siquiera** can also mean \"even\", \"if only\" or \"at least\". This might be confusing, so let's clarify.""", + "explanations": """- **(ni) siquiera:** Not even. Also: + - _Siquiera pudieras llamar para avisar_ "**If only** you could call to let know." + - _¿Puedes intentar siquiera hacer algo hoy?_ "Can you **at least** try to do something today?" 
+ - _Ni siquiera lo intentes._ "**Don't even** try it.\"""" + } + prompt_3 = f"""\ +{" ".join(context)} + +Explain: + +{" ".join(explanation_template)}""" + print(f"Prompt: {prompt_3}", file=sys.stderr) + + # Declare the function that the model should call. + tools = [{ + "type": "function", + "function": { + "name": "add_explanations_to_card", + "description": "Add the explanation to the current card.", + "parameters": { + "type": "object", + "properties": { + "thinking": { + "type": "string", + "description": "Explain your thoughts about how to prepare this card briefly." + }, + "explanations": { + "type": "string", + "description": "If and only if any phrases are marked with [[ ]], this should paramater should be passed, containing a Markdown-formatted list explaining each phrase marked with [[ ]]. It should not contain explanations for any phrases not marked with [[ ]]. If a marked phrase can be explained by a simple definition in English, just give that. If it's more complicated, use a longer explanation." + }, + }, + "required": ["explanations"] + } + } + }] + + # Generate the explanations using GPT-3.5. + response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": prompt_1}, + {"role": "function", "name": "add_explanations_to_card", "content": json.dumps(response_1)}, + {"role": "user", "content": prompt_2}, + {"role": "function", "name": "add_explanations_to_card", "content": json.dumps(response_2)}, + {"role": "user", "content": prompt_3}, + ], + tools=tools, + tool_choice={"type": "function", "function": {"name": "add_explanations_to_card"}}, + ) + + # Extract the tool call from the response. + tool_calls = response.choices[0].message.tool_calls + assert len(tool_calls) == 1 + args = json.loads(tool_calls[0].function.arguments) + print(f"{json.dumps(args, indent=4)}", file=sys.stderr) + + # Add the explanations to the card. 
+ self.Notes = args["explanations"] + +def highlights_to_cards(highlights: List[str], alignments: List[Alignment], *, +source: Optional[str] = None) -> List[Card]: + """Our input is: + + - A list of foreign-language highlights, typically a single sentence. + - A list of bilingual alignments, where each alignment is a pair of sentences. + """ + + def to_key(s: str) -> str: + """Normalize a string for comparison.""" + return normalize("NFC", re.sub(r"\s+", "", s).replace("—", "")) + + foreign_to_alignments: Dict[str, (Optional[Alignment], Alignment, Optional[Alignment])] = {} + for i, alignment in enumerate(alignments): + if alignment.foreign not in foreign_to_alignments: + prev = alignments[i - 1] if i > 0 else None + curr = alignment + next = alignments[i + 1] if i < len(alignments) - 1 else None + foreign_to_alignments[to_key(alignment.foreign)] = (prev, curr, next) + + cards = [] + for highlight in highlights: + highlight_key = to_key(strip_brackets(highlight)) + if highlight_key in foreign_to_alignments: + prev, curr, next = foreign_to_alignments[highlight_key] + curr_with_brackets = Alignment(foreign=highlight, native=curr.native) + cards.append(Card.from_alignments(prev, curr_with_brackets, next, source=source)) + else: + print(f"WARNING: Couldn't find: {repr(highlight)}", file=sys.stderr) + + return cards + + +def highlights_and_alignments_to_csv(highlights_path: str, alignments_path: str, out_csv_path: str, *, deck: str, source: Optional[str] = None) -> None: + """Read in a file of highlights and a file of bilingual alignments and write + the generated cards to a CSV file.""" + + # Get our highlights. + with open(highlights_path, "r", encoding="utf-8-sig") as f: + highlights = f.read().strip().split("\n--\n") + if not highlights[-1]: + highlights.pop() + if highlights and highlights[-1].endswith("\n--"): + highlights[-1] = highlights[-1][:-3] + + # Get our alignments and generate cards. 
+ alignments = Alignment.from_jsonl(alignments_path) + cards = highlights_to_cards(highlights, alignments, source=source) + + # Generate explanations for the cards. + client = OpenAI() + for card in cards: + card.generate_explanations_for_note(client) + + # Write CSV correctly using a library. Note that Anki imports work much + # better if we provide a header. + with open(out_csv_path, "w", newline="") as f: + f.write(f"""#separator:Semicolon +#html:true +#notetype:Aligned Text +#deck:{deck} +#columns:""") + writer = csv.DictWriter(f, fieldnames=["ForeignCurr", "NativeCurr", "ForeignPrev", "NativePrev", "ForeignNext", "NativeNext", "Source", "Hint", "Notes"], delimiter=";") + writer.writeheader() + writer.writerows(card.to_anki_dict() for card in cards) + +# Command line entry point. +if __name__ == "__main__": + import sys + + if len(sys.argv) != 6: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + + deck = sys.argv[1] + source = sys.argv[2] + highlights_path = sys.argv[3] + alignments_path = sys.argv[4] + out_csv_path = sys.argv[5] + + highlights_and_alignments_to_csv(highlights_path, alignments_path, out_csv_path, deck=deck, source=source) diff --git a/python-experiments/parse-kindle-clippings.py b/python-experiments/parse-kindle-clippings.py new file mode 100644 index 0000000..26f187c --- /dev/null +++ b/python-experiments/parse-kindle-clippings.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +# +# Usage: python3 parse-kindle-clippings.py +# +# This script parses the clippings file from a Kindle device and prints out the +# highlights in a more readable format. +# +# Clippings have the following format: +# Book (Series) (Spanish Edition) (Author) +# - Votre surlignement sur la page 9 | emplacement 52-53 | Ajouté le samedi 30 mars 2024 23:26:32 +# +# Text text text +# and more quoted text. +# ========== +# +# The second line is too dependent on the language of the device, so we'll +# ignore it. +# +# The out file is stored with in the format: +# +# Quote 1. 
#   --
#   Quote 2.
#   --
#
# Note that some text in the output file may be surrounded by [[ and ]]. This
# is added later by hand and should be ignored when we're deciding whether
# a highlight is already in the output file.

import os
import re
import sys
from typing import List


class Highlight:
    """A single Kindle highlight."""

    # Index of the highlight within the clippings file; used to restore the
    # original order after deduplication.
    position: int
    title: str
    author: str
    text: str

    def __init__(self, *, position: int, title: str, author: str, text: str):
        self.position = position
        self.title = title
        self.author = author
        self.text = text


def parse_clippings_file(path: str) -> List[Highlight]:
    """Parse a Kindle "My Clippings.txt" file into deduplicated highlights.

    Highlights whose text is a substring of another highlight (Kindle saves a
    new clipping every time you extend a selection) are dropped, and the
    survivors are returned in original file order."""
    with open(path, encoding='utf-8-sig') as f:
        lines = f.readlines()

    highlights: List[Highlight] = []

    # Consume one record at a time.
    i = 0
    position = 0
    while i < len(lines):
        # The author is the _last_ parenthesized expression on the title line.
        title_author = lines[i].strip()
        if '(' in title_author:
            title, author = title_author.rsplit('(', 1)
            title = title.strip()
            author = author[:-1].strip()
        else:
            # FIX: robustness — a title line without "(Author)" used to raise
            # ValueError on the tuple unpack. Treat the whole line as title.
            title, author = title_author, ''
        print(f'Title: {repr(title)}')
        # Skip the title line, the locale-dependent metadata line, and the
        # blank line that follows them.
        i += 3

        # The text is everything until the next line of equals signs. But strip
        # leading and trailing whitespace, and convert whitespace to a single
        # space.
        text_lines = []
        while i < len(lines) and lines[i].strip() != '==========':
            text_lines.append(lines[i].strip())
            i += 1
        i += 1
        text = re.sub(r'\s+', ' ', ' '.join(text_lines)).strip()

        highlights.append(Highlight(position=position, title=title, author=author, text=text))
        position += 1

    # Now we need to deal with highlights that are subsets of other highlights.
    # We'll do this by sorting by length, descending, and then iterating through
    # the highlights and removing any that are substrings of ones we've already
    # seen.
    highlights.sort(key=lambda h: len(h.text), reverse=True)
    seen: set[str] = set()
    deduped_highlights: List[Highlight] = []
    for h in highlights:
        # Check against all the highlights we've already seen.
        if any(h.text in s for s in seen):
            continue
        seen.add(h.text)
        deduped_highlights.append(h)

    # Now sort by position to restore the original file order.
    deduped_highlights.sort(key=lambda h: h.position)
    return deduped_highlights


def write_highlights(highlights: List[Highlight], out_file: str):
    """Append new highlights to `out_file`, skipping ones already present.

    [[ ]] markers (added later by hand) are ignored when deciding whether a
    highlight is already known."""
    # First, we need to keep track of known highlights.
    known_highlights: set[str] = set()
    try:
        # FIX: read and write the output file explicitly as UTF-8; the input
        # is decoded as utf-8-sig, so relying on the platform default here
        # could corrupt or fail on non-ASCII highlights.
        with open(out_file, encoding='utf-8') as f:
            known_highlights_iter = (
                kh.replace('[[', '').replace(']]', '').strip()
                for kh in f.read().split('\n--\n')
            )
            known_highlights = set(kh for kh in known_highlights_iter if kh)
    except FileNotFoundError:
        pass

    # Then append the new highlights.
    with open(out_file, 'a', encoding='utf-8') as f:
        for h in highlights:
            if h.text in known_highlights:
                continue
            f.write(f'{h.text}\n--\n')


if __name__ == '__main__':
    if len(sys.argv) != 4:
        # FIX: restored the argument placeholders, which had been stripped
        # from the usage string.
        print('Usage: python3 parse-kindle-clippings.py <clippings.txt> <book-title> <out.txt>')
        sys.exit(1)

    clippings_file = sys.argv[1]
    book_title = sys.argv[2]
    out_file = sys.argv[3]

    highlights = parse_clippings_file(clippings_file)
    highlights = [h for h in highlights if h.title == book_title]
    print(f'Found {len(highlights)} highlights for {repr(book_title)}.')
    write_highlights(highlights, out_file)