-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
python: Upgrade experiment for text cards
The big changes here: - Parsing Kindle "My Clippings.txt" - Adding more context to clippings - Shorter explanations from GPT-3.5
- Loading branch information
Showing
2 changed files
with
397 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,269 @@ | ||
#!/usr/bin/env python | ||
# | ||
# Usage:
#   python make-text-cards-with-context.py <deck> <source-name> <input-highlights-file> <input-alignments-file> <output-csv-file>
|
||
import csv
import json
import re
import sys
from dataclasses import asdict, dataclass
from typing import Dict, List, Optional, Tuple
from unicodedata import normalize

from dotenv import load_dotenv
from markdown import markdown
from openai import OpenAI
|
||
|
||
# Load environment variables. Create a file named `.env` in the same directory as this file | ||
# and add the following line to it: | ||
# | ||
# OPENAI_API_KEY="your-api-key" | ||
load_dotenv() | ||
|
||
def strip_brackets(s: str) -> str:
    """Strip the `[[` / `]]` highlight markers from a string."""
    for marker in ("[[", "]]"):
        s = s.replace(marker, "")
    return s
|
||
@dataclass(kw_only=True) | ||
class Alignment: | ||
"""A bilingual sentence alignment. Technically either side may contain | ||
multiple sentences. | ||
Foreign expressions to be explained may be marked with [[...]].""" | ||
foreign: str | ||
native: str | ||
|
||
@staticmethod | ||
def from_jsonl(path: str) -> List["Alignment"]: | ||
"""Load alignments from a file in JSONL format, where each | ||
line looks like `{ "f": "foreign text", "n": "native text" }`.""" | ||
alignments = [] | ||
with open(path, "r", encoding="utf-8") as f: | ||
for line in f.readlines(): | ||
record = json.loads(line) | ||
alignments.append(Alignment( | ||
foreign=record["f"], | ||
native=record["n"], | ||
)) | ||
return alignments | ||
|
||
@dataclass(kw_only=True)
class Card:
    """An Anki card with optional context.

    Text will be interpreted as Markdown. The "Foreign" text may include [[ ]]
    marks around phrases that should be explained."""
    # Field names deliberately match the Anki note type's column names; they
    # are used verbatim as CSV headers by the exporter.
    ForeignCurr: str
    NativeCurr: str
    ForeignPrev: Optional[str]
    NativePrev: Optional[str]
    ForeignNext: Optional[str]
    NativeNext: Optional[str]
    Source: Optional[str]
    Hint: Optional[str]
    Notes: Optional[str]

    @staticmethod
    def from_alignments(prev: Optional[Alignment], curr: Alignment, next: Optional[Alignment], *, source: Optional[str] = None) -> "Card":
        """Create a card from the current alignment and optional context.

        BUG FIX: this was declared without @staticmethod. Calling it via the
        class (`Card.from_alignments(...)`) happened to work, but calling it
        on an instance would silently have passed the instance as `prev`.
        """
        return Card(
            ForeignCurr=curr.foreign,
            NativeCurr=curr.native,
            ForeignPrev=prev.foreign if prev else None,
            NativePrev=prev.native if prev else None,
            ForeignNext=next.foreign if next else None,
            NativeNext=next.native if next else None,
            Source=source,
            Hint=None,
            Notes=None,
        )

    def to_anki_dict(self) -> Dict[str, str]:
        """Convert the card to a dictionary suitable for writing to an Anki CSV.

        [[ ]] markers are rendered as Markdown bold (**...**), then each field
        is converted from Markdown to HTML. None-valued fields are omitted.
        """
        d = {}
        for field, value in asdict(self).items():
            if value is not None:
                d[field] = markdown(value.replace("[[", "**").replace("]]", "**"))
        return d

    def expressions_to_explain(self) -> List[str]:
        """Return a list of expressions in the foreign text that should be explained."""
        return re.findall(r"\[\[(.*?)\]\]", self.ForeignCurr)

    def generate_explanations_for_note(self, client: OpenAI):
        """Generate explanations for the expressions to be explained and store
        them in `self.Notes`. Does nothing if no [[ ]] markers are present.

        Uses two few-shot example exchanges plus a forced tool call so that
        the model's answer comes back as structured JSON."""
        to_explain = self.expressions_to_explain()
        if not to_explain:
            return

        # Build the context: the previous/next sentences with their markers
        # stripped, around the current sentence which keeps its [[ ]] markers
        # so the model knows which phrases to explain.
        context = []
        if self.ForeignPrev:
            context.append(strip_brackets(self.ForeignPrev))
        context.append(self.ForeignCurr)
        if self.ForeignNext:
            context.append(strip_brackets(self.ForeignNext))

        # Build a Markdown template for the explanations, to be filled in by the
        # LLM.
        explanation_template = []
        for expression in to_explain:
            explanation_template.append(f"- **{expression}:**")

        # Prompts.
        system_message = """\
You are a skilled language tutor helping an experienced language learner prepare
an Anki card. Your goal is to explain the meaning of the expressions marked with
[[ ]], as a Markdown list. Prefer simple translations where they exist, but give
longer explanations where necessary. Consider whether a marked expression might be
part of a larger idiom, and if so, explain the whole idiom in this context."""

        prompt_1 = "Los polis nunca lo hubiesen reconocido, pero [[a veces]] parecían casi reacios a perseguirlo.\n\nExplain:\n\n- **a veces:**"
        response_1 = {
            "thinking": "**a veces** means \"sometimes\" here, so explain it with a direct translation.",
            "explanations": "- **a veces:** Sometimes.",
        }
        prompt_2 = """Ni [[siquiera]] hay una gramola.\n\nExplain:\n\n- **siquiera:**"""
        response_2 = {
            "thinking": "**ni siquiera** means \"not even\" here, but **siquiera** can also mean \"even\", \"if only\" or \"at least\". This might be confusing, so let's clarify.""",
            "explanations": """- **(ni) siquiera:** Not even. Also:
- _Siquiera pudieras llamar para avisar_ "**If only** you could call to let know."
- _¿Puedes intentar siquiera hacer algo hoy?_ "Can you **at least** try to do something today?"
- _Ni siquiera lo intentes._ "**Don't even** try it.\""""
        }
        # NOTE(review): unlike the few-shot examples, this real prompt uses a
        # single "\n" (not "\n\n") around "Explain:" and joins the template
        # bullets with spaces rather than newlines — confirm this mismatch is
        # intentional before changing it, since it alters the model input.
        prompt_3 = f"""\
{" ".join(context)}
Explain:
{" ".join(explanation_template)}"""
        print(f"Prompt: {prompt_3}", file=sys.stderr)

        # Declare the function that the model should call.
        tools = [{
            "type": "function",
            "function": {
                "name": "add_explanations_to_card",
                "description": "Add the explanation to the current card.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "thinking": {
                            "type": "string",
                            "description": "Explain your thoughts about how to prepare this card briefly."
                        },
                        # NOTE(review): the description below contains garbled
                        # wording ("this should paramater should be passed");
                        # left byte-identical to preserve the exact prompt.
                        "explanations": {
                            "type": "string",
                            "description": "If and only if any phrases are marked with [[ ]], this should paramater should be passed, containing a Markdown-formatted list explaining each phrase marked with [[ ]]. It should not contain explanations for any phrases not marked with [[ ]]. If a marked phrase can be explained by a simple definition in English, just give that. If it's more complicated, use a longer explanation."
                        },
                    },
                    "required": ["explanations"]
                }
            }
        }]

        # Generate the explanations using GPT-3.5. The few-shot exchanges are
        # replayed as prior conversation turns before the real prompt.
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt_1},
                {"role": "function", "name": "add_explanations_to_card", "content": json.dumps(response_1)},
                {"role": "user", "content": prompt_2},
                {"role": "function", "name": "add_explanations_to_card", "content": json.dumps(response_2)},
                {"role": "user", "content": prompt_3},
            ],
            tools=tools,
            # Force the model to call our function rather than reply in prose.
            tool_choice={"type": "function", "function": {"name": "add_explanations_to_card"}},
        )

        # Extract the tool call from the response; we forced exactly one call.
        tool_calls = response.choices[0].message.tool_calls
        assert len(tool_calls) == 1
        args = json.loads(tool_calls[0].function.arguments)
        print(f"{json.dumps(args, indent=4)}", file=sys.stderr)

        # Add the explanations to the card.
        self.Notes = args["explanations"]
|
||
def highlights_to_cards(highlights: List[str], alignments: List[Alignment], *,
                        source: Optional[str] = None) -> List[Card]:
    """Match highlights to alignments and build cards with context.

    Our input is:
    - A list of foreign-language highlights, typically a single sentence.
    - A list of bilingual alignments, where each alignment is a pair of sentences.

    Highlights that cannot be matched (after normalization) are reported on
    stderr and skipped.
    """

    def to_key(s: str) -> str:
        """Normalize a string for comparison: NFC, no whitespace, no em dashes."""
        return normalize("NFC", re.sub(r"\s+", "", s).replace("—", ""))

    # Map each alignment's *normalized* foreign text to (prev, curr, next).
    # BUG FIX: the duplicate check previously tested the raw foreign text
    # against a dict keyed by to_key(...), so it never matched and later
    # duplicates silently overwrote earlier ones. Check the normalized key so
    # the first occurrence wins. Also: the value annotation used a plain
    # tuple of types, which is not valid typing syntax — use Tuple[...].
    foreign_to_alignments: Dict[str, Tuple[Optional[Alignment], Alignment, Optional[Alignment]]] = {}
    for i, alignment in enumerate(alignments):
        key = to_key(alignment.foreign)
        if key not in foreign_to_alignments:
            prev = alignments[i - 1] if i > 0 else None
            nxt = alignments[i + 1] if i < len(alignments) - 1 else None
            foreign_to_alignments[key] = (prev, alignment, nxt)

    cards = []
    for highlight in highlights:
        highlight_key = to_key(strip_brackets(highlight))
        if highlight_key in foreign_to_alignments:
            prev, curr, nxt = foreign_to_alignments[highlight_key]
            # Keep the hand-added [[ ]] markers from the highlight on the card.
            curr_with_brackets = Alignment(foreign=highlight, native=curr.native)
            cards.append(Card.from_alignments(prev, curr_with_brackets, nxt, source=source))
        else:
            print(f"WARNING: Couldn't find: {repr(highlight)}", file=sys.stderr)

    return cards
|
||
|
||
def highlights_and_alignments_to_csv(highlights_path: str, alignments_path: str, out_csv_path: str, *, deck: str, source: Optional[str] = None) -> None:
    """Read in a file of highlights and a file of bilingual alignments and write
    the generated cards to a CSV file.

    Args:
        highlights_path: Text file of highlights separated by lines of `--`.
        alignments_path: JSONL file of `{"f": ..., "n": ...}` records.
        out_csv_path: Destination CSV in Anki import format.
        deck: Anki deck name written into the CSV header.
        source: Optional source label stored on each card.
    """

    # Get our highlights. utf-8-sig tolerates a BOM left by Windows editors.
    with open(highlights_path, "r", encoding="utf-8-sig") as f:
        highlights = f.read().strip().split("\n--\n")
    if not highlights[-1]:
        highlights.pop()
    if highlights and highlights[-1].endswith("\n--"):
        highlights[-1] = highlights[-1][:-3]

    # Get our alignments and generate cards.
    alignments = Alignment.from_jsonl(alignments_path)
    cards = highlights_to_cards(highlights, alignments, source=source)

    # Generate explanations for the cards (network calls to the OpenAI API).
    client = OpenAI()
    for card in cards:
        card.generate_explanations_for_note(client)

    # Write CSV correctly using a library. Note that Anki imports work much
    # better if we provide a header.
    # BUG FIX: write explicitly as UTF-8 — the platform default encoding
    # would corrupt or reject non-ASCII card text on some systems.
    with open(out_csv_path, "w", newline="", encoding="utf-8") as f:
        # The "#columns:" line is deliberately left unterminated: DictWriter's
        # writeheader() completes it with the semicolon-joined field names.
        f.write(f"""#separator:Semicolon
#html:true
#notetype:Aligned Text
#deck:{deck}
#columns:""")
        writer = csv.DictWriter(f, fieldnames=["ForeignCurr", "NativeCurr", "ForeignPrev", "NativePrev", "ForeignNext", "NativeNext", "Source", "Hint", "Notes"], delimiter=";")
        writer.writeheader()
        writer.writerows(card.to_anki_dict() for card in cards)
|
||
# Command line entry point.
if __name__ == "__main__":
    import sys

    # Require exactly five arguments after the program name.
    if len(sys.argv) != 6:
        print(f"Usage: {sys.argv[0]} <deck> <source-name> <input-highlights-file> <input-alignments-file> <output-csv-file>")
        sys.exit(1)

    deck, source, highlights_path, alignments_path, out_csv_path = sys.argv[1:6]

    highlights_and_alignments_to_csv(highlights_path, alignments_path, out_csv_path, deck=deck, source=source)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
#!/usr/bin/env python3 | ||
# | ||
# Usage: python3 parse-kindle-clippings.py <path-to-clippings-file> <book-title> <out-file> | ||
# | ||
# This script parses the clippings file from a Kindle device and prints out the | ||
# highlights in a more readable format. | ||
# | ||
# Clippings have the following format: | ||
# Book (Series) (Spanish Edition) (Author) | ||
# - Votre surlignement sur la page 9 | emplacement 52-53 | Ajouté le samedi 30 mars 2024 23:26:32 | ||
# | ||
# Text text text | ||
# and more quoted text. | ||
# ========== | ||
# | ||
# The second line is too dependent on the language of the device, so we'll | ||
# ignore it. | ||
# | ||
# The out file is stored in the following format:
# | ||
# Quote 1. | ||
# -- | ||
# Quote 2. | ||
# -- | ||
# | ||
# Note that some text in the output file may be surrounded by [[ and ]]. This
# is added later by hand and should be ignored when we're deciding whether
# a highlight is already in the output file.
|
||
import os | ||
import re | ||
import sys | ||
from typing import List | ||
|
||
class Highlight:
    """A single highlight parsed from a Kindle "My Clippings.txt" file."""

    # Ordinal of the highlight within the clippings file (used for ordering).
    position: int
    # Book title and author, parsed from the record's first line.
    title: str
    author: str
    # The highlighted text itself, whitespace-normalized.
    text: str

    def __init__(self, *, position: int, title: str, author: str, text: str):
        """Store the keyword-only fields on the instance."""
        self.text = text
        self.author = author
        self.title = title
        self.position = position
|
||
def parse_clippings_file(path: str) -> List[Highlight]:
    """Parse a Kindle clippings file into a deduplicated list of Highlights.

    Each record has the form:

        Title (Series) (Author)
        - <metadata line, language-dependent; ignored>
        <blank line>
        highlight text (possibly several lines)
        ==========

    Highlights whose text is a substring of a longer highlight are dropped;
    the survivors are returned in file order.
    """
    # utf-8-sig strips the BOM that Kindle puts at the start of the file.
    with open(path, encoding='utf-8-sig') as f:
        lines = f.readlines()

    highlights: List[Highlight] = []

    # Consume one record at a time.
    i = 0
    position = 0
    while i < len(lines):
        # BUG FIX: tolerate stray blank lines between records (e.g. trailing
        # newlines at EOF) — the old code tried to parse them as title lines
        # and crashed unpacking rsplit().
        if not lines[i].strip():
            i += 1
            continue

        # The author is the _last_ parenthesized expression.
        title_author = lines[i].strip()
        if '(' in title_author:
            title, author = title_author.rsplit('(', 1)
            title = title.strip()
            author = author[:-1].strip()  # drop the trailing ')'
        else:
            # BUG FIX: a title line with no parenthesized author no longer
            # crashes; keep the whole line as the title.
            title, author = title_author, ''
        print(f'Title: {repr(title)}')
        # Skip the title line, the metadata line, and the blank separator.
        i += 3

        # The text is everything until the next line of equals signs. But strip
        # leading and trailing whitespace, and convert whitespace to a single
        # space.
        text_lines = []
        while i < len(lines) and lines[i].strip() != '==========':
            text_lines.append(lines[i].strip())
            i += 1
        i += 1  # skip the '==========' separator itself
        text = re.sub(r'\s+', ' ', ' '.join(text_lines)).strip()

        highlights.append(Highlight(position=position, title=title, author=author, text=text))
        position += 1

    # Now we need to deal with highlights that are subsets of other highlights.
    # We'll do this by sorting by length, descending, and then iterating through
    # the highlights and removing any that are substrings of ones we've already
    # seen.
    highlights.sort(key=lambda h: len(h.text), reverse=True)
    seen: set[str] = set()
    deduped_highlights: List[Highlight] = []
    for h in highlights:
        # Check against all the highlights we've already seen.
        if any(h.text in s for s in seen):
            continue
        seen.add(h.text)
        deduped_highlights.append(h)

    # Now sort by position to restore file order.
    deduped_highlights.sort(key=lambda h: h.position)
    return deduped_highlights
|
||
def write_highlights(highlights: List[Highlight], out_file: str):
    """Append new highlights to *out_file*, separated by `--` lines.

    Highlights already present in the file are skipped. Existing entries may
    have hand-added [[ ]] markers, so strip those before comparing — an
    annotated highlight must not be appended a second time.
    """
    # First, we need to keep track of known highlights.
    known_highlights: set[str] = set()
    try:
        # BUG FIX: read and write explicitly as UTF-8 — the platform default
        # encoding can mangle or reject non-ASCII highlight text (e.g. on
        # Windows, where the default is often a legacy code page).
        with open(out_file, encoding='utf-8') as f:
            known_highlights_iter = (
                kh.replace('[[', '').replace(']]', '').strip()
                for kh in f.read().split('\n--\n')
            )
            known_highlights = set(kh for kh in known_highlights_iter if kh)
    except FileNotFoundError:
        pass  # no output file yet: nothing is known

    # Then append the new highlights.
    with open(out_file, 'a', encoding='utf-8') as f:
        for h in highlights:
            if h.text in known_highlights:
                continue
            f.write(f'{h.text}\n--\n')
|
||
if __name__ == '__main__':
    # Expect exactly three arguments: clippings file, book title, out file.
    if len(sys.argv) != 4:
        print('Usage: python3 parse-kindle-clippings.py <path-to-clippings-file> <book-title> <out-file>')
        sys.exit(1)

    clippings_file, book_title, out_file = sys.argv[1:4]

    # Parse everything, then keep only the requested book's highlights.
    matching = [h for h in parse_clippings_file(clippings_file) if h.title == book_title]
    print(f'Found {len(matching)} highlights for {repr(book_title)}.')
    write_highlights(matching, out_file)