Initial commit

MatMoore · Dec 24, 2022 · aa62c72 · aa62c72
1 parent 280225e
commit aa62c72
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -1,7 +1,13 @@
 # Anki deck analysis
 
+Scripts to analyze my Anki decks.
+
 ## Setup
 
 - `python -m venv env`
 - `source ./env/bin/activate`
 - `pip install -r requirements.txt`
+
+## Scripts
+
+- [hanja.py](./hanja.py) - find the most common Hanja characters in my Korean deck
diff --git a/hanja.py b/hanja.py
@@ -0,0 +1,36 @@
+"""
+Find the hanja characters that appear most often in the Sino-Korean words
+in my Korean Anki deck, and print the ten most common with their example words.
+"""
+from anki.collection import Collection
+from anki.notes import Note
+from collections import defaultdict
+
+words_by_hanja_character = defaultdict(list)  # hanja character -> [(korean, english), ...]
+
+COLLECTION_PATH = "/Users/mat/Library/Application Support/Anki2/User 1/collection.anki2"
+KOREAN_DECK = '1541100429539'
+# CJK Unified Ideographs block, U+4E00..U+9FFF; range() excludes its stop, hence 0xA000.
+COMMON_CJK_CHARACTERS = range(0x4E00, 0xA000)
+
+col = Collection(COLLECTION_PATH)
+try:  # make sure the collection is closed even if a note lacks an expected field
+  note_ids = col.find_notes('deck:Korean AND "note:korean comprehension + production"')
+  for note_id in note_ids:
+    note = Note(col=col, id=note_id)
+    items = dict(note.items())
+    hanja = items['Hanja'].strip()
+    korean = items['Korean']
+    english = items['English']
+    for character in hanja:
+      if ord(character) in COMMON_CJK_CHARACTERS:  # ignore anything outside the CJK block
+        words_by_hanja_character[character].append((korean, english))
+finally:
+  col.close()
+
+# Rank characters by the number of words collected for them, most frequent first.
+sorted_characters = sorted(words_by_hanja_character.items(), key=lambda item: len(item[1]), reverse=True)
+for character, examples in sorted_characters[:10]:
+  print(character)
+  print(examples)
+  print('')