diff --git a/README.rst b/README.rst index e87f161..9fea4ea 100644 --- a/README.rst +++ b/README.rst @@ -40,7 +40,7 @@ HTMLとスライドは `slides.takanory.net`_ で参照できます。 * Talk * en * `How to learn Japanese with Python `__ - * + * `Slides `__ - * `PyCon APAC 2025 参加報告会 `_ * Tokyo, Japan * 2025 Apr diff --git a/slides/20250516pyconus/code/JLPT_kanji.json b/slides/20250516pyconus/code/JLPT_kanji.json new file mode 100644 index 0000000..fbc74e0 --- /dev/null +++ b/slides/20250516pyconus/code/JLPT_kanji.json @@ -0,0 +1,7 @@ +{ + "1": "丁丑且丘丙丞丹乃之乏乙也亀井亘亜亥亦亨享亭亮仁仙仮仰企伊伍伎伏伐伯伴伶伽但佐佑佳併侃侍侑価侮侯侵促俊俗保修俳俵俸倉倖倣倫倭倹偏健偲偵偽傍傑傘催債傷僕僚僧儀儒償允充克免典兼冒冗冠冴冶准凌凜凝凡凪凱凶凸凹刀刃刈刑削剖剛剣剤剰創功劣励劾勁勅勘勧勲勺匁匠匡匿升卑卓博卯即却卸厄厘厳又及叔叙叡句只叶司吉后吏吐吟呂呈呉哀哉哲唄唆唇唯唱啄啓善喚喝喪喬嗣嘆嘉嘱器噴嚇囚圏圭坑坪垂垣執培基堀堅堕堤堪塀塁塊塑塚塾墓墜墨墳墾壁壇壊壌士壮壱奇奈奉奎奏契奔奨奪奮奴如妃妄妊妙妥妨姫姻姿威娠娯婆婿媒媛嫁嫌嫡嬉嬢孔孟孤宏宗宙宜宣宥宮宰宴宵寂寅密寛寡寧審寮寸射尉尋尚尭就尺尼尽尾尿屈展属履屯岐岬岳峠峡峰峻崇崎崚崩嵐嵩嵯嶺巌巡巣巧己巳巴巽帆帝帥帳幕幣幹幻幽庄序庶康庸廃廉廊廷弁弊弐弓弔弘弥弦弧張弾彗彦彩彪彫彬彰影往征径徐従循微徳徴徹忌忍志応忠怜怠怪恒恕恨恩恭恵悌悔悟悠悦悼惇惑惜惟惣惨惰愁愉愚慈態慎慕慢慧慨慮慰慶憂憤憧憩憲憶憾懇懐懲懸我戒戯房扇扉扱扶批抄把抑抗択披抵抹抽拍拐拒拓拘拙拠拡括拳拷挑挙振挿据捷捺授掌排控推措掲描提揚握揮援揺搬搭携搾摂摘摩撃撤撮撲擁操擦擬攻故敏救敢敦整敵敷斉斎斐斗斜斤斥於施旋旗既旦旨旬旭旺昂昆昌昭是昴晃晋晏晟晨晶智暁暇暉暑暖暢暦暫曙曹朋朔朕朗朱朴朽杉李杏杜条松析枠枢架柄柊某染柚柳柾栓栗栞株核栽桂桃案桐桑桜桟梅梓梢梧梨棄棋棚棟棺椋椎検椰椿楊楓楠楼概榛槙槻槽標模樹樺橘檀欄欣欺欽款歓殉殊殖殴殻毅毬氏汁汐江汰汽沖沙没沢沼沿泌泡泣泰洞津洪洲洵洸派浄浜浦浩浪浸涯淑淡淳添渇渉渋渓渚渥渦湧源溝滅滉滋滑滝滞漂漆漏漠漫漬漱漸潔潜潟潤潮澄澪激濁濫瀬災炉炊炎為烈焦煩煮熊熙熟燎燦燿爵爽爾牧牲犠狂狩独狭猛猟猪献猶猿獄獣獲玄率玖玲珠班琉琢琳琴瑚瑛瑞瑠瑳瑶璃環甚甫甲畔畝異疎疫疾症痘痢痴癒癖皇皐皓盆益盛盟監盤盲盾眉看眸眺眼睡督睦瞬瞭瞳矛矢矯砕砲硝硫碁碑碧碩磁磯礁礎祉祐祥票禄禅禍禎秀秘租秦秩称稀稔稚稜稲稼稿穀穂穏穣穫穴窃窒窮窯竜竣端笙笛第笹筋策箇節範篤簿粋粗粘粛糖糧系糾紀紋納紗紘級紛素紡索紫紬累紳紺絃結絞絢統絹継綜維綱網綸綺綾緊緋締緩緯縁縄縛縦縫縮繁繊織繕繭繰罰罷羅羊義翁翔翠翻翼耀耐耗耶聖聡聴肇肖肝肢肥肪肺胆胎胞胡胤胴脅脈脚脩脱脹腐腸膜膨臨臭至致興舌舎舗舜舶艇艦艶芋芙芝芳芹芽苑苗茂茄茅茉茎茜荘莉莞菊菌菖菫華萌萩葬葵蒔蒼蓄蓉蓮蔦蕉蕗薦薪薫藍藤藩藻蘭虎虐虚虜虞虹蚊蚕蛇蛍蛮蝶融衆街衛衝衡衰衷衿袈裁裂裕裟裸製褐褒襟襲覆覇視覧訂討託訟訳訴診証詐詔評詠詢詩該詳誇誉誓誕誘誠誼諄請諒諭諮諾謀謁謄謙謝謡謹譜譲護豆豚豪貞貢貫貴賀賃賄賊賓賜賠賦購赦赳赴趣距跳践踏躍軌軸較載輔輝輩轄辰辱迅迪迫迭透逐逓逝逮逸遂遇遍遣遥遭遮遵遷遺遼避還邑那邦邪邸郁郎郡郭郷酉酌酔酢酪酬酵酷酸醜醸采釈釣鈴鉛鉢銃銑銘銭鋳鋼錘錠錦錬錯鍛鎌鎖鎮鏡鐘鑑閑閣閥閲闘阻阿附陛陣陥陪陰陳陵陶隆隊随隔障隠隣隷隼雄雅雌雛離雰雷需霊霜霞霧露靖鞠韻響項須頌頑頒頻顕顧颯飢飼飽飾養餓馨駄駆駒駿騎騒騰驚髄鬼魁魂魅魔鮎鮮鯉鯛鯨鳩鳳鴻鵬鶏鶴鷹鹿麗麟麻麿黎黙黛鼓\n", + "2": 
"並丸久乱乳乾了介仏令仲伸伺低依個倍停傾像億兆児党兵冊再凍刊刷券刺則副劇効勇募勢包匹区卒協占印卵厚双叫召史各含周咲喫営団囲固圧坂均型埋城域塔塗塩境央奥姓委季孫宇宝寺封専将尊導届層岩岸島州巨巻布希帯帽幅干幼庁床底府庫延弱律復快恋患悩憎戸承技担拝拾挟捜捨掃掘採接換損改敬旧昇星普暴曇替札机材村板林枚枝枯柔柱査栄根械棒森植極橋欧武歴殿毒比毛氷永汗汚池沈河沸油況泉泊波泥浅浴涙液涼混清減温測湖湯湾湿準溶滴漁濃濯灯灰炭焼照燃燥爆片版玉珍瓶甘畜略畳療皮皿省県短砂硬磨祈祝祭禁秒移税章童競竹符筆筒算管築簡籍粉粒糸紅純細紹絡綿総緑線編練績缶署群羽翌耕肌肩肯胃胸脂脳腕腰膚臓臣舟航般芸荒荷菓菜著蒸蔵薄虫血衣袋被装裏補複角触訓設詞詰誌課諸講谷豊象貝貨販貯貿賞賢贈超跡踊軍軒軟軽輪輸辛農辺述逆造郊郵量針鈍鉄鉱銅鋭録門防陸隅階隻雇雲零震革順預領額香駐骨麦黄鼻齢\n", + "3": "与両乗予争互亡交他付件任伝似位余例供便係信倒候値偉側偶備働優光全共具内冷処列初判利到制刻割加助努労務勝勤化単危原参反収取受号合向君否吸吹告呼命和商喜回因困園在報増声変夢太夫失好妻娘婚婦存宅守完官定実客害容宿寄富寒寝察対局居差市師席常平幸幾座庭式引当形役彼徒得御必忘忙念怒怖性恐恥息悲情想愛感慣成戦戻所才打払投折抜抱押招指捕掛探支放政敗散数断易昔昨晩景晴暗暮曲更最望期未末束杯果格構様権横機欠次欲歯歳残段殺民求決治法泳洗活流浮消深済渡港満演点然煙熱犯状猫王現球産由申留番疑疲痛登皆盗直相眠石破確示礼祖神福科程種積突窓笑等箱米精約組経給絵絶続緒罪置美老耳職育背能腹舞船良若苦草落葉薬術表要規覚観解記訪許認誤説調談論識警議負財貧責費資賛越路辞込迎返迷追退逃途速連進遅遊過達違遠適選部都配酒閉関降限除険陽際雑難雪静非面靴頂頭頼顔願類飛首馬髪鳴\n", + "4": "不世主事京仕代以会住体作使借元兄公写冬切別力勉動医去口古台同味品員問図地堂場売夏夕多夜妹姉始字安室家少屋工帰広店度建弟強待心思急悪意手持教文料新方旅族早明映春昼曜有服朝業楽歌止正歩死注洋海漢牛物特犬理用田町画界病発目真着知研社私秋究空立答紙終習考者肉自色花英茶親言計試買貸質赤走起足転近送通週運道重野銀開院集青音題風飯飲館駅験魚鳥黒\n", + "5": "一七万三上下中九二五人今休何先入八六円出前北十千午半南友右名四国土外大天女子学小山川左年後日時書月木本来東校母毎気水火父生男白百聞行西見話語読車金長間雨電食高\n" +} \ No newline at end of file diff --git a/slides/20250516pyconus/code/kana2roman.py b/slides/20250516pyconus/code/kana2roman.py new file mode 100644 index 0000000..8910835 --- /dev/null +++ b/slides/20250516pyconus/code/kana2roman.py @@ -0,0 +1,15 @@ +import sys +import jaconv + +def kana2romaji(kana: str) -> str: + """Convert Hiragana and Katakana to Romaji""" + hiragana = jaconv.kata2hira(kana) # Katakana -> Hiragana + return jaconv.kana2alphabet(hiragana) # Hiragana -> alphabet + +def kana_with_romaji_ruby(kana: str) -> str: + """Add romaji ruby to Kana text""" + romaji = kana2romaji(kana) + return f"{kana}{romaji}" + +if __name__ == "__main__": + print(kana_with_romaji_ruby(sys.argv[1])) diff --git a/slides/20250516pyconus/code/kanji_reading.py b/slides/20250516pyconus/code/kanji_reading.py new file mode 100644 index 0000000..0033790 --- /dev/null +++ b/slides/20250516pyconus/code/kanji_reading.py @@ -0,0 +1,16 @@ +import sys +from jaconv import kata2hira 
+from sudachipy import Dictionary + +tokenizer = Dictionary().create() # create tokenizer + +def add_reading(text: str) -> str: + """Add Hiragana ruby to text""" + result = "" + for token in tokenizer.tokenize(text): + ruby = kata2hira(token.reading_form()) # to Hiragana + result += f"{token}{ruby}\n" + return result + +if __name__ == "__main__": + print(add_reading(sys.argv[1])) diff --git a/slides/20250516pyconus/code/kanji_reading_romaji.py b/slides/20250516pyconus/code/kanji_reading_romaji.py new file mode 100644 index 0000000..29f2e22 --- /dev/null +++ b/slides/20250516pyconus/code/kanji_reading_romaji.py @@ -0,0 +1,17 @@ +import sys +from jaconv import kata2alphabet +from sudachipy import Dictionary + +tokenizer = Dictionary().create() # create tokenizer + +def add_reading(text: str) -> str: + """Add Romaji ruby to text""" + result = "" + for token in tokenizer.tokenize(text): + # ruby = kata2hira(token.reading_form()) # to Hiragana + ruby = kata2alphabet(token.reading_form()) # to Alphabet(romaji) + result += f"{token}{ruby}\n" + return result + +if __name__ == "__main__": + print(add_reading(sys.argv[1])) diff --git a/slides/20250516pyconus/code/kanji_reading_with_level.py b/slides/20250516pyconus/code/kanji_reading_with_level.py new file mode 100644 index 0000000..ad72700 --- /dev/null +++ b/slides/20250516pyconus/code/kanji_reading_with_level.py @@ -0,0 +1,81 @@ +""" +usage: kanji_reading_with_level.py [-h] [-a] [-l {1,2,3,4,5}] text + +Add Furigana to Japanese text + +positional arguments: + text text to add furigana annotation + +options: + -h, --help show this help message and exit + -a Alphabet(Romaji) annotation(default: Hiragana) + -l {1,2,3,4,5} set kanji level +""" +import argparse +import json +import re + +from jaconv import kata2alphabet, kata2hira +from sudachipy import Dictionary + +KANJI = r"[\u3005-\u3007\u4E00-\u9FFF]" # Kanji pattern + +tokenizer = Dictionary().create() # create tokenizer + + +def get_kanji_set(level: str | None) -> 
set[str]: + """Returns a set of Kanji below the specified JLPT level""" + if level is None: + return set() + + with open("JLPT_kanji.json", encoding="utf-8") as f: + kanji_level_dict = json.load(f) + kanji_set = set() + for l, kanji_list in kanji_level_dict.items(): + if l >= level: + kanji_set.update(set(kanji_list)) + return kanji_set + + +def is_ruby_required(surface: str, kanji_set: set[str]) -> bool: + """Returns whether ruby is required""" + if not kanji_set: # no kanji set -> no level + return True + + kanji_in_surface = set(re.findall(KANJI, surface)) + if not kanji_in_surface: # word without kanji + return False + if kanji_in_surface <= kanji_set: # Kanji within the level + return False + return True + + +def add_reading(text: str, level: str | None, alphabet: bool): + """Add Furigana ruby to text""" + kanji_set = get_kanji_set(level) + result = "" + for token in tokenizer.tokenize(text): + reading = token.reading_form() + if is_ruby_required(str(token), kanji_set): + if alphabet: + ruby = kata2alphabet(reading) # to Alphabet + else: + ruby = kata2hira(reading) # to Hiragana + result += f"{token}{ruby}\n" + else: + result += f"{token}\n" + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Add Furigana to Japanese text") + parser.add_argument( + "-a", + action="store_true", + help="Alphabet(Romaji) annotation(default: Hiragana)", + ) + parser.add_argument("-l", choices="12345", help="set kanji level") + parser.add_argument("text", help="text to add furigana annotation") + args = parser.parse_args() + result = add_reading(args.text, args.l, args.a) + print(result) diff --git a/slides/20250516pyconus/code/make_jlpt_kanji_dict.py b/slides/20250516pyconus/code/make_jlpt_kanji_dict.py new file mode 100644 index 0000000..ae0fbb7 --- /dev/null +++ b/slides/20250516pyconus/code/make_jlpt_kanji_dict.py @@ -0,0 +1,13 @@ +import json +from urllib.request import urlopen + +BASE_URL = 
"https://raw.githubusercontent.com/obfusk/jiten/refs/heads/master/jiten/res/jlpt/" + +kanji = {} +for level in range(1, 6): # level 1 to 5 + with urlopen(f"{BASE_URL}N{level}-kanji") as f: + data = f.read().decode("utf-8") + kanji[level] = data + +with open("JLPT_kanji.json", "w", encoding="utf-8") as f: + json.dump(kanji, f, ensure_ascii=False, indent=2) diff --git a/slides/20250516pyconus/code/word_segmentation.py b/slides/20250516pyconus/code/word_segmentation.py new file mode 100644 index 0000000..e0fe8b9 --- /dev/null +++ b/slides/20250516pyconus/code/word_segmentation.py @@ -0,0 +1,14 @@ +import sys +from sudachipy import Dictionary + +tokenizer = Dictionary().create() + +def word_segmentation(text: str) -> str: + result = [] + for token in tokenizer.tokenize(text): + word = str(token) + result.append(word) + return " / ".join(result) + +if __name__ == "__main__": + print(word_segmentation(sys.argv[1])) diff --git a/slides/20250516pyconus/code/word_segmentation_with_ruby.py b/slides/20250516pyconus/code/word_segmentation_with_ruby.py new file mode 100644 index 0000000..d02fc9d --- /dev/null +++ b/slides/20250516pyconus/code/word_segmentation_with_ruby.py @@ -0,0 +1,15 @@ +import sys +from sudachipy import Dictionary +from kana2roman import kana_with_romaji_ruby + +tokenizer = Dictionary().create() + +def word_segmentation(text: str) -> str: + result = [] + for token in tokenizer.tokenize(text): + word = kana_with_romaji_ruby(str(token)) + result.append(word) + return " / ".join(result) + +if __name__ == "__main__": + print(word_segmentation(sys.argv[1])) diff --git a/slides/20250516pyconus/images/20250516pyconus.png b/slides/20250516pyconus/images/20250516pyconus.png new file mode 100644 index 0000000..ee50177 Binary files /dev/null and b/slides/20250516pyconus/images/20250516pyconus.png differ diff --git a/slides/20250516pyconus/images/ikebukuro.jpg b/slides/20250516pyconus/images/ikebukuro.jpg new file mode 100644 index 0000000..f75ca89 Binary files 
/dev/null and b/slides/20250516pyconus/images/ikebukuro.jpg differ diff --git a/slides/20250516pyconus/images/japanese-nlp.png b/slides/20250516pyconus/images/japanese-nlp.png new file mode 100644 index 0000000..ff90935 Binary files /dev/null and b/slides/20250516pyconus/images/japanese-nlp.png differ diff --git a/slides/20250516pyconus/images/pyconjp2025-in-hiroshima.jpg b/slides/20250516pyconus/images/pyconjp2025-in-hiroshima.jpg new file mode 100644 index 0000000..49849d7 Binary files /dev/null and b/slides/20250516pyconus/images/pyconjp2025-in-hiroshima.jpg differ diff --git a/slides/20250516pyconus/images/pyconus2025-logo.svg b/slides/20250516pyconus/images/pyconus2025-logo.svg new file mode 100644 index 0000000..9850fd9 --- /dev/null +++ b/slides/20250516pyconus/images/pyconus2025-logo.svg @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/slides/20250516pyconus/images/slides-takanory-net.png b/slides/20250516pyconus/images/slides-takanory-net.png new file mode 100644 index 0000000..c9b593e Binary files /dev/null and b/slides/20250516pyconus/images/slides-takanory-net.png differ diff --git a/slides/20250516pyconus/images/streamlit_demo.gif b/slides/20250516pyconus/images/streamlit_demo.gif new file mode 100644 index 0000000..e37366f Binary files /dev/null and b/slides/20250516pyconus/images/streamlit_demo.gif differ diff --git a/slides/20250516pyconus/index.md b/slides/20250516pyconus/index.md new file mode 100644 index 0000000..4ece3fd --- /dev/null +++ b/slides/20250516pyconus/index.md @@ -0,0 +1,1054 @@ +```{eval-rst} +:og:image: _images/20250516pyconus.png +:og:image:alt: How to learn Japanese with Python + +.. 
|cover| image:: images/20250516pyconus.png +``` + +# How to learn **Japanese** w/ **Python** + +Takanori Suzuki + +```{image} images/pyconus2025-logo.svg +:alt: PyCon US 2025 logo +:width: 50% +``` + +PyCon US 2025 / 2025 May 16 + +## Agenda ✅ + +* Background and Motivation / Goal +* Japanese is Difficult +* Python supports Japanese learning + +```{revealjs-notes} +* The agenda for this talk is as follows. +* I will talk about Background, Motivation and goal of this talk. +* Next, I will introduce some of the difficulties of the Japanese language. +* In the last part, I will explain how Python supports Japanese language learning, with example codes. +``` + +### PyCon US 2024 + +* **Lightning Talk** on the same idea +* I will talk in **more detail** + + + +## Background and Motivation 🏞️ + +```{revealjs-notes} +* Background and Motivation of this talk... +``` + +### Background and Motivation + +* Developing School **Textbook Web** at work + * Japanese NLP to make it **Easier to Learn** +* Python libs could help people **Learn Japanese** + +```{revealjs-notes} +* Background and Motivation of this talk... +* My team is developing web-based Textbook for junior high school students at work. +* I am using Japanese NLP to make the textbook easier to learn. +* Based on this experience, I thought Python libraries could help people learn Japanese. +``` + +### Background and Motivation(cont.) + +* [FSI language difficulty](https://www.fsi-language-courses.org/blog/fsi-language-difficulty/) + * Japanese is "**super-hard languages**" for English speakers to learn + * Category V* (More than 88 weeks) + +```{revealjs-notes} +* "FSI language difficulty" reports Japanese is ... +``` + +```{revealjs-break} +:notitle: +``` + +
Language difficulty rankings (for native English speakers)
byu/Homesanto inMapPorn
+ + +### Goal + +* What is **difficult** about Japanese +* How to use **Japanese NLP** libs and APIs +* How Python can support **Japanese learning** + +```{revealjs-notes} +* The goals of this talk are +* You know what is difficult about the Japanese language, +* You know how to use the Japanese NLP libraries and APIs and +* You understand how Python can support Japanese language +``` + +## Photos 📷 Share 🐦 👍 + +`#pyconus` / `@takanory` + +### [`slides.takanory.net`](https://slides.takanory.net) 💻 + +```{image} images/slides-takanory-net.png +:alt: slides.takanory.net +:width: 90% +``` + +```{revealjs-notes} +This slide has been published. +Please visit slides.takanory.net or via QR code and click on the "Slides" link! +``` + +## Who am I? 👤 + +```{revealjs-break} +:notitle: +``` + +- Takanori Suzuki / 鈴木 たかのり ({fab}`twitter` [@takanory](https://twitter.com/takanory)) +- [PyCon JP Association](https://www.pycon.jp/committee/english.html/): Chair +- [BeProud Inc.](https://www.beproud.jp/): Director / Python Climber +- [Python Boot Camp](https://www.pycon.jp/support/bootcamp.html), [Python mini Hack-a-thon](https://pyhack.connpass.com/), [Python Bouldering Club](https://kabepy.connpass.com/) +- Love: Ferrets, LEGO, 🍺 / Hobby: 🎺, 🧗‍♀️ + +![takanory profile](/assets/images/sokidan-square.jpg) +![kuro-chan and kuri-chan](/assets/images/kurokuri.jpg) + +```{revealjs-notes} +I'm Takanori Suzuki. My X(Twitter) is "takanory", please follow me. +I'm a Chair of PyCon JP Association. +And I'm a director of BeProud Inc and my title is "Python CLimber". +I'm also active in several Python related communities in Japan. +``` + +### PyCon JP Association + +{fas}`globe` [`www.pycon.jp`](https://www.pycon.jp/committee/english.html) + +**Nonprofit** organization for **Python users** in Japan, to **promote Python** and supports its development. Further it is **our goal** to hold an annual **PyCon JP**. + +```{revealjs-notes} +PyCon JP Association is a nonprofit... 
+We hold PyCon JP every year. +Do you know PyCon JP? +``` + +![PyCon JP Association](/assets/images/pyconjp_logo.png) + +### PyCon JP **2025** + +* {fas}`globe` [`2025.pycon.jp`](https://2025.pycon.jp/) +* Date: 2025 **Sep 26**(Fri)-**27**(Sat) +* Place: **Hiroshima**, Japan +* There are **English talks** + +```{image} images/pyconjp2025-in-hiroshima.jpg +:alt: PyCon JP 2025 in Hiroshima +:width: 60% +``` + +```{revealjs-notes} +PyCon JP 2025 will be held in Hiroshima at the end of September. +This is the first PyCon JP to be held outside of Tokyo. +Do you know Hiroshima?... +``` + +### Hiroshima? ⛩️ + +* Fukuoka - **Hiroshima** - Kyoto - Tokyo - Hokkaido +* [Direct flights to Hiroshima - HIJ, Japan](https://www.directflights.com/to/HIJ) + * Seoul, Taipei, Shanghai, Hong Kong, Dalian, Hanoi + +```{image} https://maps.directflights.com/directflights/800/HIJ.jpg +:alt: Direct flights to Hiroshima +:width: 40% +``` + +```{revealjs-notes} +Do you know Hiroshima?... +Hiroshima is west of Tokyo and Kyoto. +Hiroshima has several direct flights from overseas, but sorry, no direct flights from US. +``` + +## Questions {nekochan}`hai` + +### Have you **learned** Japanese? {nekochan}`study` + +### Are you **interested** in Japanese? {nekochan}`miru` + +### Would you like to **visit** Japan? {nekochan}`travel` + +```{revealjs-notes} +Almost everyone. +There is a very good opportunity for you... +``` + +### PyCon JP **2025** + +* 2025 **Sep 26**(Fri)-**27**(Sat) +* {nekochan}`yoshi;;;flip-horizontal` **Hiroshima**, Japan + +```{image} images/pyconjp2025-in-hiroshima.jpg +:alt: PyCon JP 2025 in Hiroshima +:width: 80% +``` + +```{revealjs-notes} +PyCon JP 2025 will be held in Hiroshima in September. +See you again at PyCon JP 2025. +``` + +## Japanese is **Difficult** {nekochan}`yabai` + +* **3 Types** of Characters +* **No Spaces** between Words +* **Multiple Readings** of Kanji + +```{revealjs-notes} +Back to the main topic. 
+I will show you 3 difficult points of the Japanese language. +``` + +### **3 Types** of Characters + +English | Peach(🍑) | Snake(🐍) +-- | -- | -- +Pronunciation | momo | hebi +Hiragana | もも | へび +Katakana | モモ | ヘビ +Kanji | 桃 | 蛇 + +```{revealjs-notes} +Japanese language has 3 types of characters: hiragana, katakana, and kanji. +This table shows 3 different characters for each peach and snake. +``` + +### **No Spaces** between Words + +すもももももももものうち su mo mo mo mo mo mo mo mo no u chi + +```{revealjs-break} +``` + +すもももももももものうち + +↓ + +すもも/も/もも/も/もも/の/うち + +"Plums and peaches are part of peaches" + +```{revealjs-notes} +This sentence "すもももももももものうち" is a play on words, but it is correct Japanese. +Most Japanese can correctly break this sentence into words. +``` + +### **Multiple Readings** of Kanji + +* **人**: person, people + +```{revealjs-notes} +For example, this kanji character means person and people. +``` + +```{revealjs-break} +``` + +* 2 **styles of readings** +* **Japanese**-style reading(訓読みkun yomi) +* **Chinese**-style reading(音読みon yomi) + +```{revealjs-notes} +Many kanji characters have 2 styles of readings. +``` + +```{revealjs-break} +``` + +* **人**: person, people +* **Japanese**-style reading: ひと(hito)、びと(bito) +* **Chinese**-style reading: じん(jin)、にん(nin) + +```{revealjs-notes} +The kanji character has a total of four different readings. +``` + +```{revealjs-break} +``` + +* Japanese-style reading: ひと(hito)、びと(bito) +* Chinese-style reading: じん(jin)、にん(nin) +* Can you read? + * 小人 (Small person) + * 日本人 (Japanese) + +```{revealjs-notes} +What do you think these idioms read? +``` + +```{revealjs-break} +``` + +* ko**bito** (Small person) + * Japanese-style reading: ひと(hito)、びと(bito) +* nihon**jin** (Japanese) + * Chinese-style reading: じん(jin)、にん(nin) + +```{revealjs-notes} +The 1st one is Japanese-style reading, "Kobito". +The 2nd one is Chinese-style reading, "Nihonjin". +``` + + +### Japanese is **Difficult**!! 
{nekochan}`scream` + +* **3 Types** of Characters +* **No Spaces** between Words +* **Multiple Readings** of Kanji + +```{revealjs-notes} +Japanese is Difficult!! But... +``` + +## {fab}`python` **Python** supports **Japanese** learning + +```{revealjs-notes} +We have Python!! +``` + +## **`<ruby>`** HTML Tag 💎 + +```{revealjs-notes} +I will explain the ruby tag before I talk about Python +``` + +### What is **Ruby** ? + +* ルビruby characters are **small annotation** [^ruby] +* Usually placed **above** the text +* (Not a Programming Language) + +[^ruby]: [Ruby character - Wikipedia](https://en.wikipedia.org/wiki/Ruby_character) + +### **`<ruby>`** HTML Tag 💎 + +* `<ruby>` represents **small annotations** [^ruby-tag] +* `<rt>` specifies the **ruby text** component + +PyConPython Conference +USUnited States +2025 + +```html +PyConPython Conference +USUnited States +2025 +``` + +[^ruby-tag]: [`<ruby>`: The Ruby Annotation element](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ruby) + +### Indicate **pronunciation** with `<ruby>` + +* **Alphabet** annotation: Pronunciation + +パイコンpa i ko n +あめりかa me ri ka +(PyCon America) + +```html +パイコンpa i ko n +あめりかa me ri ka +``` + +```{revealjs-notes} +This slide uses the ruby tag to indicate pronunciation with alphabets. +``` + +```{revealjs-break} +``` + +* **Hiragana** annotation: Readings +* ふりがなfu ri ga na + +アメリカあめりか +合衆国がっしゅうこく +(The United States of America) + +```html +アメリカあめりか +合衆国がっしゅうこく +``` + +```{revealjs-notes} +The ruby tag is also used for Furigana, the reading of other characters in hiragana. +``` + +### Figured out **`<ruby>`** Tag {nekochan}`naruhodo` + +```{revealjs-notes} +Now we have figured out the ruby tag, let's move on to Python. 
+``` + +## **Hiragana** and **Katakana** (あ / ア) + +Snake(🐍) / hebi / へび / ヘビ + +### **Hiragana** and **Katakana** + +* Hiragana and Katakana are **phonogram** +* 1 character represent a phoneme(speech sound) + * Like a Japanese **alphabet** +* Hiragana: あかさたな a ka sa ta na... +* Katakana: アカサタナ a ka sa ta na... + +```{revealjs-break} +``` + +* Basically use Hiragana + * あめりか a me ri ka (America) +* Katakana is used for foreign words + * パイコンpa i ko n (PyCon) + +### **Romanization** of Japanese (Romaji) + +* **Alphabet** to represent Japanese +* **Romaji** is often used on **Information Sign** + +![Ikebukuro station](images/ikebukuro.jpg) + +* Learn **Hiragana**/**Katakana** using Romaji + +### jaconv + +* [jaconv](https://github.com/ikegami-yukino/jaconv): interconverter for Hiragana, Katakana, alphabet and etc. + +```bash +$ python3.12 -m venv env +$ . env/bin/activate +(env) pip install jaconv +``` + +```pycon +>>> import jaconv +>>> jaconv.kana2alphabet("あめりか") # Hiragana -> alphabet +'amerika' +>>> jaconv.kata2alphabet("パイコン") # Katakana -> alphabet +'paikon' +``` + +### Add **Romaji** annotation + +kana2roman.py + +```{revealjs-literalinclude} code/kana2roman.py +:data-line-numbers: 2,4-7|9-12 +``` + +```{revealjs-break} +``` + +```bash +(env) $ python kana2roman.py "パイコン あめりか" +パイコン あめりかpaikon amerika +``` + +パイコン あめりかpaikon amerika + +### Can read **Hiragana** and **Katakana** {nekochan}`good` + +## **No Spaces** between Words + +すもももももももものうち su mo mo mo mo mo mo mo mo no u chi + +```{revealjs-break} +``` + +* Japanese has **no spaces** between words +* Use **Dictionary** to **Recognise** words +* Japanese **Morphological Analyzer** library required + +### Japanese **Morphological Analyzer** + +* see: {fab}`github` [taishi-i/awesome-japanese-nlp-resources](https://github.com/taishi-i/awesome-japanese-nlp-resources?tab=readme-ov-file#morphology-analysis) + +```{image} images/japanese-nlp.png +:alt: Japanese Morphological Analyzers +:width: 60% +``` + 
+```{revealjs-notes} +There are many morphological analyzer libraries for Japanese. +``` + +### Japanese **Morphological Analyzer** + +* SudachiPy: [pypi.org/project/SudachiPy](https://pypi.org/project/SudachiPy/) +* SudachiDict: [pypi.org/project/SudachiDict-core](https://pypi.org/project/SudachiDict-core/) + +```bash +(env) $ pip install sudachipy sudachidict_core +``` + +```{revealjs-notes} +In this case, I use SudachiPy and SudachiDict +``` + +### SudachiPy + +* Made with **Rust**, Very **Fast** +* **Three Types** of Dictionaries + * Small: small vocabulary + * **Core**: basic vocabulary (**default**) + * Full: miscellaneous proper nouns + +```{revealjs-notes} +SudachiPy is made with Rust and is very fast. +SudachiDict has three types of different dictionaries with different numbers of vocabularies. +Here I use core dictionary, the default. +``` + +### **Word Segmentation** + +* **Split** the words using **Dictionary** + +```pycon +>>> from sudachipy import Dictionary +>>> tokenizer = Dictionary().create() +>>> text = "すもももももももものうち" +>>> for token in tokenizer.tokenize(text): +... print(token) +... +すもも +も +もも +も +もも +の +うち +``` + +### **Word Segmentation** + +word_segmentation.py + +```{revealjs-literalinclude} code/word_segmentation.py +:data-line-numbers: 2,4|6-11 +``` + +```{revealjs-break} +``` + +```bash +(env) $ python word_segmentation.py すもももももももものうち +すもも / も / もも / も / もも / の / うち +``` + +すもも / も / もも / も / もも / の / うち + +* **Cannot read** Hiragana? + +### **Word Segmentation** with Romaji + +word_segmentation_with_ruby.py + +```{revealjs-literalinclude} code/word_segmentation_with_ruby.py +:data-line-numbers: 3,10 +``` + +```{revealjs-break} +``` + +```bash +(env) $ python word_segmentation_with_ruby.py すもももももももものうち +すももsumomo / mo / ももmomo / mo / ももmomo / no / うちuchi +``` + +すももsumomo / mo / ももmomo / mo / ももmomo / no / うちuchi + +### Can **split** into **Words** {nekochan}`clap` + +```{revealjs-notes} +You can correctly split Japanese text into words! 
+``` + +## **Multiple Readings** of Kanji + +kobito (Small person) + +nihonjin (Japanese) + +### **Multiple Readings** of Kanji + +* **人**: person, people +* 🇯🇵 **Japanese**-style reading(訓読みkun yomi): + * ひとhi toびとbi to +* 🇨🇳 **Chinese**-style reading(音読みon yomi): + * じんji nにんni n + +```{revealjs-break} +``` + +* 小**人** (Small person): + 🇯🇵 ko **びと**bi to +* 日本**人** (Japanese): + 🇨🇳 ni ほんho n **じん**jin + +### {nekochan}`pokan` + +```{revealjs-notes} +It's difficult, but not only this. +``` + +### Multiple Readings of **Kanji idioms** + +* Same combination but **different readings** +* **一人**: One person + * **一人** (One person) + * **一人**前 (One serving) + +```{revealjs-break} +``` + +* Same combination but **different readings** +* **一人**: One person + * **一人** (One person): **ひとり**hi to ri 🇯🇵 + * **一人**前 (One serving): **いちにん**i chi ni n まえma e 🇨🇳 + +### {nekochan}`yabai;1.5em` {nekochan}`yabai;1.5em` + +```{revealjs-notes} +Terrible... +And there is more... +``` + +### **Special readings** of Kanji idioms + +* 一 **人** (One person): **ひとり**hi to ri 🇯🇵 +* 二 **人** (Two people) +* 三 **人** (Three people) + +```{revealjs-break} +``` + +* 一 **人** (One person): **ひとり**hi to ri 🇯🇵 +* 二 **人** (Two people): **ふたり**fu ta ri 🇯🇵 +* 三 **人** (Three people): **さんにん**sa n ni n 🇨🇳 + +```{revealjs-notes} +These are special readings of Kanji idioms. +``` + +```{revealjs-break} +``` + +* Other special readings +* 大人: **おとな**o to na (Adult) +* 玄人: **くろうと**ku ro u to (Professional) +* 防人: **さきもり**sa ki mo ri (soldiers garrisoned at strategic posts in Kyushu in ancient times) + +### {nekochan}`scream;2em` {nekochan}`scream;2em` {nekochan}`scream;2em` + +```{revealjs-notes} +Oh my gosh +``` + +## Get **Reading** of Kanji + +一**人**one person日本**人**Japanese大**人**adult一**人**前one servingラーメン🍜食べるeat + +One Japanese adult eats one serving of ramen + +```{revealjs-notes} +There are 4 same kanji character, all with different readings. 
+``` + +### Get **Reading** of Kanji + +* Use **SudachiPy** and **SudachiDict** again +* `reading_form()`: Reading in Katakana + +```pycon +>>> from sudachipy import Dictionary +>>> tokenizer = Dictionary().create() # Make tokenizer +>>> text = "一人の日本人の大人が一人前のラーメンを食べる" +>>> for token in tokenizer.tokenize(text): # Word segmentation +... (str(token), token.reading_form()) # Get reading +... +('一人', 'ヒトリ') +('の', 'ノ') +('日本人', 'ニホンジン') +('の', 'ノ') +('大人', 'オトナ') +... +``` + +```{revealjs-break} +``` + +* Looks good {nekochan}`good` +* Cannot read **Katakana**? + +```pycon +('一人', 'ヒトリ') +('の', 'ノ') +('日本人', 'ニホンジン') +('の', 'ノ') +('大人', 'オトナ') +... +``` + +```{revealjs-break} +``` + +* Cannot read **Katakana**? Use **jaconv**! + +``` +>>> from jaconv import kata2hira, kata2alphabet +>>> for token in tokenizer.tokenize(text): +... reading = token.reading_form() +... hiragana = kata2hira(reading) # to Hiragana +... romaji = kata2alphabet(reading) # to Alphabet(romaji) +... (str(token), reading, hiragana, romaji) +... +('一人', 'ヒトリ', 'ひとり', 'hitori') +('の', 'ノ', 'の', 'no') +('日本人', 'ニホンジン', 'にほんじん', 'nihonjin') +('の', 'ノ', 'の', 'no') +('大人', 'オトナ', 'おとな', 'otona') +... +``` + +### Can get **Reading** to **Kanji** {nekochan}`yatta` + +### **Add Reading** to Kanji + +kanji_reading.py + +```{revealjs-literalinclude} code/kanji_reading.py +:data-line-numbers: 3,5|7-13 +``` + +```{revealjs-break} +``` + +一人ひとり + +日本人にほんじん + +大人おとな + +一人前いちにんまえ + +ラーメンらーめん + +食べるたべる + +```bash +(env) $ python kanji_reading.py 一人の日本人の大人が一人前のラーメンを食べる +一人ひとり + +日本人にほんじん + +大人おとな +... 
+``` + +```{revealjs-break} +``` + +kanji_reading_romaji.py + +```{revealjs-literalinclude} code/kanji_reading_romaji.py +:data-line-numbers: 11,12 +``` + +```{revealjs-break} +``` + +一人hitori +no +日本人nihonjin +no +大人otona +ga +一人前ichininmae +no +ラーメンraーmen +wo +食べるtaberu + +```bash +(env) $ python kanji_reading_romaji.py 一人の日本人の大人が一人前のラーメンを食べる +一人hitori +no +日本人nihonjin +no +大人otona +``` + +### Can read **Kanji** {nekochan}`medetai` + +## Kanji **level** support {nekochan}`tunda` + +### Kanji **level** support {nekochan}`tunda` + + * If you study Japanese, you may know the **JLPT** [^jlpt] +* JLPT has **N1**(difficult) ~ **N5**(easy) levels [^jlpt-level] + +```{image} https://www.jlpt.jp/e/resource/img_common/logo.gif +:alt: JLPT logo +:target: https://www.jlpt.jp/e/index.html +``` + +[^jlpt]: [What is the Japanese-Language Proficiency Test? Index | JLPT Japanese-Language Proficiency Test](https://www.jlpt.jp/e/about/index.html) +[^jlpt-level]: [N1-N5: Summary of Linguistic Competence Required for Each Level | JLPT Japanese-Language Proficiency Test](https://www.jlpt.jp/e/about/levelsummary.html) + +### Readings corresponding to **Kanji levels** {nekochan}`kamon` + +```{revealjs-notes} +I want to create readings corresponding to kanji levels. +``` + +### **Kanji list** for each level + +* [jiten](https://pypi.org/project/jiten/) has JLPT Kanji lists + * + +### Make JLPT **Kanji level dict** + +make_jlpt_kanji_dict.py + +```{revealjs-literalinclude} code/make_jlpt_kanji_dict.py +:language: python +:data-line-numbers: 2-10|1,12-13 +``` + +```{revealjs-break} +``` + +* Kanji dict is ready!! 
{nekochan}`naosu` + +```bash +% python make_jlpt_kanji_dict.py +``` + +```{revealjs-literalinclude} code/JLPT_kanji.json +``` + +### Get reading with **Kanji level** + +* `-a`: Alphabet annotation(default: Hiragana) +* `-l`: Kanji level option + +```text +% python kanji_reading_with_level.py -h +usage: kanji_reading_with_level.py [-h] [-a] [-l {1,2,3,4,5}] text + +Add Furigana to Japanese text + +positional arguments: + text text to add furigana annotation + +options: + -h, --help show this help message and exit + -a Alphabet(Romaji) annotation(default: Hiragana) + -l {1,2,3,4,5} set kanji level +``` + +```{revealjs-notes} +I have created an annotation script that supports Kanji levels. +``` + +```{revealjs-break} +``` + +```bash +% python kanji_reading_with_level.py 日本語を勉強する +``` + +日本語にほんご + +勉強べんきょう +するする +(default) + +```bash +% python kanji_reading_with_level.py -a 日本語を勉強する +``` + +日本語nihongo +wo +勉強benkyou +するsuru +(Alphabet(romaji)) + +```bash +% python kanji_reading_with_level.py -l 5 日本語を勉強する +``` + +日本語 +を +勉強べんきょう +する +(N5 level) + +```{revealjs-notes} +I will explain the code. +``` + +### Parse arguments + +* Process `-a` and `-l` with **argparse** +* Call `add_reading()` function + +```{revealjs-literalinclude} code/kanji_reading_with_level.py +:data-line-numbers: 70-79|80 +``` + +### Get **Kanji set** with level + +* Get Kanji set with `get_kanji_set(level)` + +```{revealjs-literalinclude} code/kanji_reading_with_level.py +:data-line-numbers: 53-55 +``` + +```{revealjs-notes} +In add_reading() func, to get the Kanji set, specify the level argument to the get_kanji_set() func. +``` + +```{revealjs-break} +``` + +* Load `"JLPT_kanji.json"` +* Create a Kanji set is **easier than level** + +```{revealjs-literalinclude} code/kanji_reading_with_level.py +:data-line-numbers: 26-32|33-37 +``` + +```{revealjs-notes} +get_kanji_set() func reads Kanjis per level from JSON. +Then Create a Kanji set that is easier than the level. +``` + +### Is **ruby required**? 
+ +* **Ruby / not Ruby** with `is_ruby_required()` + +```{revealjs-literalinclude} code/kanji_reading_with_level.py +:data-line-numbers: 53,57-59 +``` + +```{revealjs-notes} +Determine if ruby is required or not with the is_ruby_required() function for each token. +``` + +```{revealjs-break} +``` + +* Get all Kanjis -> `kanji_in_surface` +* Kanjis are **within** the level or **above** + +```{revealjs-literalinclude} code/kanji_reading_with_level.py +:data-line-numbers: 40-45|46-50 +``` + +```{revealjs-notes} +In the function, I get all Kanjis in the token and assign them to kanji_in_surface. +Next, determine if there is a Kanji and if Kanjis are within the level or above the level. +``` + +### Add **Ruby text** + +* `is_ruby_required() == True`: add Ruby +* `alphabet`: Alphabet or Hiragana(default) + +```{revealjs-literalinclude} code/kanji_reading_with_level.py +:data-line-numbers: 57-67|60-63 +``` + +```{revealjs-notes} +Finally, if is_ruby_required() is True, add ruby text. +And, depending on the value of alphabet, ruby is converted to alphabet or hiragana +``` + +### Can handle **Kanji level**!! {nekochan}`yatta` + +No Level: 日本語にほんご + +勉強べんきょう +するする + +N5: 日本語 +を +勉強べんきょう +する + +N4: 日本語 +を +勉強 +する + +```{revealjs-notes} +Now, We can handle Kanji level. +You can learn Kanji at your own Japanese level. +``` + +## **Sample** App {nekochan}`work` + +* {fab}`github` [learn_jp_pyconus.py](https://github.com/takanory/learn-jp-with-python/blob/main/learn_jp_pyconus.py) + +```bash +% git clone https://github.com/takanory/learn-jp-with-python.git +% cd learn-jp-with-python/ +% python3.12 -m venv env +% . 
env/bin/activate +(env) % pip install -r requirements.txt +(env) % streamlit run learn_jp_pyconus.py +``` + +```{revealjs-break} +:notitle: +``` + +```{image} images/streamlit_demo.gif +:width: 85% +``` + +## **Summary** {nekochan}`juutai` + +* Japanese is **Difficult** + * 3 Characters, No spaces, Kanji readings +* Python supports Japanese learning + * **jaconv**: Interconverter + * **SudachiPy**: Morphological analyzer +* **Kanji level** support + +## 🇯🇵 ❤️ {fab}`python` + +Learn **Japanese** with **Python** + +```{revealjs-notes} +You can learn Japanese with Python. +Please try to create your own Japanese learning tool! +``` + +## Thank you {nekochan}`pray` + +{fas}`desktop` [slides.takanory.net](https://slides.takanory.net/) +{fas}`code` [sample code](https://github.com/takanory/slides/tree/master/slides/20250516pyconus/code) + +{fab}`twitter` [takanory](https://twitter.com/takanory) +{fab}`github` [takanory](https://github.com/takanory/) +{fab}`linkedin` [takanory](https://www.linkedin.com/in/takanory/) +{fab}`untappd` [takanory](https://untappd.com/user/takanory/) + +![takanory profile](/assets/images/sokidan-square.jpg) +![kuro-chan and kuri-chan](/assets/images/kurokuri.jpg) diff --git a/slides/conf.py b/slides/conf.py index 955c59d..ec0249d 100644 --- a/slides/conf.py +++ b/slides/conf.py @@ -30,6 +30,7 @@ extensions = [ "myst_parser", "sphinx_revealjs", + "sphinx_revealjs.ext.footnotes", "sphinxext.opengraph", "sphinx_design", "atsphinx.audioplayer", @@ -66,6 +67,7 @@ # myst_enable_extensions = [ # "substitution", #] +myst_footnote_sort = False # -- for sphinxext-opengraph ogp_site_url = "https://slides.takanory.net/slides/"