Commit: Deployed 5aa853a with MkDocs version: 1.6.1
Showing 9 changed files with 338 additions and 19 deletions.
```diff
@@ -1,12 +1,12 @@
 include ../../../Makefile.in

-## should generate all blog materials
-generate-materials: setup-venv main-python clean-venv
+## generates all blog artifacts
+generate-artifacts: setup-venv main-python clean-venv

 ## generate figures
 main-python:
 	. ${VENV_NAME}/bin/activate && python main.py

 ##! clean generated files
-clean-generated-materials:
+clean-generated-artifacts:
 	rm -rf img/generated
```
```diff
@@ -0,0 +1,5 @@
+include ../../../Makefile.in
+
+generate-artifacts:
+
+clean-generated-artifacts:
```
@@ -0,0 +1,143 @@
```python
"""Download and display emoji (just to search and choose a few).

How to use (in a Jupyter notebook cell):

    eh = EmojiHandler(download=True)
    eh.show()
"""

import random
from pathlib import Path

import pandas as pd
import requests
from IPython.display import HTML, display


class EmojiHandler:

    def __init__(self, download=False, urls=None, data_dir="data"):
        self.data_dir = Path(data_dir)
        # There might be more files; these are the ones I know about.
        self.urls = urls or [
            "https://unicode.org/Public/emoji/latest/emoji-zwj-sequences.txt",
            "https://unicode.org/Public/emoji/latest/emoji-sequences.txt",
            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
            "https://unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt",
            "https://unicode.org/Public/16.0.0/ucd/emoji/emoji-variation-sequences.txt",
        ]

        if download:
            self.download_emoji_files()
        self._data = self.get_data()

    def download_emoji_files(self):
        self.data_dir.mkdir(parents=True, exist_ok=True)
        for url in self.urls:
            response = requests.get(url)
            response.raise_for_status()
            with open(self.data_dir / url.split("/")[-1], "w") as h:
                h.write(response.text)

    @property
    def count(self):
        return len(self._data)

    @property
    def data(self):
        return pd.DataFrame(self._data)

    def get_data(self):
        def expand_ranges(data):
            """Expand ranges like "1F600..1F64F"; every expanded code point
            keeps the name of its range."""

            def expand_range(lb_hex, ub_hex):
                lb = int(lb_hex, 16)
                ub = int(ub_hex, 16)
                return [hex(i)[2:].upper() for i in range(lb, ub + 1)]

            updated_data = []
            for d in data:
                if ".." in d["seq"]:
                    code_points = expand_range(*d["seq"].split(".."))
                    updated_data.extend(
                        [
                            {"seq": code_point, "name": d["name"]}
                            for code_point in code_points
                        ]
                    )
                else:
                    updated_data.append(d)
            return updated_data

        def parse_emoji_file_content(file_content):
            data = []
            for line in file_content.splitlines():
                if line and not line.startswith("#"):
                    line_split = line.split(";")
                    seq = line_split[0].strip()
                    name = line_split[-1].strip()
                    data.append({"seq": seq, "name": name})
            return data

        data = []
        for file in Path(self.data_dir).glob("**/*"):
            if not file.is_file():
                continue
            with open(file, "r") as h:
                file_content = h.read()
            data.extend(parse_emoji_file_content(file_content))

        return expand_ranges(data)

    def filter_data(self, keyword):
        df = self.data
        return df[df["name"].str.contains(keyword, case=False)].to_dict("records")

    @staticmethod
    def show_str(data: str, size=100):
        display(HTML(f'<span style="font-size: {size}px;">{data}</span>'))

    @staticmethod
    def form_zwj_emoji(seq):
        return "".join([chr(int(c, 16)) for c in seq.split(" ")])

    def show(self, n=50, filter=None, index=None, size=40):
        """Show n random emoji, or only the emoji with the given index."""
        data = self.filter_data(filter) if filter else self._data

        if len(data) == 0:
            print("No data")
            return

        if n > len(data):
            n = len(data)
            print("Limited to available data.")

        if index is not None:
            n = 1
            sample = [data[index]]  # wrap in a list so the rendering loop works
        else:
            sample = random.sample(data, n)

        table_html = '<table style="border-collapse: collapse;">'
        numb_cols = min(10, n)
        numb_rows = -(-n // numb_cols)  # ceiling division so no emoji are dropped
        k = 0
        for _ in range(numb_rows):
            table_html += "<tr>"
            for _ in range(numb_cols):
                if k >= n:
                    break
                s = self.form_zwj_emoji(sample[k]["seq"])
                table_html += f'<td style="font-size: {size}px; padding: 7px;">{s}</td>'
                k += 1
            table_html += "</tr>"

        table_html += "</table>"
        display(HTML(table_html))

    def __repr__(self):
        return f"[EmojiHandler] {self.count}"
```
@@ -0,0 +1,165 @@
```python
"""Using CPython is assumed.

Dirty numerical verification that my understanding of PEP 393 and the CPython
code is correct.
"""

import ctypes
import random
import sys
import unicodedata


class Pep393VerifyEncoding:
    """Numerically verify the encodings used in PEP 393.

    Warning
    -------
    This is a one-time test only. The code is not organised well!
    """

    def __init__(self, numb_tests=100000):
        self.numb_tests = numb_tests
        self.max_code_point = 1114112  # 0x10FFFF + 1

        self.data = {
            cp: unicodedata.category(chr(cp)) for cp in range(self.max_code_point)
        }
        self.categories = set(self.data.values())
        self.categories_to_include = self.categories - {"Cs"}

    def verify(self):
        self.verify_case1()
        self.verify_case2()
        self.verify_case3()
        self.verify_case4()
        self.verify_surrogate_points_fail_utf32()
        self.verify_nonsurrogate_points_ok_utf32()

    def verify_surrogate_points_fail_utf32(self):
        """Surrogate (Cs) code points are accepted by chr(), but a lone
        surrogate has no character associated with it, so the output of chr()
        cannot be trusted and encoding it with utf-32 must fail."""
        k = 0
        surrogate_code_points = self.get_surrogate_code_points()
        for cp in surrogate_code_points:
            try:
                self.verify_one_code_point(cp, "utf-32")
            except UnicodeEncodeError:
                k += 1
        assert k == len(surrogate_code_points)

    def verify_nonsurrogate_points_ok_utf32(self):
        for cp in self.get_nonsurrogate_code_points():
            self.verify_one_code_point(cp, "utf-32")

    def verify_one_code_point(self, code_point, encoding="utf-32"):
        character = chr(code_point)
        return character.encode(encoding)

    def get_surrogate_code_points(self):
        return [k for k, v in self.data.items() if v == "Cs"]

    def get_nonsurrogate_code_points(self):
        return [k for k, v in self.data.items() if v != "Cs"]

    def verify_case1(self):
        # Case 1: ASCII-only strings are stored one byte per character.
        range1 = self.filter_code_points(0, 2**7 - 1)
        range2 = self.filter_code_points(0, 2**7 - 1)

        for _ in range(self.numb_tests):
            i1 = random.sample(range1, 1)[0]
            i2 = random.sample(range2, 1)[0]
            s = chr(i1) + chr(i2)

            # [40:] skips the PyASCIIObject header, [:-1] drops the NUL terminator.
            e1 = self.memory_dump(s)[40:-1].hex()
            # utf-8 coincides with ascii in case 1
            e2 = s.encode("ascii").hex()
            e3 = s.encode("utf-8").hex()
            assert e1 == e2
            assert e1 == e3

    def verify_case2(self):
        # Case 2: strings whose widest code point is in [0x80, 0xFF] are still
        # stored one byte per character (latin-1).
        range1 = self.filter_code_points(0, 2**8 - 1)
        range2 = self.filter_code_points(2**7, 2**8 - 1)

        for _ in range(self.numb_tests):
            i1 = random.sample(range1, 1)[0]
            i2 = random.sample(range2, 1)[0]
            s = chr(i1) + chr(i2)

            e1 = self.memory_dump(s)[56:-1].hex()
            e2 = s.encode("latin-1").hex()
            e3 = s.encode("utf-16")[2:].hex()  # [2:] removes the BOM
            e3 = e3[:2] + e3[-4:-2]  # keep the low byte of each utf-16 code unit
            assert e1 == e2
            assert e1 == e3

    def verify_case3(self):
        # Case 3: strings whose widest code point is in [0x100, 0xFFFF] are
        # stored two bytes per character, i.e. utf-16 without BOM.
        range1 = self.filter_code_points(256, 2**16 - 1)
        range2 = self.filter_code_points(0, 2**16 - 1)

        for _ in range(self.numb_tests):
            i1 = random.sample(range1, 1)[0]
            i2 = random.sample(range2, 1)[0]
            s = chr(i1) + chr(i2)

            e1 = self.memory_dump(s)[56:-2].hex()
            e2 = s.encode("utf-16")[2:].hex()  # [2:] removes the BOM
            assert e1 == e2

    def verify_case4(self):
        # Case 4: strings with a code point above 0xFFFF are stored four bytes
        # per character, i.e. utf-32 without BOM. The lower bound must be 2**16
        # (not 2**16 - 1 = 0xFFFF, which is still a two-byte code point).
        range1 = self.filter_code_points(2**16, self.max_code_point)
        range2 = self.filter_code_points(0, self.max_code_point)

        for _ in range(self.numb_tests):
            i1 = random.sample(range1, 1)[0]
            i2 = random.sample(range2, 1)[0]
            s = chr(i1) + chr(i2)

            e1 = self.memory_dump(s)[56:-4].hex()
            e2 = s.encode("utf-32")[4:].hex()  # [4:] removes the BOM
            assert e1 == e2

    @staticmethod
    def memory_dump(s):
        # Assumes CPython: id() is the object's address. The header offsets used
        # above (40 for PyASCIIObject, 56 for PyCompactUnicodeObject) match the
        # 64-bit CPython 3.12 str layout and may differ in other versions.
        address_of_s = id(s)
        buffer_s = (ctypes.c_char * sys.getsizeof(s)).from_address(address_of_s)
        return bytes(buffer_s)

    def filter_code_points(self, lb=0, ub=None):
        """Filter code points in [lb, ub], excluding surrogates (Cs).

        https://www.compart.com/en/unicode/category
        """
        if ub is None:
            ub = self.max_code_point

        selected_code_points = []
        for cp, category in self.data.items():
            if category in self.categories_to_include:
                if lb <= cp <= ub:
                    selected_code_points.append(cp)

        return selected_code_points


def cast_back(x):
    return ctypes.cast(id(x), ctypes.py_object).value


def main():
    v = Pep393VerifyEncoding()
    v.verify()


if __name__ == "__main__":
    main()
```
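A quick companion check, independent of the memory dumps above: PEP 393's per-character width is visible from `sys.getsizeof` alone. A minimal sketch (CPython assumed; header sizes vary by version, but the per-character deltas do not — growing a string by 9 characters adds 9, 18, or 36 bytes):

```python
import sys

# PEP 393: CPython stores a str with 1, 2, or 4 bytes per character, chosen by
# the widest code point the string contains.
for label, ch in [
    ("case 1 (ASCII)", "a"),
    ("case 2 (latin-1)", "\xe9"),
    ("case 3 (2-byte)", "\u2020"),
    ("case 4 (4-byte)", "\U0001F600"),
]:
    delta = sys.getsizeof(ch * 10) - sys.getsizeof(ch)  # isolates per-char cost
    print(f"{label}: {delta // 9} byte(s) per character")

# And the surrogate corner case exercised by the script: chr() accepts a lone
# surrogate, but encoding it raises.
try:
    chr(0xD800).encode("utf-32")
except UnicodeEncodeError as exc:
    print("lone surrogate refused:", exc)
```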
```diff
@@ -1,8 +1,8 @@
 all:
 	@echo "Be reasonable! Check available targets."

-generate-materials:
-	cd 202411-summer-walking-challenge && make generate-materials
+generate-artifacts:
+	cd 202411-summer-walking-challenge && make generate-artifacts

-clean-generated-materials:
-	cd 202411-summer-walking-challenge && make clean-generated-materials
+clean-generated-artifacts:
+	cd 202411-summer-walking-challenge && make clean-generated-artifacts
```
Large diffs are not rendered by default.