Skip to content

Commit c75c712

Browse files
author
dimitrod
committed
Deployed 5aa853a with MkDocs version: 1.6.1
1 parent 13fb1b3 commit c75c712

File tree

9 files changed

+338
-19
lines changed

9 files changed

+338
-19
lines changed
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
include ../../../Makefile.in
22

3-
## should generate all blog materials
4-
generate-materials: setup-venv main-python clean-venv
3+
## generates all blog artifacts
4+
generate-artifacts: setup-venv main-python clean-venv
55

66
## generate figures
77
main-python:
88
. ${VENV_NAME}/bin/activate && python main.py
99

1010
##! clean generated files
11-
clean-generated-materials:
11+
clean-generated-artifacts:
1212
rm -rf img/generated

blog/202412-python-strings/Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
include ../../../Makefile.in
2+
3+
# Empty target: no prerequisites and no recipe are defined here.
generate-artifacts:
4+
5+
# Empty target: no prerequisites and no recipe are defined here.
clean-generated-artifacts:

blog/202412-python-strings/emoji.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
"""Download and display emoji (just to search and choose a few).
2+
3+
How to use (in a jupyter notebook cell):
4+
5+
```python
6+
eh = EmojiHandler(download=True)
7+
eh.show()
8+
```
9+
10+
"""
11+
12+
import random
13+
from pathlib import Path
14+
15+
import pandas as pd
16+
import requests
17+
from IPython.display import HTML, display
18+
19+
20+
class EmojiHandler:
    """Download Unicode emoji data files and display emoji in a notebook.

    Parameters
    ----------
    download
        When true, fetch every URL in ``urls`` into ``data_dir`` before the
        data files are parsed.
    urls
        Optional iterable of URLs to download. ``DEFAULT_URLS`` is used when
        it is ``None``.
    data_dir
        Directory that holds the emoji data files (searched recursively).
    """

    # There might be more, I don't know
    DEFAULT_URLS = (
        "https://unicode.org/Public/emoji/latest/emoji-zwj-sequences.txt",
        "https://unicode.org/Public/emoji/latest/emoji-sequences.txt",
        "https://unicode.org/Public/emoji/latest/emoji-test.txt",
        "https://unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt",
        "https://unicode.org/Public/16.0.0/ucd/emoji/emoji-variation-sequences.txt",
    )

    def __init__(self, download=False, urls=None, data_dir="data"):
        self.data_dir = Path(data_dir)
        # Honour a caller-supplied list instead of silently ignoring it.
        self.urls = list(self.DEFAULT_URLS) if urls is None else list(urls)

        if download:
            self.download_emoji_files()
        self._data = self.get_data()

    def download_emoji_files(self):
        """Fetch every URL in ``self.urls`` into ``self.data_dir``.

        Raises ``requests.HTTPError`` on a non-2xx response.
        """
        self.data_dir.mkdir(parents=True, exist_ok=True)
        for url in self.urls:
            # A timeout keeps a stalled server from hanging the call forever.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            # Store the raw bytes: this avoids both requests' charset guessing
            # and the platform default encoding corrupting the emoji.
            (self.data_dir / url.split("/")[-1]).write_bytes(response.content)

    @property
    def count(self):
        """Number of parsed records (after range expansion)."""
        return len(self._data)

    @property
    def data(self):
        """The parsed records as a ``pandas.DataFrame``."""
        return pd.DataFrame(self._data)

    @staticmethod
    def _expand_ranges(data):
        """Replace every ``LB..UB`` record by one record per code point.

        All expanded code points keep the name of the range they came from.
        """
        expanded = []
        for record in data:
            if ".." not in record["seq"]:
                expanded.append(record)
                continue
            lb_hex, ub_hex = record["seq"].split("..")
            expanded.extend(
                {"seq": f"{code_point:X}", "name": record["name"]}
                for code_point in range(int(lb_hex, 16), int(ub_hex, 16) + 1)
            )
        return expanded

    @staticmethod
    def _parse_emoji_file_content(file_content):
        """Turn the text of one data file into ``{"seq", "name"}`` records.

        Blank lines and lines starting with ``#`` are skipped. ``seq`` is the
        first ``;``-separated field and ``name`` is the last one; any trailing
        ``# ...`` remark is kept in the name so keyword search can match it.
        """
        records = []
        for raw_line in file_content.splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split(";")
            records.append({"seq": fields[0].strip(), "name": fields[-1].strip()})
        return records

    def get_data(self):
        """Parse every file below ``self.data_dir`` and return the records."""
        records = []
        # sorted() makes the record order (and thus ``show(index=...)``)
        # reproducible; is_file() skips directories matched by "**/*".
        for path in sorted(self.data_dir.glob("**/*")):
            if path.is_file():
                content = path.read_text(encoding="utf-8")
                records.extend(self._parse_emoji_file_content(content))
        return self._expand_ranges(records)

    def filter_data(self, keyword):
        """Return the records whose name matches ``keyword`` (case-insensitive).

        ``keyword`` is handed to ``Series.str.contains`` and is therefore
        interpreted as a regular expression.
        """
        if not self._data:
            # An empty DataFrame has no "name" column to filter on.
            return []
        df = self.data
        return df[df["name"].str.contains(keyword, case=False)].to_dict("records")

    @staticmethod
    def show_str(data: str, size=100):
        """Display ``data`` in the notebook with the given font size (px)."""
        display(HTML(f'<span style="font-size: {size}px;">{data}</span>'))

    @staticmethod
    def form_zwj_emoji(seq):
        """Build a string from whitespace-separated hexadecimal code points."""
        return "".join(chr(int(code_point, 16)) for code_point in seq.split())

    @classmethod
    def _render_table(cls, sample, size, numb_cols=10):
        """Return an HTML table with one cell per record in ``sample``.

        Rows hold at most ``numb_cols`` cells; the last row may be shorter.
        """
        parts = ['<table style="border-collapse: collapse;">']
        for start in range(0, len(sample), numb_cols):
            parts.append("<tr>")
            parts.extend(
                f'<td style="font-size: {size}px; padding: 7px;">'
                f"{cls.form_zwj_emoji(record['seq'])}</td>"
                for record in sample[start : start + numb_cols]
            )
            parts.append("</tr>")
        parts.append("</table>")
        return "".join(parts)

    def show(self, n=50, filter=None, index=None, size=40):
        """Show n random emoji or the emoji with the given index.

        ``filter`` restricts the candidates to records matching the keyword;
        ``index`` refers to a position in the (possibly filtered) records.
        """
        data = self.filter_data(filter) if filter else self._data

        if not data:
            print("No data")
            return

        if index is not None:
            # Wrap in a list so that a single record renders like any sample.
            sample = [data[index]]
        else:
            if n > len(data):
                n = len(data)
                print("Limit to available data.")
            sample = random.sample(data, n)

        display(HTML(self._render_table(sample, size)))

    def __repr__(self):
        return f"[EmojiHandler] {self.count}"

blog/202412-python-strings/index.html

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1130,7 +1130,7 @@ <h3 id="bytes-objects">Bytes objects<a class="headerlink" href="#bytes-objects"
11301130
<li><code class="language-python highlight"><span class="s2">&quot;a&quot;</span></code>: <code>97, 0</code></li>
11311131
<li><code class="language-python highlight"><span class="s2">&quot;👨&quot;</span></code>: <code>61, 216, 104, 220</code></li>
11321132
</ul>
1133-
<p>or Using <code class="language-python highlight"><span class="n">a_man</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">)</span></code>:</p>
1133+
<p>or using <code class="language-python highlight"><span class="n">a_man</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">)</span></code>:</p>
11341134
<ul>
11351135
<li><code class="language-python highlight"><span class="s2">&quot;a&quot;</span></code>: <code>97</code></li>
11361136
<li><code class="language-python highlight"><span class="s2">&quot;👨&quot;</span></code>: <code>240, 159, 145, 168</code>.</li>
@@ -1139,6 +1139,8 @@ <h3 id="bytes-objects">Bytes objects<a class="headerlink" href="#bytes-objects"
11391139
compatibility with ASCII and efficient data storage, while UTF-16 and UTF-32 allow for
11401140
faster processing of a larger range of characters. Having the possibility to
11411141
easily/efficiently change representations is convenient.</p>
1142+
<p>Of course, bytes objects can be used in other contexts as well. For example, <code class="language-python highlight"><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">to_bytes</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="n">byteorder</span><span class="o">=</span><span class="s1">&#39;little&#39;</span><span class="p">)</span></code> would return the bytes representation of the
1143+
integer <code>1</code> (in little endian).</p>
11421144
<h2 id="immutability">Immutability<a class="headerlink" href="#immutability" title="Permanent link">#</a></h2>
11431145
<p>The design decision to have immutable strings in Python has far-reaching implications
11441146
related to e.g., hashing, performance optimizations, garbage collection, thread safety
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
"""Using CPython is assumed.
2+
3+
Dirty numerical verification that my understanding of PEP 393 and the CPython code is
4+
correct.
5+
6+
"""
7+
8+
import ctypes
9+
import random
10+
import sys
11+
import unicodedata
12+
13+
14+
class Pep393VerifyEncoding:
    """Numerically verify encoding used in PEP 393.

    Random two-character strings are built, the raw memory of the resulting
    ``str`` object is dumped, and the bytes after the object header are
    compared with the output of ``str.encode``.

    Warning
    --------
    This is a one-time-test only. The code is not organised well!

    """

    # Byte offsets at which the character data starts in the raw dump.
    # NOTE(review): 40 and 56 look like the header sizes of ASCII / non-ASCII
    # compact strings on 64-bit CPython >= 3.12 - confirm before running on
    # another interpreter version or platform.
    ASCII_DATA_OFFSET = 40
    COMPACT_DATA_OFFSET = 56

    def __init__(self, numb_test=100000):
        """Store the number of random tests and categorise every code point."""
        self.numb_tests = numb_test
        self.max_code_poit = 1114112

        # code point -> Unicode general category (e.g. "Lu", "Cs")
        self.data = {
            cp: unicodedata.category(chr(cp)) for cp in range(self.max_code_poit)
        }
        self.categories = set(self.data.values())
        # Surrogates (Cs) cannot be encoded, so they are never sampled.
        self.categories_to_include = self.categories - {"Cs"}

    def verify(self):
        """Run every verification; raises ``AssertionError`` on a mismatch."""
        self.verify_case1()
        self.verify_case2()
        self.verify_case3()
        self.verify_case4()
        self.verify_surrogate_points_fail_utf32()
        self.verify_nonsurrogate_points_ok_utf32()

    def verify_surrogate_points_fail_utf32(self):
        """Assert that no surrogate (Cs) code point can be encoded as UTF-32.

        ``chr(CP)`` succeeds for a Cs code point, but there is no character
        associated with it, so encoding the result is expected to fail.
        """
        surrogate_code_points = self.get_surogate_code_points()
        failures = 0
        for cp in surrogate_code_points:
            try:
                self.verify_one_code_point(cp, "utf-32")
            except UnicodeEncodeError:
                # Only the expected encoding failure is counted; any other
                # exception is a real problem and must propagate.
                failures += 1
        assert failures == len(surrogate_code_points)

    def verify_nonsurrogate_points_ok_utf32(self):
        """Encode every non-surrogate code point as UTF-32 (must not raise)."""
        for cp in self.get_nonsurogate_code_points():
            self.verify_one_code_point(cp, "utf-32")

    def verify_one_code_point(self, code_point, encoding="utf-32"):
        """Return ``chr(code_point)`` encoded with ``encoding``."""
        return chr(code_point).encode(encoding)

    def get_surogate_code_points(self):
        """Return all code points whose category is ``Cs``."""
        return [cp for cp, category in self.data.items() if category == "Cs"]

    def get_nonsurogate_code_points(self):
        """Return all code points whose category is not ``Cs``."""
        return [cp for cp, category in self.data.items() if category != "Cs"]

    def _random_strings(self, first_pool, second_pool):
        """Yield ``numb_tests`` strings made of one code point from each pool."""
        for _ in range(self.numb_tests):
            yield chr(random.choice(first_pool)) + chr(random.choice(second_pool))

    def verify_case1(self):
        """Both characters below 2**7: data must equal the ASCII bytes."""
        ascii_pool = self.filter_code_points(0, 2**7 - 1)

        for s in self._random_strings(ascii_pool, ascii_pool):
            # [..:-1] drops the trailing one-byte NUL terminator.
            in_memory = self.memory_dump(s)[self.ASCII_DATA_OFFSET : -1].hex()
            # utf-8 coincides with ascii in case 1
            assert in_memory == s.encode("ascii").hex()
            assert in_memory == s.encode("utf-8").hex()

    def verify_case2(self):
        """Largest character in [2**7, 2**8): data must equal Latin-1 bytes."""
        range1 = self.filter_code_points(0, 2**8 - 1)
        # The second character is always non-ASCII, which forces case 2.
        range2 = self.filter_code_points(2**7, 2**8 - 1)

        for s in self._random_strings(range1, range2):
            in_memory = self.memory_dump(s)[self.COMPACT_DATA_OFFSET : -1].hex()
            utf16 = s.encode("utf-16")[2:].hex()  # [2:] removes BOM
            # Keep the first byte of each 2-byte unit.
            # NOTE(review): this is the low byte only on little-endian hosts.
            utf16_low_bytes = utf16[:2] + utf16[-4:-2]
            assert in_memory == s.encode("latin-1").hex()
            assert in_memory == utf16_low_bytes

    def verify_case3(self):
        """Largest character in [2**8, 2**16): data must equal UTF-16 bytes."""
        range1 = self.filter_code_points(256, 2**16 - 1)
        range2 = self.filter_code_points(0, 2**16 - 1)

        for s in self._random_strings(range1, range2):
            # [..:-2] drops the trailing two-byte NUL terminator.
            in_memory = self.memory_dump(s)[self.COMPACT_DATA_OFFSET : -2].hex()
            assert in_memory == s.encode("utf-16")[2:].hex()  # [2:] removes BOM

    def verify_case4(self):
        """Largest character >= 2**16: data must equal UTF-32 bytes."""
        # The lower bound is 2**16 (not 2**16 - 1): U+FFFF still fits in two
        # bytes, so a string with it as largest character is not a case-4 one.
        range1 = self.filter_code_points(2**16, self.max_code_poit)
        range2 = self.filter_code_points(0, self.max_code_poit)

        for s in self._random_strings(range1, range2):
            # [..:-4] drops the trailing four-byte NUL terminator.
            in_memory = self.memory_dump(s)[self.COMPACT_DATA_OFFSET : -4].hex()
            assert in_memory == s.encode("utf-32")[4:].hex()  # [4:] removes BOM

    @staticmethod
    def memory_dump(s):
        """Return the ``sys.getsizeof(s)`` raw bytes located at ``id(s)``."""
        address_of_s = id(s)  # assuming CPython
        buffer_s = (ctypes.c_char * sys.getsizeof(s)).from_address(address_of_s)
        return bytes(buffer_s)

    def filter_code_points(self, lb=0, ub=None):
        """Filter code points.

        Return the code points ``cp`` with ``lb <= cp <= ub`` whose category
        is in ``categories_to_include``; ``ub`` defaults to ``max_code_poit``.

        https://www.compart.com/en/unicode/category
        """
        if ub is None:
            ub = self.max_code_poit

        return [
            cp
            for cp, category in self.data.items()
            if category in self.categories_to_include and lb <= cp <= ub
        ]
152+
153+
154+
def cast_back(x):
    """Return the Python object located at address ``id(x)``.

    Round-trips ``x`` through its address via ``ctypes``; this relies on
    ``id`` returning the object's memory address (assuming CPython).
    """
    return ctypes.cast(id(x), ctypes.py_object).value
157+
158+
159+
def main():
    """Build the verifier with its default settings and run all checks."""
    Pep393VerifyEncoding().verify()
162+
163+
164+
if __name__ == "__main__":
165+
main()

blog/Makefile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
all:
22
@echo "Be reasonable! Check available targets."
33

4-
generate-materials:
5-
cd 202411-summer-walking-challenge && make generate-materials
4+
generate-artifacts:
5+
cd 202411-summer-walking-challenge && make generate-artifacts
66

7-
clean-generated-materials:
8-
cd 202411-summer-walking-challenge && make clean-generated-materials
7+
clean-generated-artifacts:
8+
cd 202411-summer-walking-challenge && make clean-generated-artifacts

search/search_index.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)