Skip to content

Commit

Permalink
Deployed 5aa853a with MkDocs version: 1.6.1
Browse files Browse the repository at this point in the history
  • Loading branch information
drdv committed Dec 9, 2024
1 parent 13fb1b3 commit c75c712
Show file tree
Hide file tree
Showing 9 changed files with 338 additions and 19 deletions.
6 changes: 3 additions & 3 deletions blog/202411-summer-walking-challenge/Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
include ../../../Makefile.in

## should generate all blog materials
generate-materials: setup-venv main-python clean-venv
## generates all blog artifacts
generate-artifacts: setup-venv main-python clean-venv

## generate figures
main-python:
. ${VENV_NAME}/bin/activate && python main.py

##! clean generated files
clean-generated-materials:
clean-generated-artifacts:
rm -rf img/generated
5 changes: 5 additions & 0 deletions blog/202412-python-strings/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
include ../../../Makefile.in

# Empty target: no recipe is defined, so nothing is generated for this post.
generate-artifacts:

# Empty target: no recipe is defined, so there is nothing to clean.
clean-generated-artifacts:
143 changes: 143 additions & 0 deletions blog/202412-python-strings/emoji.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
"""Download and display emoji (just to search and choose a few).
How to use (in a jupyter notebook cell):
```python
eh = EmojiHandler(download=True)
eh.show()
```
"""

import random
from pathlib import Path

import pandas as pd
import requests
from IPython.display import HTML, display


class EmojiHandler:
    """Load Unicode emoji data files and display emoji in a notebook.

    The data files found in ``data_dir`` (optionally downloaded first) are
    parsed into records of the form ``{"seq": ..., "name": ...}`` where
    ``seq`` is a space-separated sequence of hexadecimal code points and
    ``name`` is the last ``;``-separated field of the line (including any
    trailing ``#`` comment, which usually carries the readable description).

    Parameters
    ----------
    download:
        When true, fetch every URL in ``urls`` into ``data_dir`` before
        parsing.
    urls:
        URLs of the data files to download. ``None`` selects
        ``DEFAULT_URLS``.
    data_dir:
        Directory holding the (downloaded) data files.
    """

    # Data files fetched by default. There might be more, I don't know.
    DEFAULT_URLS = (
        "https://unicode.org/Public/emoji/latest/emoji-zwj-sequences.txt",
        "https://unicode.org/Public/emoji/latest/emoji-sequences.txt",
        "https://unicode.org/Public/emoji/latest/emoji-test.txt",
        "https://unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt",
        "https://unicode.org/Public/16.0.0/ucd/emoji/emoji-variation-sequences.txt",
    )

    # Maximum number of emoji per table row in ``show``.
    _MAX_COLUMNS = 10

    def __init__(self, download=False, urls=None, data_dir="data"):
        self.data_dir = Path(data_dir)
        # Honour user-supplied URLs; fall back to the defaults otherwise.
        self.urls = list(self.DEFAULT_URLS) if urls is None else list(urls)

        if download:
            self.download_emoji_files()
        self._data = self.get_data()

    def download_emoji_files(self):
        """Download every URL in ``self.urls`` into ``self.data_dir``.

        Each file is stored under the last path component of its URL.

        Raises
        ------
        requests.HTTPError
            If a server answers with an error status.
        """
        self.data_dir.mkdir(parents=True, exist_ok=True)
        for url in self.urls:
            # A timeout prevents hanging forever on an unresponsive server.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            target = self.data_dir / url.split("/")[-1]
            # Explicit UTF-8 so the result does not depend on the locale.
            target.write_text(response.text, encoding="utf-8")

    @property
    def count(self):
        """Number of parsed records."""
        return len(self._data)

    @property
    def data(self):
        """The parsed records as a new ``pandas.DataFrame``."""
        return pd.DataFrame(self._data)

    @staticmethod
    def _expand_ranges(data):
        """Replace every ``LB..UB`` record by one record per code point.

        All expanded code points keep the name of the original range.
        """
        expanded = []
        for record in data:
            seq = record["seq"]
            if ".." not in seq:
                expanded.append(record)
                continue
            lb_hex, ub_hex = seq.split("..")
            expanded.extend(
                {"seq": f"{code_point:X}", "name": record["name"]}
                for code_point in range(int(lb_hex, 16), int(ub_hex, 16) + 1)
            )
        return expanded

    @staticmethod
    def _parse_emoji_file_content(file_content):
        """Parse the text of one data file into ``seq``/``name`` records.

        Empty lines and lines starting with ``#`` are skipped.
        """
        records = []
        for line in file_content.splitlines():
            if not line or line.startswith("#"):
                continue
            fields = line.split(";")
            # The trailing "# ..." comment is kept on purpose as part of name.
            records.append({"seq": fields[0].strip(), "name": fields[-1].strip()})
        return records

    def get_data(self):
        """Parse all files below ``self.data_dir`` and return the records.

        Files are visited in sorted order so that the result (and therefore
        the ``index`` accepted by ``show``) is reproducible. Directories
        matched by the recursive glob are skipped.
        """
        records = []
        for file in sorted(self.data_dir.glob("**/*")):
            if not file.is_file():
                continue
            file_content = file.read_text(encoding="utf-8")
            records.extend(self._parse_emoji_file_content(file_content))

        return self._expand_ranges(records)

    def filter_data(self, keyword):
        """Return the records whose name contains ``keyword``.

        The match is case-insensitive and ``keyword`` is interpreted as a
        regular expression (``pandas`` ``str.contains`` default).
        """
        if not self._data:
            # An empty frame has no "name" column to filter on.
            return []
        df = self.data
        return df[df["name"].str.contains(keyword, case=False)].to_dict("records")

    @staticmethod
    def show_str(data: str, size=100):
        """Display ``data`` in a notebook with a font size of ``size`` px."""
        display(HTML(f'<span style="font-size: {size}px;">{data}</span>'))

    @staticmethod
    def form_zwj_emoji(seq):
        """Build a string from whitespace-separated hexadecimal code points."""
        # split() without argument tolerates repeated/leading/trailing spaces.
        return "".join(chr(int(code_point, 16)) for code_point in seq.split())

    def show(self, n=50, filter=None, index=None, size=40):
        """Show n random emoji or the emoji with the given index.

        Parameters
        ----------
        n:
            Number of random emoji to show (limited to the available data).
            Ignored when ``index`` is given.
        filter:
            Optional keyword; only records accepted by ``filter_data`` are
            considered.
        index:
            Position (within the possibly filtered data) of the single emoji
            to show.
        size:
            Font size in px.
        """
        data = self.filter_data(filter) if filter else self._data

        if not data:
            print("No data")
            return

        if index is not None:
            # Wrap in a list so the rendering below handles both cases.
            sample = [data[index]]
        else:
            if n > len(data):
                n = len(data)
                print("Limit to available data.")
            sample = random.sample(data, n)

        rows = []
        # Chunk the sample so that a final, partially filled row is kept.
        for start in range(0, len(sample), self._MAX_COLUMNS):
            cells = "".join(
                f'<td style="font-size: {size}px; padding: 7px;">'
                f'{self.form_zwj_emoji(record["seq"])}</td>'
                for record in sample[start : start + self._MAX_COLUMNS]
            )
            rows.append(f"<tr>{cells}</tr>")

        table_html = (
            '<table style="border-collapse: collapse;">' + "".join(rows) + "</table>"
        )
        display(HTML(table_html))

    def __repr__(self):
        return f"[EmojiHandler] {self.count}"
4 changes: 3 additions & 1 deletion blog/202412-python-strings/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1130,7 +1130,7 @@ <h3 id="bytes-objects">Bytes objects<a class="headerlink" href="#bytes-objects"
<li><code class="language-python highlight"><span class="s2">&quot;a&quot;</span></code>: <code>97, 0</code></li>
<li><code class="language-python highlight"><span class="s2">&quot;👨&quot;</span></code>: <code>61, 216, 104, 220</code></li>
</ul>
<p>or Using <code class="language-python highlight"><span class="n">a_man</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">)</span></code>:</p>
<p>or using <code class="language-python highlight"><span class="n">a_man</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">)</span></code>:</p>
<ul>
<li><code class="language-python highlight"><span class="s2">&quot;a&quot;</span></code>: <code>97</code></li>
<li><code class="language-python highlight"><span class="s2">&quot;👨&quot;</span></code>: <code>240, 159, 145, 168</code>.</li>
Expand All @@ -1139,6 +1139,8 @@ <h3 id="bytes-objects">Bytes objects<a class="headerlink" href="#bytes-objects"
compatibility with ASCII and efficient data storage, while UTF-16 and UTF-32 allow for
faster processing of a larger range of characters. Having the possibility to
easily/efficiently change representations is convenient.</p>
<p>Of course, bytes objects can be used in other contexts as well. For example, <code class="language-python highlight"><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">to_bytes</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="n">byteorder</span><span class="o">=</span><span class="s1">&#39;little&#39;</span><span class="p">)</span></code> would return the bytes representation of the
integer <code>1</code> (in little endian).</p>
<h2 id="immutability">Immutability<a class="headerlink" href="#immutability" title="Permanent link">#</a></h2>
<p>The design decision to have immutable strings in Python has far-reaching implications
related to e.g., hashing, performance optimizations, garbage collection, thread safety
Expand Down
165 changes: 165 additions & 0 deletions blog/202412-python-strings/verify_string_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
"""Using CPython is assumed.
Quick-and-dirty numerical verification that my understanding of PEP 393 and the
CPython code is correct.
"""

import ctypes
import random
import sys
import unicodedata


class Pep393VerifyEncoding:
    """Numerically verify encoding used in PEP 393.

    Random two-character strings are built and the raw memory of the string
    object is compared with the bytes produced by ``str.encode``.

    Parameters
    ----------
    numb_test:
        Number of random strings checked by each ``verify_case*`` method.

    Warning
    --------
    This is a one-time-test only. The code is not organised well!
    The hard-coded memory offsets (40 and 56) are presumably the header sizes
    of compact ASCII / compact non-ASCII strings on a 64-bit CPython >= 3.12,
    and the comparisons assume a little-endian machine -- confirm before
    running elsewhere.
    """

    def __init__(self, numb_test=100000):
        self.numb_tests = numb_test
        # 0x110000: one past the largest code point accepted by chr().
        self.max_code_poit = 1114112

        # Map every code point to its Unicode general category.
        self.data = {
            cp: unicodedata.category(chr(cp)) for cp in range(self.max_code_poit)
        }
        self.categories = set(self.data.values())
        # Surrogates ("Cs") cannot be encoded, so they are never sampled.
        self.categories_to_include = self.categories - {"Cs"}

    @staticmethod
    def _require_equal(actual, expected, what):
        """Raise AssertionError (even under ``python -O``) on a mismatch."""
        if actual != expected:
            raise AssertionError(f"{what}: {actual!r} != {expected!r}")

    @staticmethod
    def _random_pair(range1, range2):
        """Return a two-character string, one random code point per range."""
        return chr(random.choice(range1)) + chr(random.choice(range2))

    def verify(self):
        """Run all verifications; raises AssertionError on the first failure."""
        self.verify_case1()
        self.verify_case2()
        self.verify_case3()
        self.verify_case4()
        self.verify_surrogate_points_fail_utf32()
        self.verify_nonsurrogate_points_ok_utf32()

    def verify_surrogate_points_fail_utf32(self):
        """Check that every surrogate code point fails to encode as utf-32.

        Cs code points can happily be encoded using utf-32 but there is a problem
        because, given a code point CP (i.e., CP is an integer), we use chr(CP) and
        the output of chr(.) cannot be trusted because there are no characters
        associated with a Cs point.
        """
        surogate_code_points = self.get_surogate_code_points()
        failures = 0
        for cp in surogate_code_points:
            try:
                self.verify_one_code_point(cp, "utf-32")
            except UnicodeEncodeError:
                # Only the expected encoding failure counts; anything else
                # (e.g. an invalid code point) must surface as an error.
                failures += 1
        self._require_equal(
            failures, len(surogate_code_points), "surrogates rejected by utf-32"
        )

    def verify_nonsurrogate_points_ok_utf32(self):
        """Check that every non-surrogate code point encodes as utf-32."""
        for cp in self.get_nonsurogate_code_points():
            self.verify_one_code_point(cp, "utf-32")

    def verify_one_code_point(self, code_point, encoding="utf-32"):
        """Return ``chr(code_point)`` encoded with ``encoding``."""
        character = chr(code_point)
        return character.encode(encoding)

    def get_surogate_code_points(self):
        """Return the code points whose category is "Cs"."""
        return [k for k, v in self.data.items() if v == "Cs"]

    def get_nonsurogate_code_points(self):
        """Return the code points whose category is not "Cs"."""
        return [k for k, v in self.data.items() if v != "Cs"]

    def verify_case1(self):
        """Both characters below 2**7: memory must equal ascii and utf-8."""
        range1 = self.filter_code_points(0, 2**7 - 1)
        range2 = range1

        for _ in range(self.numb_tests):
            s = self._random_pair(range1, range2)

            # Skip the 40-byte header and the trailing 1-byte terminator.
            e1 = self.memory_dump(s)[40:-1].hex()
            # utf-8 coincides with ascii in case 1
            self._require_equal(e1, s.encode("ascii").hex(), "case 1 (ascii)")
            self._require_equal(e1, s.encode("utf-8").hex(), "case 1 (utf-8)")

    def verify_case2(self):
        """Max character in [2**7, 2**8): memory must equal latin-1."""
        range1 = self.filter_code_points(0, 2**8 - 1)
        # The second character is never ASCII, so the string is never case 1.
        range2 = self.filter_code_points(2**7, 2**8 - 1)

        for _ in range(self.numb_tests):
            s = self._random_pair(range1, range2)

            # Skip the 56-byte header and the trailing 1-byte terminator.
            e1 = self.memory_dump(s)[56:-1].hex()
            e2 = s.encode("latin-1").hex()
            e3 = s.encode("utf-16")[2:].hex()  # [2:] removes BOM
            # Keep the first byte of each 2-byte code unit (the low byte on
            # a little-endian machine).
            e3 = e3[:2] + e3[-4:-2]
            self._require_equal(e1, e2, "case 2 (latin-1)")
            self._require_equal(e1, e3, "case 2 (utf-16)")

    def verify_case3(self):
        """Max character in [2**8, 2**16): memory must equal utf-16."""
        range1 = self.filter_code_points(256, 2**16 - 1)
        range2 = self.filter_code_points(0, 2**16 - 1)

        for _ in range(self.numb_tests):
            s = self._random_pair(range1, range2)

            # Skip the 56-byte header and the trailing 2-byte terminator.
            e1 = self.memory_dump(s)[56:-2].hex()
            e2 = s.encode("utf-16")[2:].hex()  # [2:] removes BOM
            self._require_equal(e1, e2, "case 3 (utf-16)")

    def verify_case4(self):
        """Max character >= 2**16: memory must equal utf-32."""
        # The lower bound is 2**16 (not 2**16 - 1): U+FFFF still fits in two
        # bytes, so a string whose largest character is U+FFFF is a case-3
        # string and would fail the 4-byte comparison below.
        range1 = self.filter_code_points(2**16, self.max_code_poit)
        range2 = self.filter_code_points(0, self.max_code_poit)

        for _ in range(self.numb_tests):
            s = self._random_pair(range1, range2)

            # Skip the 56-byte header and the trailing 4-byte terminator.
            e1 = self.memory_dump(s)[56:-4].hex()
            e2 = s.encode("utf-32")[4:].hex()  # [4:] removes BOM
            self._require_equal(e1, e2, "case 4 (utf-32)")

    @staticmethod
    def memory_dump(s):
        """Return the ``sys.getsizeof(s)`` raw bytes located at ``id(s)``."""
        address_of_s = id(s)  # assuming CPython
        buffer_s = (ctypes.c_char * sys.getsizeof(s)).from_address(address_of_s)
        return bytes(buffer_s)

    def filter_code_points(self, lb=0, ub=None):
        """Filter code points.

        Return the code points ``cp`` with ``lb <= cp <= ub`` whose category
        is in ``self.categories_to_include``. ``ub`` defaults to
        ``self.max_code_poit``.

        https://www.compart.com/en/unicode/category
        """
        if ub is None:
            ub = self.max_code_poit

        return [
            cp
            for cp, category in self.data.items()
            if category in self.categories_to_include and lb <= cp <= ub
        ]


def cast_back(x):
    """Return the object located at address ``id(x)`` (CPython only).

    Round-trips ``x`` through its memory address with ``ctypes``; the result
    is the very same object as ``x``.
    """
    return ctypes.cast(id(x), ctypes.py_object).value


def main():
    """Build the verifier with its default settings and run every check."""
    Pep393VerifyEncoding().verify()


# Run the verification only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
8 changes: 4 additions & 4 deletions blog/Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
all:
@echo "Be reasonable! Check available targets."

generate-materials:
cd 202411-summer-walking-challenge && make generate-materials
generate-artifacts:
cd 202411-summer-walking-challenge && make generate-artifacts

clean-generated-materials:
cd 202411-summer-walking-challenge && make clean-generated-materials
clean-generated-artifacts:
cd 202411-summer-walking-challenge && make clean-generated-artifacts
2 changes: 1 addition & 1 deletion search/search_index.json

Large diffs are not rendered by default.

Loading

0 comments on commit c75c712

Please sign in to comment.