Deployed 5aa853a with MkDocs version: 1.6.1

dimitrod · dimitrod · commit c75c7121ad1d · 2024-12-09T07:26:05.000+01:00
diff --git a/blog/202411-summer-walking-challenge/Makefile b/blog/202411-summer-walking-challenge/Makefile
@@ -1,12 +1,12 @@
 include ../../../Makefile.in
 
-## should generate all blog materials
-generate-materials: setup-venv main-python clean-venv
+## generates all blog artifacts
+generate-artifacts: setup-venv main-python clean-venv
 
 ## generate figures
 main-python:
 	. ${VENV_NAME}/bin/activate && python main.py
 
 ##! clean generated files
-clean-generated-materials:
+clean-generated-artifacts:
 	rm -rf img/generated
diff --git a/blog/202412-python-strings/Makefile b/blog/202412-python-strings/Makefile
@@ -0,0 +1,5 @@
+include ../../../Makefile.in
+
+generate-artifacts:
+
+clean-generated-artifacts:
diff --git a/blog/202412-python-strings/emoji.py b/blog/202412-python-strings/emoji.py
@@ -0,0 +1,143 @@
+"""Download and display emoji (just to search and choose a few).
+
+How to use (in a jupyter notebook cell):
+
+```python
+eh = EmojiHandler(download=True)
+eh.show()
+```
+
+"""
+
+import random
+from pathlib import Path
+
+import pandas as pd
+import requests
+from IPython.display import HTML, display
+
+
+class EmojiHandler:
+
+    def __init__(self, download=False, urls=None, data_dir="data"):
+        """Optionally download the emoji files, then parse data_dir into memory."""
+        self.data_dir = Path(data_dir)
+        # Default sources (there might be more); a non-empty `urls` overrides them
+        self.urls = urls or [
+            "https://unicode.org/Public/emoji/latest/emoji-zwj-sequences.txt",
+            "https://unicode.org/Public/emoji/latest/emoji-sequences.txt",
+            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
+            "https://unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt",
+            "https://unicode.org/Public/16.0.0/ucd/emoji/emoji-variation-sequences.txt",
+        ]
+        if download:
+            self.download_emoji_files()
+        self._data = self.get_data()
+
+    def download_emoji_files(self):
+        """Fetch every URL in self.urls into self.data_dir (created if missing)."""
+        self.data_dir.mkdir(parents=True, exist_ok=True)
+        for url in self.urls:
+            response = requests.get(url, timeout=30)  # bounded wait instead of hanging
+            response.raise_for_status()
+            (self.data_dir / url.split("/")[-1]).write_text(response.text, encoding="utf-8")
+
+    @property
+    def count(self):
+        return len(self._data)
+
+    @property
+    def data(self):
+        return pd.DataFrame(self._data)
+
+    def get_data(self):
+        """Parse every file under self.data_dir into {"seq", "name"} records.
+
+        "seq" is the text before the first ";" of a data line, "name" the text
+        after the last one; comment ("#") and empty lines are skipped.
+        """
+        def expand_ranges(data):
+            """I keep the same name for all expanded codepoints."""
+            def expand_range(lb_hex, ub_hex):
+                lb = int(lb_hex, 16)
+                ub = int(ub_hex, 16)
+                return [hex(i)[2:].upper() for i in range(lb, ub + 1)]
+
+            updated_data = []
+            for d in data:
+                if ".." in d["seq"]:
+                    code_points = expand_range(*d["seq"].split(".."))
+                    updated_data.extend(
+                        {"seq": code_point, "name": d["name"]}
+                        for code_point in code_points
+                    )
+                else:
+                    updated_data.append(d)
+            return updated_data
+
+        def parse_emoji_file_content(file_content):
+            data = []
+            for line in file_content.splitlines():
+                if line and not line.startswith("#"):
+                    line_split = line.split(";")
+                    data.append(
+                        {"seq": line_split[0].strip(), "name": line_split[-1].strip()}
+                    )
+            return data
+
+        data = []
+        for file in Path(self.data_dir).glob("**/*"):
+            if file.is_file():  # "**/*" also yields sub-directories
+                data.extend(parse_emoji_file_content(file.read_text(encoding="utf-8")))
+
+        return expand_ranges(data)
+
+    def filter_data(self, keyword):
+        df = self.data
+        return df[df["name"].str.contains(keyword, case=False)].to_dict("records")
+
+    @staticmethod
+    def show_str(data: str, size=100):
+        display(HTML(f'<span style="font-size: {size}px;">{data}</span>'))
+
+    @staticmethod
+    def form_zwj_emoji(seq):
+        return "".join([chr(int(c, 16)) for c in seq.split(" ")])
+
+    def show(self, n=50, filter=None, index=None, size=40):
+        """Show n random emoji or the emoji with the given index.
+
+        n: number of random emoji to draw (capped at the available data).
+        filter: keyword passed to filter_data; falsy means no filtering.
+        index: if not None, show only data[index] (n is then ignored).
+        size: font size in px of each table cell.
+        """
+        data = self.filter_data(filter) if filter else self._data
+        if len(data) == 0:
+            print("No data")
+            return
+
+        if index is not None:
+            # data[index] is a single record: wrap it so it can be iterated
+            sample = [data[index]]
+        else:
+            if n > len(data):
+                n = len(data)
+                print("Limit to available data.")
+            sample = random.sample(data, n)
+
+        numb_cols = 10
+        table_html = '<table style="border-collapse: collapse;">'
+        # step through the sample row by row; the last row may be incomplete
+        for start in range(0, len(sample), numb_cols):
+            table_html += "<tr>"
+            for record in sample[start : start + numb_cols]:
+                s = self.form_zwj_emoji(record["seq"])
+                table_html += f'<td style="font-size: {size}px; padding: 7px;">{s}</td>'
+            table_html += "</tr>"
+
+        table_html += "</table>"
+        display(HTML(table_html))
+
+    def __repr__(self):
+        return f"[EmojiHandler] {self.count}"
diff --git a/blog/202412-python-strings/index.html b/blog/202412-python-strings/index.html
@@ -1130,7 +1130,7 @@ <h3 id="bytes-objects">Bytes objects<a class="headerlink" href="#bytes-objects"
 <li><code class="language-python highlight"><span class="s2">&quot;a&quot;</span></code>: <code>97, 0</code></li>
 <li><code class="language-python highlight"><span class="s2">&quot;👨&quot;</span></code>: <code>61, 216, 104, 220</code></li>
 </ul>
-<p>or Using <code class="language-python highlight"><span class="n">a_man</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">)</span></code>:</p>
+<p>or using <code class="language-python highlight"><span class="n">a_man</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">)</span></code>:</p>
 <ul>
 <li><code class="language-python highlight"><span class="s2">&quot;a&quot;</span></code>: <code>97</code></li>
 <li><code class="language-python highlight"><span class="s2">&quot;👨&quot;</span></code>: <code>240, 159, 145, 168</code>.</li>
@@ -1139,6 +1139,8 @@ <h3 id="bytes-objects">Bytes objects<a class="headerlink" href="#bytes-objects"
 compatibility with ASCII and efficient data storage, while UTF-16 and UTF-32 allow for
 faster processing of a larger range of characters. Having the possibility to
 easily/efficiently change representations is convenient.</p>
+<p>Of course, bytes objects can be used in other contexts as well. For example, <code class="language-python highlight"><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">to_bytes</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="n">byteorder</span><span class="o">=</span><span class="s1">&#39;little&#39;</span><span class="p">)</span></code> would return the bytes representation of the
+integer <code>1</code> (in little endian).</p>
 <h2 id="immutability">Immutability<a class="headerlink" href="#immutability" title="Permanent link">#</a></h2>
 <p>The design decision to have immutable string in python has far-reaching implication
 related to e.g., hashing, performance optimizations, garbage collection, thread safety
diff --git a/blog/202412-python-strings/verify_string_encoding.py b/blog/202412-python-strings/verify_string_encoding.py
@@ -0,0 +1,165 @@
+"""Using CPython is assumed.
+
+Dirty numerical verification that my understanding of PEP 393 and the CPython code is
+correct.
+
+"""
+
+import ctypes
+import random
+import sys
+import unicodedata
+
+
+class Pep393VerifyEncoding:
+    """Numerically verify encoding used in PEP 393.
+
+    Warning
+    --------
+    This is a one-time-test only. The code is not organised well!
+
+    """
+
+    def __init__(self, numb_test=100000):
+        self.numb_tests = numb_test
+        self.max_code_poit = 1114112
+
+        self.data = {
+            cp: unicodedata.category(chr(cp)) for cp in range(self.max_code_poit)
+        }
+        self.categories = set(self.data.values())
+        self.categories_to_include = self.categories - {"Cs"}
+
+    def verify(self):
+        self.verify_case1()
+        self.verify_case2()
+        self.verify_case3()
+        self.verify_case4()
+        self.verify_surrogate_points_fail_utf32()
+        self.verify_nonsurrogate_points_ok_utf32()
+
+    def verify_surrogate_points_fail_utf32(self):
+        """Check that every surrogate (Cs) code point fails to encode as utf-32.
+
+        chr(CP) succeeds for a Cs code point CP, but the result is a lone
+        surrogate, not a real character, so str.encode raises UnicodeEncodeError
+        under the default "strict" error handler.
+
+        """
+        surogate_code_points = self.get_surogate_code_points()
+        k = 0  # number of code points that were rejected by the encoder
+        for cp in surogate_code_points:
+            try:
+                self.verify_one_code_point(cp, "utf-32")
+            except UnicodeEncodeError:
+                # only the expected failure is counted; anything else propagates
+                k += 1
+        assert k == len(surogate_code_points)
+
+    def verify_nonsurrogate_points_ok_utf32(self):
+        for cp in self.get_nonsurogate_code_points():
+            self.verify_one_code_point(cp, "utf-32")
+
+    def verify_one_code_point(self, code_point, encoding="utf-32"):
+        character = chr(code_point)
+        return character.encode(encoding)
+
+    def get_surogate_code_points(self):
+        return [k for k, v in self.data.items() if v == "Cs"]
+
+    def get_nonsurogate_code_points(self):
+        return [k for k, v in self.data.items() if v != "Cs"]
+
+    def verify_case1(self):
+        """Both code points < 2**7: compare the raw dump with ascii and utf-8."""
+        range1 = self.filter_code_points(0, 2**7 - 1)
+        range2 = self.filter_code_points(0, 2**7 - 1)
+        for _ in range(self.numb_tests):
+            i1 = random.sample(range1, 1)[0]
+            i2 = random.sample(range2, 1)[0]
+            s = chr(i1) + chr(i2)
+
+            e1 = self.memory_dump(s)[40:-1].hex()
+            # utf-8 coincides with ascii in case 1
+            e2 = s.encode("ascii").hex()
+            e3 = s.encode("utf-8").hex()
+            assert e1 == e2
+            assert e1 == e3
+
+    def verify_case2(self):
+        range1 = self.filter_code_points(0, 2**8 - 1)
+        range2 = self.filter_code_points(2**7, 2**8 - 1)
+
+        for _ in range(self.numb_tests):
+            i1 = random.sample(range1, 1)[0]
+            i2 = random.sample(range2, 1)[0]
+            s = chr(i1) + chr(i2)
+
+            e1 = self.memory_dump(s)[56:-1].hex()
+            e2 = s.encode("latin-1").hex()
+            e3 = s.encode("utf-16")[2:].hex()  # [2:] removes BOM
+            e3 = e3[:2] + e3[-4:-2]
+            assert e1 == e2
+            assert e1 == e3
+
+    def verify_case3(self):
+        range1 = self.filter_code_points(256, 2**16 - 1)
+        range2 = self.filter_code_points(0, 2**16 - 1)
+
+        for _ in range(self.numb_tests):
+            i1 = random.sample(range1, 1)[0]
+            i2 = random.sample(range2, 1)[0]
+            s = chr(i1) + chr(i2)
+
+            e1 = self.memory_dump(s)[56:-2].hex()
+            e2 = s.encode("utf-16")[2:].hex()  # [2:] removes BOM
+            assert e1 == e2
+
+    def verify_case4(self):
+        """Strings holding a code point >= 2**16: compare the dump with utf-32."""
+        # i1 starts at 2**16: U+FFFF still fits in 2 bytes and belongs to case 3
+        range1 = self.filter_code_points(2**16, self.max_code_poit)
+        range2 = self.filter_code_points(0, self.max_code_poit)
+        for _ in range(self.numb_tests):
+            i1 = random.choice(range1)
+            i2 = random.choice(range2)
+            s = chr(i1) + chr(i2)
+            e1 = self.memory_dump(s)[56:-4].hex()  # presumably header / NUL cut off
+            e2 = s.encode("utf-32")[4:].hex()  # [4:] removes BOM
+            assert e1 == e2
+
+    @staticmethod
+    def memory_dump(s):
+        address_of_s = id(s)  # assuming CPython
+        buffer_s = (ctypes.c_char * sys.getsizeof(s)).from_address(address_of_s)
+        return bytes(buffer_s)
+
+    def filter_code_points(self, lb=0, ub=None):
+        """Filter code points.
+
+        https://www.compart.com/en/unicode/category
+        """
+        if ub is None:
+            ub = self.max_code_poit
+
+        selected_code_points = []
+        for cp, category in self.data.items():
+            if category in self.categories_to_include:
+                if cp <= ub and cp >= lb:
+                    selected_code_points.append(cp)
+
+        return selected_code_points
+
+
+def cast_back(x):
+    """Return the object located at address id(x), i.e. x itself on CPython."""
+    return ctypes.cast(id(x), ctypes.py_object).value
+
+
+def main():
+    v = Pep393VerifyEncoding()
+    v.verify()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/blog/Makefile b/blog/Makefile
@@ -1,8 +1,8 @@
 all:
 	@echo "Be reasonable! Check available targets."
 
-generate-materials:
-	cd 202411-summer-walking-challenge && make generate-materials
+generate-artifacts:  # $(MAKE) (not make) forwards flags such as -n and -j
+	cd 202411-summer-walking-challenge && $(MAKE) generate-artifacts
 
-clean-generated-materials:
-	cd 202411-summer-walking-challenge && make clean-generated-materials
+clean-generated-artifacts:  # $(MAKE) (not make) forwards flags such as -n and -j
+	cd 202411-summer-walking-challenge && $(MAKE) clean-generated-artifacts
diff --git a/search/search_index.json b/search/search_index.json
diff --git a/sitemap.xml b/sitemap.xml
diff --git a/sitemap.xml.gz b/sitemap.xml.gz