Merge pull request #773 from googlefonts/glyphdata-one-more-time

The continued adventures of GlyphData
googlefonts · Apr 15, 2024 · 6a4248b · 6a4248b
2 parents 8160e98 + ba61f37
commit 6a4248b
Show file tree

Hide file tree

Showing 17 changed files with 74,003 additions and 4,480 deletions.
diff --git a/glyphs-reader/Cargo.toml b/glyphs-reader/Cargo.toml
@@ -9,6 +9,7 @@ edition = "2021"
 
 [dependencies]
 plist_derive = { path = "plist_derive" }
+quick-xml = "0.31"
 ordered-float.workspace = true
 kurbo.workspace = true
 
@@ -21,6 +22,15 @@ regex.workspace = true
 
 chrono.workspace = true
 smol_str.workspace = true
+serde.workspace = true
+bincode.workspace = true
 
 [dev-dependencies]
 pretty_assertions.workspace = true
+
+[build-dependencies]
+quick-xml = "0.31"
+smol_str.workspace = true
+serde.workspace = true
+thiserror.workspace = true
+bincode.workspace = true
diff --git a/glyphs-reader/build.rs b/glyphs-reader/build.rs
@@ -0,0 +1,34 @@
+use std::env;
+use std::path::Path;
+
+include!("src/glyphdata/glyphdata_impl.rs");
+
+/// Parse every bundled XML data file into a single list of glyph entries.
+///
+/// The files are processed in a fixed order (the main data first, then the
+/// ideographs), and the first parse failure is returned to the caller.
+fn parse_xml_files() -> Result<Vec<GlyphInfo>, GlyphDataError> {
+    let mut entries = Vec::new();
+    for path in ["data/GlyphData.xml", "data/GlyphData_Ideographs.xml"] {
+        entries.extend(parse_xml_file(path)?);
+    }
+    Ok(entries)
+}
+
+/// Read the XML file at `path` and parse the glyph entries it contains.
+///
+/// # Errors
+///
+/// Returns whatever error `parse_entries` reports for the file contents.
+///
+/// # Panics
+///
+/// Panics if the file cannot be read. The message includes the underlying
+/// I/O error so that a failed build explains why the read failed.
+fn parse_xml_file(path: &str) -> Result<Vec<GlyphInfo>, GlyphDataError> {
+    let bytes = std::fs::read(path)
+        .unwrap_or_else(|err| panic!("failed to read path '{path}': {err}"));
+    parse_entries(&bytes)
+}
+
+// Tell cargo which paths should trigger a rerun of this script when they change:
+// the bundled data directory, and the parsing code this script includes.
+fn register_dependencies() {
+    for path in ["data", "src/glyphdata/glyphdata_impl.rs"] {
+        println!("cargo::rerun-if-changed={path}");
+    }
+}
+
+/// Build-script entry point.
+///
+/// Parses the bundled XML files, serializes the entries with bincode, and
+/// writes the result to `glyphdata.bin` inside cargo's `OUT_DIR`.
+fn main() {
+    let out_dir = env::var_os("OUT_DIR").unwrap();
+    let glyphs = parse_xml_files().expect("failed to parse GlyphData xml files");
+    let encoded = bincode::serialize(&glyphs).expect("bincode failed");
+    std::fs::write(Path::new(&out_dir).join("glyphdata.bin"), encoded).unwrap();
+
+    register_dependencies();
+}
diff --git a/glyphs-reader/data/GlyphData.xml b/glyphs-reader/data/GlyphData.xml
diff --git a/glyphs-reader/data/GlyphData_Ideographs.xml b/glyphs-reader/data/GlyphData_Ideographs.xml
diff --git a/glyphs-reader/data/GlyphData_override_test.xml b/glyphs-reader/data/GlyphData_override_test.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--Used to test that user-provided data overrides the bundled data-->
+<!DOCTYPE glyphData [
+<!ELEMENT glyphData (glyph)+>
+<!ATTLIST glyphData
+	format			CDATA		#IMPLIED>
+<!ELEMENT glyph EMPTY>
+<!ATTLIST glyph
+	unicode			CDATA		#IMPLIED
+	unicodeLegacy	CDATA		#IMPLIED
+	name			CDATA		#REQUIRED
+	category		CDATA		#REQUIRED
+	subCategory		CDATA		#IMPLIED
+	case			CDATA		#IMPLIED
+	direction		CDATA		#IMPLIED
+	script			CDATA		#IMPLIED
+	description		CDATA		#IMPLIED
+	production		CDATA		#IMPLIED
+	altNames		CDATA		#IMPLIED>
+]>
+<glyphData>
+	<glyph unicode="0030" name="zero" category="Other" subCategory="Ligature" description="DIGIT ZERO" />
+	<glyph unicode="0043" name="C" category="Number" case="upper" script="latin" description="LATIN CAPITAL LETTER C" />
+	<glyph unicode="021C" name="Yogh" category="Letter" case="upper" script="latin" production="Yolo" description="LATIN CAPITAL LETTER YOGH" />
+</glyphData>
+
diff --git a/glyphs-reader/data/README.md b/glyphs-reader/data/README.md
@@ -0,0 +1,10 @@
+# GlyphData.xml
+
+This directory includes XML files containing data used by glyphs.app. This data
+is extracted from the glyphsLib python package using the `update.py` script
+contained here.
+
+That data is in turn taken from the [GlyphsInfo](https://github.com/schriftgestalt/GlyphsInfo)
+repository.
+
+This data is bundled into our crate using a `build.rs` script in the crate root.
diff --git a/glyphs-reader/data/VERSION b/glyphs-reader/data/VERSION
@@ -0,0 +1,2 @@
+XML files copied from glyphsLib version 6.6.6.
+(this file generated by update.py)
diff --git a/glyphs-reader/data/update.py b/glyphs-reader/data/update.py
@@ -0,0 +1,43 @@
+"""Update bundled xml files
+
+We try to match the behaviour of the python toolchain, so we want to ship the
+same data files as are currently bundled in glyphsLib. This script copies those
+files out of the currently active version of glyphsLib.
+
+Usage:
+    python data/update.py
+"""
+
+import glyphsLib
+from importlib import resources
+import os
+import shutil
+
+def script_dir():
+    return os.path.dirname(os.path.abspath(__file__))
+
+def get_data_file(filepath):
+    return resources.files(glyphsLib).joinpath("data").joinpath(filepath)
+
+
+def copy_data_files():
+    target_dir = script_dir()
+    for target in ["GlyphData.xml", "GlyphData_Ideographs.xml"]:
+        file = get_data_file(target)
+        target = os.path.join(target_dir, target)
+        with file.open("rb") as source, open(target, "wb") as dest:
+            shutil.copyfileobj(source, dest)
+
+def write_version_file():
+    version = glyphsLib.__version__
+    with open(os.path.join(script_dir(), 'VERSION'), 'w') as f:
+        f.write(f"XML files copied from glyphsLib version {version}.\n"
+                "(this file generated by update.py)\n")
+
+def main(_):
+    copy_data_files()
+    write_version_file()
+
+
+# Only run the update when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main(None)
diff --git a/glyphs-reader/src/glyphdata.rs b/glyphs-reader/src/glyphdata.rs
@@ -0,0 +1,146 @@
+//! determining glyph properties
+//!
+//! This module provides access to glyph info extracted from bundled
+//! (and potentially user-provided) data files.
+
+// NOTE: we define the types and parsing code in a separate file, so that
+// we can borrow it in our build.rs script without causing a cycle
+mod glyphdata_impl;
+use std::{
+    collections::{HashMap, HashSet},
+    path::Path,
+};
+
+pub use glyphdata_impl::*;
+use smol_str::SmolStr;
+
+// The bincode-serialized glyph entries that `build.rs` writes to
+// `$OUT_DIR/glyphdata.bin`, embedded into the crate at compile time.
+static BUNDLED_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/glyphdata.bin"));
+
+/// A queryable set of glyph data
+///
+/// This is generally expensive to create, and is intended to be cached, or
+/// used behind a OnceCell. It is never modified after initial creation.
+///
+/// Entries can be looked up by glyph name (primary or alternate) or by
+/// codepoint.
+pub struct GlyphData {
+    // The info for all the glyphs we know of.
+    data: Vec<GlyphInfo>,
+    // the values in all maps are indices into the `data` vec. we use u32 to save space.
+    // primary glyph name -> index
+    name_map: HashMap<SmolStr, u32>,
+    // codepoint -> index
+    unicode_map: HashMap<u32, u32>,
+    // alternate glyph name -> index; only consulted by `get_by_name` when
+    // the primary name lookup misses
+    alt_name_map: HashMap<SmolStr, u32>,
+}
+
+impl GlyphData {
+    /// Build the data set from the bundled data, optionally layering
+    /// user-provided overrides on top.
+    ///
+    /// When `user_overrides` is `Some`, the file at that path is read and
+    /// parsed, and its entries replace any bundled entries of the same name.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the override file cannot be read or parsed.
+    pub fn new(user_overrides: Option<&Path>) -> Result<Self, GlyphDataError> {
+        // Handle the user file first, so that a bad path or malformed file
+        // is reported before we pay for decoding the bundled data.
+        let overrides = match user_overrides {
+            Some(path) => {
+                let xml = std::fs::read(path).map_err(|err| GlyphDataError::UserFile {
+                    path: path.to_owned(),
+                    reason: err.kind(),
+                })?;
+                Some(parse_entries(&xml)?)
+            }
+            None => None,
+        };
+
+        let mut entries = load_bundled_data();
+        if let Some(overrides) = overrides {
+            entries = merge_data(entries, overrides);
+        }
+        Ok(Self::new_impl(entries))
+    }
+
+    /// Index `entries` by primary name, codepoint and alternate name.
+    ///
+    /// Entries are visited in order, so when two entries share a key the
+    /// lookup maps end up pointing at the later one.
+    fn new_impl(entries: Vec<GlyphInfo>) -> Self {
+        let count = entries.len();
+        let mut name_map = HashMap::with_capacity(count);
+        let mut unicode_map = HashMap::with_capacity(count);
+        let mut alt_name_map = HashMap::new();
+
+        for (position, info) in entries.iter().enumerate() {
+            // the maps store u32 indices to save space
+            let idx = position as u32;
+            name_map.insert(info.name.clone(), idx);
+            if let Some(codepoint) = info.unicode {
+                unicode_map.insert(codepoint, idx);
+            }
+            for alt in &info.alt_names {
+                alt_name_map.insert(alt.clone(), idx);
+            }
+        }
+
+        Self {
+            data: entries,
+            name_map,
+            unicode_map,
+            alt_name_map,
+        }
+    }
+
+    /// Look up info for a glyph by name
+    ///
+    /// Primary names take precedence; alternate names are only consulted
+    /// when no primary name matches.
+    pub fn get_by_name(&self, name: impl AsRef<str>) -> Option<&GlyphInfo> {
+        let key = name.as_ref();
+        let idx = match self.name_map.get(key) {
+            Some(idx) => idx,
+            None => self.alt_name_map.get(key)?,
+        };
+        self.data.get(*idx as usize)
+    }
+
+    /// Look up info for a glyph by codepoint
+    pub fn get_by_codepoint(&self, codepoint: u32) -> Option<&GlyphInfo> {
+        let idx = *self.unicode_map.get(&codepoint)?;
+        self.data.get(idx as usize)
+    }
+}
+
+/// Decode the glyph entries embedded in the crate at build time.
+///
+/// # Panics
+///
+/// Panics if the embedded bytes are not valid bincode for `Vec<GlyphInfo>`.
+/// The bytes are generated by our own build script, so a failure here means
+/// the build output and the `GlyphInfo` definition are out of sync.
+fn load_bundled_data() -> Vec<GlyphInfo> {
+    bincode::deserialize(BUNDLED_DATA)
+        .expect("bundled glyph data is written by build.rs and must deserialize")
+}
+
+/// Combine `base` with `overrides`, letting overrides win on name clashes.
+///
+/// Every entry of `base` whose name also appears in `overrides` is dropped,
+/// and then all of `overrides` is appended after the surviving entries.
+fn merge_data(mut base: Vec<GlyphInfo>, overrides: Vec<GlyphInfo>) -> Vec<GlyphInfo> {
+    {
+        // scoped so the borrow of `overrides` ends before we move it below
+        let overridden: HashSet<_> = overrides.iter().map(|info| &info.name).collect();
+        base.retain(|existing| !overridden.contains(&existing.name));
+    }
+    base.extend(overrides);
+    base
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // The bundled data decodes, and contains the expected number of entries.
+    #[test]
+    fn test_bundled_data() {
+        let data = load_bundled_data();
+        assert_eq!(data.len(), 73329);
+    }
+
+    // An in-memory override with the same name as a bundled glyph replaces it.
+    #[test]
+    fn simple_overrides() {
+        let overrides = vec![GlyphInfo {
+            name: "A".into(),
+            category: Category::Mark,
+            subcategory: Subcategory::SpacingCombining,
+            unicode: Some(b'A' as u32),
+            production: None,
+            alt_names: Default::default(),
+        }];
+        let bundled = load_bundled_data();
+        let merged = merge_data(bundled, overrides);
+        let data = GlyphData::new_impl(merged);
+
+        assert_eq!(data.get_by_name("A").unwrap().category, Category::Mark);
+    }
+
+    // Overrides loaded from an XML file on disk are visible through lookups.
+    #[test]
+    fn overrides_from_file() {
+        let data = GlyphData::new(Some(Path::new("./data/GlyphData_override_test.xml"))).unwrap();
+        assert_eq!(data.get_by_name("zero").unwrap().category, Category::Other);
+        assert_eq!(data.get_by_name("C").unwrap().category, Category::Number);
+        assert_eq!(
+            data.get_by_name("Yogh").unwrap().production,
+            Some("Yolo".into())
+        );
+    }
+}