Skip to content

Commit

Permalink
Merge pull request #773 from googlefonts/glyphdata-one-more-time
Browse files Browse the repository at this point in the history
The continued adventures of GlyphData
  • Loading branch information
cmyr authored Apr 15, 2024
2 parents 8160e98 + ba61f37 commit 6a4248b
Show file tree
Hide file tree
Showing 17 changed files with 74,003 additions and 4,480 deletions.
10 changes: 10 additions & 0 deletions glyphs-reader/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ edition = "2021"

[dependencies]
plist_derive = { path = "plist_derive" }
quick-xml = "0.31"
ordered-float.workspace = true
kurbo.workspace = true

Expand All @@ -21,6 +22,15 @@ regex.workspace = true

chrono.workspace = true
smol_str.workspace = true
serde.workspace = true
bincode.workspace = true

[dev-dependencies]
pretty_assertions.workspace = true

[build-dependencies]
quick-xml = "0.31"
smol_str.workspace = true
serde.workspace = true
thiserror.workspace = true
bincode.workspace = true
34 changes: 34 additions & 0 deletions glyphs-reader/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
use std::env;
use std::path::Path;

include!("src/glyphdata/glyphdata_impl.rs");

/// Parse every bundled XML file and return all entries as one list.
///
/// Entries from `data/GlyphData.xml` come first, followed by the entries
/// from `data/GlyphData_Ideographs.xml`. The first parse error is returned
/// as-is.
fn parse_xml_files() -> Result<Vec<GlyphInfo>, GlyphDataError> {
    let mut sources = ["data/GlyphData.xml", "data/GlyphData_Ideographs.xml"].into_iter();
    // start from the first file's entries, then append the remaining files in order
    let mut entries = match sources.next() {
        Some(first) => parse_xml_file(first)?,
        None => Vec::new(),
    };
    for path in sources {
        entries.extend(parse_xml_file(path)?);
    }
    Ok(entries)
}

/// Read the XML file at `path` and parse it into glyph entries.
///
/// # Panics
///
/// Panics if the file cannot be read. The panic message includes the
/// underlying I/O error so that a failing build says *why* the file was
/// unreadable (missing, permissions, ...), not just which path was involved.
///
/// # Errors
///
/// Returns whatever error `parse_entries` reports for the file contents.
fn parse_xml_file(path: &str) -> Result<Vec<GlyphInfo>, GlyphDataError> {
    let bytes = std::fs::read(path)
        .unwrap_or_else(|err| panic!("failed to read path '{path}': {err}"));
    parse_entries(&bytes)
}

/// Tell cargo when to rerun this script.
///
/// Prints one `cargo::rerun-if-changed` directive for the `data` directory
/// and one for the source file that is `include!`d by this script.
fn register_dependencies() {
    for watched in ["data", "src/glyphdata/glyphdata_impl.rs"] {
        println!("cargo::rerun-if-changed={watched}");
    }
}

/// Build-script entry point.
///
/// Parses the bundled XML files, serializes the resulting entries with
/// bincode, and writes them to `glyphdata.bin` inside cargo's `OUT_DIR`.
/// The rerun directives are printed last, after the file has been written.
fn main() {
    let output_file = Path::new(&env::var_os("OUT_DIR").unwrap()).join("glyphdata.bin");
    let glyph_infos = parse_xml_files().expect("failed to parse GlyphData xml files");
    let encoded = bincode::serialize(&glyph_infos).expect("bincode failed");
    std::fs::write(output_file, encoded).unwrap();

    register_dependencies()
}
33,013 changes: 33,013 additions & 0 deletions glyphs-reader/data/GlyphData.xml

Large diffs are not rendered by default.

40,358 changes: 40,358 additions & 0 deletions glyphs-reader/data/GlyphData_Ideographs.xml

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions glyphs-reader/data/GlyphData_override_test.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<!--Used to test that user-provided data overrides the bundled data-->
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE glyphData [
<!ELEMENT glyphData (glyph)+>
<!ATTLIST glyphData
format CDATA #IMPLIED>
<!ELEMENT glyph EMPTY>
<!ATTLIST glyph
unicode CDATA #IMPLIED
unicodeLegacy CDATA #IMPLIED
name CDATA #REQUIRED
category CDATA #REQUIRED
subCategory CDATA #IMPLIED
case CDATA #IMPLIED
direction CDATA #IMPLIED
script CDATA #IMPLIED
description CDATA #IMPLIED
production CDATA #IMPLIED
altNames CDATA #IMPLIED>
]>
<glyphData>
<glyph unicode="0030" name="zero" category="Other" subCategory="Ligature" description="DIGIT ZERO" />
<glyph unicode="0043" name="C" category="Number" case="upper" script="latin" description="LATIN CAPITAL LETTER C" />
<glyph unicode="021C" name="Yogh" category="Letter" case="upper" script="latin" production="Yolo" description="LATIN CAPITAL LETTER YOGH" />
</glyphData>

10 changes: 10 additions & 0 deletions glyphs-reader/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# GlyphData.xml

This directory includes XML files containing data used by glyphs.app. This data
is extracted from the glyphsLib python package using the `update.py` script
contained here.

That data in turn is taken from the [GlyphsInfo](https://github.com/schriftgestalt/GlyphsInfo)
repository.

This data is bundled into our crate using a `build.rs` script in the crate root.
2 changes: 2 additions & 0 deletions glyphs-reader/data/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
XML files copied from glyphsLib version 6.6.6.
(this file generated by update.py)
43 changes: 43 additions & 0 deletions glyphs-reader/data/update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Update bundled xml files
We try to match the behaviour of the python toolchain, so we want to ship the
same data files as are currently bundled in glyphsLib. This script copies those
files out of the currently active version of glyphsLib.
Usage:
python data/update.py
"""

import glyphsLib
from importlib import resources
import os
import shutil

def script_dir():
    """Return the absolute path of the directory that contains this script."""
    this_file = os.path.abspath(__file__)
    return os.path.dirname(this_file)

def get_data_file(filepath):
    """Locate `filepath` inside the `data` directory of the imported glyphsLib package.

    Returns the resource handle produced by `importlib.resources`; nothing is
    opened or read here.
    """
    data_dir = resources.files(glyphsLib).joinpath("data")
    return data_dir.joinpath(filepath)


def copy_data_files():
    """Copy the glyphsLib XML data files into the directory holding this script.

    Each destination file is opened in binary write mode, so an existing file
    with the same name is replaced.
    """
    destination_dir = script_dir()
    for filename in ("GlyphData.xml", "GlyphData_Ideographs.xml"):
        resource = get_data_file(filename)
        destination = os.path.join(destination_dir, filename)
        with resource.open("rb") as source, open(destination, "wb") as dest:
            shutil.copyfileobj(source, dest)

def write_version_file():
    """Write a `VERSION` file next to this script naming the glyphsLib version used."""
    version = glyphsLib.__version__
    version_path = os.path.join(script_dir(), 'VERSION')
    with open(version_path, 'w') as f:
        contents = (
            f"XML files copied from glyphsLib version {version}.\n"
            "(this file generated by update.py)\n"
        )
        f.write(contents)

def main(_):
    """Refresh the bundled XML files, then regenerate the VERSION file.

    The single positional argument is ignored.
    """
    for step in (copy_data_files, write_version_file):
        step()


if __name__ == "__main__":
    main(None)
146 changes: 146 additions & 0 deletions glyphs-reader/src/glyphdata.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
//! determining glyph properties
//!
//! This module provides access to glyph info extracted from bundled
//! (and potentially user-provided) data files.

// NOTE: we define the types and parsing code in a separate file, so that
// we can borrow it in our build.rs script without causing a cycle
mod glyphdata_impl;
use std::{
collections::{HashMap, HashSet},
path::Path,
};

pub use glyphdata_impl::*;
use smol_str::SmolStr;

static BUNDLED_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/glyphdata.bin"));

/// A queryable set of glyph data
///
/// This is generally expensive to create, and is intended to be cached, or
/// used behind a OnceCell. It is never modified after initial creation.
///
/// Entries can be looked up by primary name, alternate name, or codepoint.
pub struct GlyphData {
// The info for all the glyphs we know of.
data: Vec<GlyphInfo>,
// the values in all maps are indices into the `data` vec. we use u32 to save space.
// primary glyph name -> index into `data`
name_map: HashMap<SmolStr, u32>,
// codepoint -> index into `data`
unicode_map: HashMap<u32, u32>,
// alternate glyph name -> index into `data`
alt_name_map: HashMap<SmolStr, u32>,
}

impl GlyphData {
    /// Build the data set from the bundled data, optionally applying overrides.
    ///
    /// When `user_overrides` is provided, the file at that path is read and
    /// parsed as xml; each entry it contains replaces any bundled entry that
    /// has the same name.
    ///
    /// # Errors
    ///
    /// Returns [`GlyphDataError::UserFile`] if the override file cannot be
    /// read, or the error reported by the parser if its contents are invalid.
    pub fn new(user_overrides: Option<&Path>) -> Result<Self, GlyphDataError> {
        // the override file is handled first, so problems with it are reported
        // before the bundled data is decoded
        let overrides = match user_overrides {
            Some(path) => {
                let xml = std::fs::read(path).map_err(|err| GlyphDataError::UserFile {
                    path: path.to_owned(),
                    reason: err.kind(),
                })?;
                Some(parse_entries(&xml)?)
            }
            None => None,
        };

        let mut entries = load_bundled_data();
        if let Some(overrides) = overrides {
            entries = merge_data(entries, overrides);
        }

        Ok(Self::new_impl(entries))
    }

    /// Index `entries` by name, codepoint and alternate name.
    ///
    /// If several entries share a key, the one that appears last in
    /// `entries` is the one the lookup maps end up pointing at.
    fn new_impl(entries: Vec<GlyphInfo>) -> Self {
        let count = entries.len();
        let mut name_map = HashMap::with_capacity(count);
        let mut unicode_map = HashMap::with_capacity(count);
        let mut alt_name_map = HashMap::new();

        for (position, info) in entries.iter().enumerate() {
            // indices are stored as u32 to keep the maps small
            let idx = position as u32;
            name_map.insert(info.name.clone(), idx);
            unicode_map.extend(info.unicode.map(|cp| (cp, idx)));
            for alt in &info.alt_names {
                alt_name_map.insert(alt.clone(), idx);
            }
        }

        Self {
            data: entries,
            name_map,
            unicode_map,
            alt_name_map,
        }
    }

    /// Look up info for a glyph by name
    ///
    /// Primary names take precedence; alternate names are only consulted
    /// when no primary name matches.
    pub fn get_by_name(&self, name: impl AsRef<str>) -> Option<&GlyphInfo> {
        let name = name.as_ref();
        let idx = match self.name_map.get(name) {
            Some(idx) => idx,
            None => self.alt_name_map.get(name)?,
        };
        self.data.get(*idx as usize)
    }

    /// Look up info for a glyph by codepoint
    ///
    /// Returns `None` if no known glyph is mapped to `codepoint`.
    pub fn get_by_codepoint(&self, codepoint: u32) -> Option<&GlyphInfo> {
        let idx = *self.unicode_map.get(&codepoint)?;
        self.data.get(idx as usize)
    }
}

/// Decode the glyph entries embedded in the binary at compile time.
///
/// # Panics
///
/// Panics if `BUNDLED_DATA` cannot be deserialized. Those bytes are included
/// from a file in `OUT_DIR` at compile time, so a failure here indicates a
/// broken build rather than bad user input.
fn load_bundled_data() -> Vec<GlyphInfo> {
    bincode::deserialize(BUNDLED_DATA)
        .expect("bundled glyph data should be valid bincode-encoded glyph entries")
}

/// Combine `base` with `overrides`, letting overrides win on name clashes.
///
/// Every entry in `base` whose name also appears in `overrides` is dropped;
/// the override entries are then appended after the surviving base entries.
fn merge_data(mut base: Vec<GlyphInfo>, overrides: Vec<GlyphInfo>) -> Vec<GlyphInfo> {
    let mut replaced = HashSet::new();
    for entry in &overrides {
        replaced.insert(&entry.name);
    }
    base.retain(|existing| !replaced.contains(&existing.name));
    base.extend(overrides);
    base
}

#[cfg(test)]
mod tests {
use super::*;

// Sanity check that the bundled data decodes and has the expected number
// of entries.
// NOTE(review): this count presumably needs updating whenever the xml files
// in `data/` are refreshed — confirm.
#[test]
fn test_bundled_data() {
let data = load_bundled_data();
assert_eq!(data.len(), 73329);
}

// An override entry named "A" should be what a lookup by name returns
// after merging it with the bundled data.
#[test]
fn simple_overrides() {
let overrides = vec![GlyphInfo {
name: "A".into(),
category: Category::Mark,
subcategory: Subcategory::SpacingCombining,
unicode: Some(b'A' as u32),
production: None,
alt_names: Default::default(),
}];
let bundled = load_bundled_data();
let merged = merge_data(bundled, overrides);
let data = GlyphData::new_impl(merged);

assert_eq!(data.get_by_name("A").unwrap().category, Category::Mark);
}

// Loads overrides from an xml file and checks that the values from that
// file are the ones returned by lookups.
// The path is relative, so this assumes the test runs with the crate root
// as the working directory.
#[test]
fn overrides_from_file() {
let data = GlyphData::new(Some(Path::new("./data/GlyphData_override_test.xml"))).unwrap();
assert_eq!(data.get_by_name("zero").unwrap().category, Category::Other);
assert_eq!(data.get_by_name("C").unwrap().category, Category::Number);
assert_eq!(
data.get_by_name("Yogh").unwrap().production,
Some("Yolo".into())
);
}
}
Loading

0 comments on commit 6a4248b

Please sign in to comment.