Skip to content

Commit d481963

Browse files
committed
working updates
1 parent 8bd12e8 commit d481963

File tree

3 files changed

+108
-2
lines changed

3 files changed

+108
-2
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
use crate::fmt_list;
2+
use crate::raw_emitter::RawEmitter;
3+
use std::collections::HashMap;
4+
use std::fmt::Write as _;
5+
use std::ops::Range;
6+
7+
8+
impl RawEmitter {
9+
pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
10+
11+
let mut map: [u8; 256] = [
12+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
13+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
14+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
15+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
17+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
18+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
20+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
21+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
22+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
23+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
24+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
25+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
26+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28+
];
29+
30+
let points = ranges.iter().flat_map(
31+
|r| (r.start..r.end).into_iter().collect::<Vec<u32>>()
32+
).collect::<Vec<u32>>();
33+
34+
println!("there are {} points", points.len());
35+
36+
// how many distinct ranges need to be counted?
37+
let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
38+
for point in points {
39+
// assert that there is no whitespace over the 0x3000 range.
40+
assert!(point <= 0x3000, "the highest unicode whitespace value has changed");
41+
let high_bytes = point as usize >> 8;
42+
let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new);
43+
codepoints.push(point);
44+
}
45+
46+
let mut bit_for_high_byte = 1u8;
47+
let mut arms = Vec::<String>::new();
48+
49+
let mut high_bytes: Vec<usize> = codepoints_by_high_bytes.keys().map(|k| k.clone()).collect();
50+
high_bytes.sort();
51+
for high_byte in high_bytes {
52+
let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
53+
if codepoints.len() == 1 {
54+
let ch = codepoints.pop().unwrap();
55+
arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch));
56+
continue;
57+
}
58+
// more than 1 codepoint in this arm
59+
for codepoint in codepoints {
60+
map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
61+
}
62+
arms.push(format!(
63+
"{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0",
64+
high_byte,
65+
bit_for_high_byte)
66+
);
67+
bit_for_high_byte <<= 1;
68+
}
69+
70+
writeln!(
71+
&mut self.file,
72+
"static WHITESPACE_MAP: [u8; 256] = [{}];",
73+
fmt_list(map.iter())
74+
)
75+
.unwrap();
76+
self.bytes_used += 256;
77+
78+
79+
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
80+
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
81+
for arm in arms {
82+
writeln!(&mut self.file, " {},", arm).unwrap();
83+
}
84+
writeln!(&mut self.file, " _ => false,").unwrap();
85+
writeln!(&mut self.file, " }}").unwrap();
86+
writeln!(&mut self.file, "}}").unwrap();
87+
88+
true
89+
}
90+
}

src/tools/unicode-table-generator/src/main.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,10 @@ use ucd_parse::Codepoints;
7878
mod case_mapping;
7979
mod raw_emitter;
8080
mod skiplist;
81+
mod cascading_map;
8182
mod unicode_download;
8283

83-
use raw_emitter::{emit_codepoints, RawEmitter};
84+
use raw_emitter::{emit_codepoints, emit_whitespace, RawEmitter};
8485

8586
static PROPERTIES: &[&str] = &[
8687
"Alphabetic",
@@ -241,8 +242,13 @@ fn main() {
241242
let mut modules = Vec::new();
242243
for (property, ranges) in ranges_by_property {
243244
let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
245+
244246
let mut emitter = RawEmitter::new();
245-
emit_codepoints(&mut emitter, &ranges);
247+
if property == &"White_Space" {
248+
emit_whitespace(&mut emitter, &ranges);
249+
} else {
250+
emit_codepoints(&mut emitter, &ranges);
251+
}
246252

247253
modules.push((property.to_lowercase().to_string(), emitter.file));
248254
println!(

src/tools/unicode-table-generator/src/raw_emitter.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,16 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
170170
}
171171
}
172172

173+
pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
174+
emitter.blank_line();
175+
176+
let mut cascading = emitter.clone();
177+
cascading.emit_cascading_map(&ranges);
178+
*emitter = cascading;
179+
emitter.desc = String::from("cascading");
180+
181+
}
182+
173183
struct Canonicalized {
174184
canonical_words: Vec<u64>,
175185
canonicalized_words: Vec<(u8, u8)>,

0 commit comments

Comments
 (0)