Skip to content

Commit 1cf92cf

Browse files
committed
Context 0.1: MultiLookupHuffmanTable
1 parent 7e8d52f commit 1cf92cf

File tree

3 files changed

+533
-77
lines changed

3 files changed

+533
-77
lines changed

crates/binjs_io/src/context/huffman/mod.rs

+209-44
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ pub mod read;
1010

1111
/// A newtype for `u8` used to count the length of a key in bits.
1212
#[derive(
13+
Constructor,
1314
Debug,
1415
Default,
1516
Display,
@@ -82,30 +83,63 @@ impl BitSequence {
8283
pub fn new(bits: u32, bit_len: BitLen) -> Self {
8384
Self { bits, bit_len }
8485
}
86+
8587
pub fn bits(&self) -> u32 {
8688
self.bits
8789
}
90+
8891
/// The number of bits of `bits` to use.
8992
pub fn bit_len(&self) -> BitLen {
9093
self.bit_len
9194
}
95+
96+
/// Split the bits into a prefix of `bit_len` bits and a suffix containing the
97+
/// remaining bits.
98+
///
99+
/// If `bit_len` is larger than the number of bits, the prefix is padded with
100+
/// lower-weight bits into `bit_len` bits.
101+
pub fn split_bits(&self, bit_len: BitLen) -> (u32, u32) {
102+
debug_assert!(bit_len.as_u8() <= 32);
103+
if self.bit_len <= bit_len {
104+
let padding = bit_len - self.bit_len;
105+
(self.bits << padding, 0)
106+
} else {
107+
let shift = self.bit_len - bit_len;
108+
match shift.into() {
109+
32u8 => (0, self.bits), // Special case: cannot >> 32
110+
shift => (
111+
self.bits >> shift,
112+
self.bits & (std::u32::MAX >> 32 - shift),
113+
),
114+
}
115+
}
116+
}
117+
92118
/// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
93119
/// bits.
94120
///
95121
/// # Failure
96122
///
97123
/// This function panics if `bit_len > self.bit_len`.
98-
pub fn split(&self, bit_len: BitLen) -> (u32, u32) {
99-
let shift = self.bit_len - bit_len;
100-
match shift.into() {
101-
0u8 => (self.bits, 0), // Special case: cannot >> 32
102-
32u8 => (0, self.bits), // Special case: cannot >> 32
103-
shift => (
104-
self.bits >> shift,
105-
self.bits & (std::u32::MAX >> 32 - shift),
124+
pub fn split(&self, bit_len: BitLen) -> (BitSequence, BitSequence) {
125+
let (prefix, suffix) = self.split_bits(bit_len);
126+
(
127+
BitSequence::new(prefix, bit_len),
128+
BitSequence::new(
129+
suffix,
130+
if self.bit_len >= bit_len {
131+
self.bit_len - bit_len
132+
} else {
133+
BitLen::new(0)
134+
},
106135
),
107-
}
136+
)
108137
}
138+
139+
/// Add lowest-weight bits to this bit sequence until it reaches
140+
/// a sufficient bit length.
141+
///
142+
/// Does nothing if the bit sequence already has a sufficient bit length.
109143
pub fn pad_lowest_to(&self, total_bit_len: BitLen) -> Cow<BitSequence> {
110144
assert!(total_bit_len.0 <= 32u8);
111145
if total_bit_len <= self.bit_len {
@@ -117,21 +151,93 @@ impl BitSequence {
117151
}
118152
Cow::Owned(BitSequence::new(self.bits << shift, total_bit_len))
119153
}
154+
155+
/// Prepend a sequence of bits to this sequence.
156+
pub fn with_prefix(&self, prefix: &BitSequence) -> Self {
157+
assert!((prefix.bit_len() + self.bit_len()).as_u8() <= 32);
158+
let bits = self.bits | (prefix.bits() << self.bit_len);
159+
let bit_len = self.bit_len + prefix.bit_len;
160+
BitSequence::new(bits, bit_len)
161+
}
162+
163+
/// Return a range representing all possible suffixes of this `BitSequence`
164+
/// containing exactly `bit_len` bits.
165+
///
166+
/// If this `BitSequence` is already at least `bit_len` bits long, we
167+
/// truncate the `BitSequence` to `bit_len` bits by removing the
168+
/// lower-weight bits and there is only one such suffix.
169+
///
170+
/// ```
171+
/// use binjs_io::context::huffman::{ BitLen, BitSequence };
172+
///
173+
/// let zero = BitSequence::new(0, BitLen::new(0));
174+
///
175+
/// let range = zero.suffixes(BitLen::new(0));
176+
/// assert_eq!(range, 0..1);
177+
///
178+
/// let range = zero.suffixes(BitLen::new(2));
179+
/// assert_eq!(range, 0..4);
180+
///
181+
/// let range = zero.suffixes(BitLen::new(3));
182+
/// assert_eq!(range, 0..8);
183+
///
184+
/// let range = zero.suffixes(BitLen::new(4));
185+
/// assert_eq!(range, 0..16);
186+
///
187+
/// let sequence = BitSequence::new(0b00000100, BitLen::new(3));
188+
///
189+
/// let range = sequence.suffixes(BitLen::new(0));
190+
/// assert_eq!(range, 0..1);
191+
///
192+
/// let range = sequence.suffixes(BitLen::new(2));
193+
/// assert_eq!(range, 2..3);
194+
///
195+
/// let range = sequence.suffixes(BitLen::new(3));
196+
/// assert_eq!(range, 4..5);
197+
///
198+
/// let range = sequence.suffixes(BitLen::new(4));
199+
/// assert_eq!(range, 8..10); // 0b00001000 to 0b00001001 included
200+
/// ```
201+
pub fn suffixes(&self, bit_len: BitLen) -> std::ops::Range<u32> {
202+
debug_assert!(bit_len.as_u8() as usize <= 8 * std::mem::size_of_val(&self.bits()));
203+
debug_assert!(
204+
std::mem::size_of_val(&self.bits()) == std::mem::size_of::<u32>(),
205+
"The arithmetics relies upon the fact that we're only using `u32` for Huffman keys"
206+
);
207+
let (first, last) = if bit_len <= self.bit_len() {
208+
// We have too many bits, we need to truncate the bits,
209+
// then return a single element.
210+
let shearing: u8 = (self.bit_len() - bit_len).as_u8();
211+
let first = if shearing == 32 {
212+
0
213+
} else {
214+
self.bits() >> shearing
215+
};
216+
(first, first)
217+
} else {
218+
// We need to pad with lower-weight 0s.
219+
let padding: u8 = (bit_len - self.bit_len()).as_u8();
220+
let first = self.bits() << padding;
221+
let len = std::u32::MAX >> (8 * std::mem::size_of::<u32>() as u8 - padding);
222+
(first, first + len)
223+
};
224+
first..(last + 1)
225+
}
120226
}
121227

122228
#[test]
123229
fn test_bit_sequence_split() {
124230
let bits = 0b11111111_11111111_00000000_00000000;
125231
let key = BitSequence::new(bits, BitLen(32));
126-
assert_eq!(key.split(BitLen(0)), (0, bits));
127-
assert_eq!(key.split(BitLen(32)), (bits, 0));
128-
assert_eq!(key.split(BitLen(16)), (0b11111111_11111111, 0));
232+
assert_eq!(key.split_bits(BitLen(0)), (0, bits));
233+
assert_eq!(key.split_bits(BitLen(32)), (bits, 0));
234+
assert_eq!(key.split_bits(BitLen(16)), (0b11111111_11111111, 0));
129235

130236
let bits = 0b00000000_00000000_00000000_11111111;
131237
let key = BitSequence::new(bits, BitLen(16));
132-
assert_eq!(key.split(BitLen(0)), (0, bits));
133-
assert_eq!(key.split(BitLen(16)), (bits, 0));
134-
assert_eq!(key.split(BitLen(8)), (0, 0b11111111));
238+
assert_eq!(key.split_bits(BitLen(0)), (0, bits));
239+
assert_eq!(key.split_bits(BitLen(16)), (bits, 0));
240+
assert_eq!(key.split_bits(BitLen(8)), (0, 0b11111111));
135241
}
136242

137243
/// A Huffman key
@@ -159,6 +265,10 @@ impl Key {
159265
Key(BitSequence { bits, bit_len })
160266
}
161267

268+
pub fn from_bit_sequence(sequence: BitSequence) -> Self {
269+
Self::new(sequence.bits, sequence.bit_len)
270+
}
271+
162272
/// The bits in this Key.
163273
///
164274
/// # Invariant
@@ -176,6 +286,11 @@ impl Key {
176286
pub fn as_bit_sequence(&self) -> &BitSequence {
177287
&self.0
178288
}
289+
290+
pub fn with_prefix(&self, prefix: &BitSequence) -> Self {
291+
let sequence = self.0.with_prefix(prefix);
292+
Key::from_bit_sequence(sequence)
293+
}
179294
}
180295

181296
/// A node in the Huffman tree.
@@ -219,43 +334,46 @@ impl<T> PartialEq for Node<T> {
219334
}
220335
impl<T> Eq for Node<T> {}
221336

222-
/// Keys associated to a sequence of values.
337+
/// Codebook associated to a sequence of values.
223338
#[derive(Clone, Debug)]
224-
pub struct Keys<T> {
225-
/// The longest bit length that actually appears in `keys`.
339+
pub struct Codebook<T> {
340+
/// The longest bit length that actually appears in `mappings`.
226341
highest_bit_len: BitLen,
227342

228343
/// The sequence of keys.
229344
///
230345
/// Order is meaningful.
231-
keys: Vec<(T, Key)>,
346+
mappings: Vec<(T, Key)>,
232347
}
233348

234-
impl<T> Keys<T> {
349+
impl<T> Codebook<T> {
350+
/// The number of elements in this Codebook.
235351
pub fn len(&self) -> usize {
236-
self.keys.len()
352+
self.mappings.len()
237353
}
354+
355+
/// The longest bit length that actually appears in this Codebook.
238356
pub fn highest_bit_len(&self) -> BitLen {
239357
self.highest_bit_len
240358
}
241359
}
242360

243-
impl<T> IntoIterator for Keys<T> {
361+
impl<T> IntoIterator for Codebook<T> {
244362
type Item = (T, Key);
245363
type IntoIter = std::vec::IntoIter<(T, Key)>;
246364
fn into_iter(self) -> Self::IntoIter {
247-
self.keys.into_iter()
365+
self.mappings.into_iter()
248366
}
249367
}
250368

251-
impl<T> Keys<T>
369+
impl<T> Codebook<T>
252370
where
253371
T: Ord + Clone,
254372
{
255-
/// Compute a `Keys` from a sequence of values.
373+
/// Compute a `Codebook` from a sequence of values.
256374
///
257375
/// Optionally, `max_bit_len` may specify a largest acceptable bit length.
258-
/// If `Keys` may not be computed without exceeding this bit length,
376+
/// If the `Codebook` may not be computed without exceeding this bit length,
259377
/// fail with `Err(problematic_bit_len)`.
260378
///
261379
/// The current implementation only attempts to produce the best compression
@@ -278,11 +396,11 @@ where
278396
let counter = map.entry(item).or_insert(0.into());
279397
*counter += 1.into();
280398
}
281-
// Then compute the `Keys`.
399+
// Then compute the `Codebook`.
282400
Self::from_instances(map, max_bit_len)
283401
}
284402

285-
/// Compute a `Keys` from a sequence of values
403+
/// Compute a `Codebook` from a sequence of values
286404
/// with a number of instances already attached.
287405
///
288406
/// The current implementation only attempts to produce the best compression
@@ -305,27 +423,27 @@ where
305423

306424
// The bits associated to the next value.
307425
let mut bits = 0;
308-
let mut keys = Vec::with_capacity(bit_lengths.len());
426+
let mut mappings = Vec::with_capacity(bit_lengths.len());
309427

310428
for i in 0..bit_lengths.len() - 1 {
311429
let (bit_len, symbol, next_bit_len) = (
312430
bit_lengths[i].1,
313431
bit_lengths[i].0.clone(),
314432
bit_lengths[i + 1].1,
315433
);
316-
keys.push((symbol.clone(), Key::new(bits, bit_len)));
434+
mappings.push((symbol.clone(), Key::new(bits, bit_len)));
317435
bits = (bits + 1) << (next_bit_len - bit_len);
318436
if bit_len > highest_bit_len {
319437
highest_bit_len = bit_len;
320438
}
321439
}
322440
// Handle the last element.
323441
let (ref symbol, bit_len) = bit_lengths[bit_lengths.len() - 1];
324-
keys.push((symbol.clone(), Key::new(bits, bit_len)));
442+
mappings.push((symbol.clone(), Key::new(bits, bit_len)));
325443

326444
return Ok(Self {
327445
highest_bit_len,
328-
keys,
446+
mappings,
329447
});
330448
}
331449

@@ -412,26 +530,73 @@ where
412530
#[test]
413531
fn test_coded_from_sequence() {
414532
let sample = "appl";
415-
let coded = Keys::from_sequence(sample.chars(), std::u8::MAX).unwrap();
533+
let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();
416534

417535
// Symbol 'p' appears twice, we should see 3 codes.
418-
assert_eq!(coded.keys.len(), 3);
536+
assert_eq!(coded.mappings.len(), 3);
419537

420538
// Check order of symbols.
421-
assert_eq!(coded.keys[0].0, 'p');
422-
assert_eq!(coded.keys[1].0, 'a');
423-
assert_eq!(coded.keys[2].0, 'l');
539+
assert_eq!(coded.mappings[0].0, 'p');
540+
assert_eq!(coded.mappings[1].0, 'a');
541+
assert_eq!(coded.mappings[2].0, 'l');
424542

425543
// Check bit length of symbols.
426-
assert_eq!(coded.keys[0].1.bit_len(), 1.into());
427-
assert_eq!(coded.keys[1].1.bit_len(), 2.into());
428-
assert_eq!(coded.keys[2].1.bit_len(), 2.into());
544+
assert_eq!(coded.mappings[0].1.bit_len(), 1.into());
545+
assert_eq!(coded.mappings[1].1.bit_len(), 2.into());
546+
assert_eq!(coded.mappings[2].1.bit_len(), 2.into());
429547

430548
// Check code of symbols.
431-
assert_eq!(coded.keys[0].1.bits(), 0b00);
432-
assert_eq!(coded.keys[1].1.bits(), 0b10);
433-
assert_eq!(coded.keys[2].1.bits(), 0b11);
549+
assert_eq!(coded.mappings[0].1.bits(), 0b00);
550+
assert_eq!(coded.mappings[1].1.bits(), 0b10);
551+
assert_eq!(coded.mappings[2].1.bits(), 0b11);
434552

435553
// Let's try again with a limit to 1 bit paths.
436-
assert_eq!(Keys::from_sequence(sample.chars(), 1).unwrap_err(), 2);
554+
assert_eq!(Codebook::from_sequence(sample.chars(), 1).unwrap_err(), 2);
555+
}
556+
557+
impl<T> Codebook<T> {
558+
/// Create an empty Codebook
559+
pub fn new() -> Self {
560+
Self {
561+
highest_bit_len: BitLen::new(0),
562+
mappings: vec![],
563+
}
564+
}
565+
566+
/// Create an empty Codebook with storage preallocated for `len` mappings.
567+
pub fn with_capacity(len: usize) -> Self {
568+
Self {
569+
highest_bit_len: BitLen::new(0),
570+
mappings: Vec::with_capacity(len),
571+
}
572+
}
573+
574+
/// Add a mapping to a Codebook.
575+
///
576+
/// This method does **not** check that the resulting Codebook is correct.
577+
pub unsafe fn add_mapping(&mut self, value: T, key: Key) {
578+
if key.bit_len() > self.highest_bit_len {
579+
self.highest_bit_len = key.bit_len();
580+
}
581+
self.mappings.push((value, key));
582+
}
583+
584+
/// Return the mappings of a Codebook.
585+
pub fn mappings(self) -> Vec<(T, Key)> {
586+
self.mappings
587+
}
588+
589+
pub fn map<F, U>(self, mut f: F) -> Codebook<U>
590+
where
591+
F: FnMut(T) -> U,
592+
{
593+
Codebook {
594+
highest_bit_len: self.highest_bit_len,
595+
mappings: self
596+
.mappings
597+
.into_iter()
598+
.map(|(value, key)| (f(value), key))
599+
.collect(),
600+
}
601+
}
437602
}

0 commit comments

Comments
 (0)