Skip to content

Commit 7e8d52f

Browse files
committed
Context 0.1: Introducing SingleLookupHuffmanTable
This table provides the best performance, but because of its memory cost it is only reasonable to use for small bit lengths. We'll implement a more space-efficient (but not as fast) MultiLookupHuffmanTable in a follow-up patch.
1 parent b55f745 commit 7e8d52f

File tree

2 files changed

+455
-40
lines changed

2 files changed

+455
-40
lines changed

Diff for: crates/binjs_io/src/context/huffman.rs renamed to crates/binjs_io/src/context/huffman/mod.rs

+147-40
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
use io::statistics::Instances;
22

3+
use std::borrow::Cow;
34
use std::cmp::Ordering;
45
use std::collections::{BinaryHeap, HashMap};
56
use std::hash::Hash;
67

8+
/// Reading from bitstreams and decoding their contents using Huffman tables.
9+
pub mod read;
10+
711
/// A newtype for `u8` used to count the length of a key in bits.
812
#[derive(
913
Debug,
@@ -25,6 +29,11 @@ use std::hash::Hash;
2529
Eq,
2630
)]
2731
pub struct BitLen(u8);
32+
impl BitLen {
33+
pub fn as_u8(&self) -> u8 {
34+
self.0
35+
}
36+
}
2837

2938
/// Convenience implementation of operator `<<` in
3039
/// `bits << bit_len`
@@ -34,6 +43,12 @@ impl std::ops::Shl<BitLen> for u32 {
3443
self << Into::<u8>::into(rhs)
3544
}
3645
}
46+
impl std::ops::Shl<BitLen> for usize {
47+
type Output = usize;
48+
fn shl(self, rhs: BitLen) -> usize {
49+
self << Into::<u8>::into(rhs)
50+
}
51+
}
3752

3853
/// Convenience implementation of operator `>>` in
3954
/// `bits >> bit_len`
@@ -43,56 +58,125 @@ impl std::ops::Shr<BitLen> for u32 {
4358
self >> Into::<u8>::into(rhs)
4459
}
4560
}
61+
impl std::ops::Shr<BitLen> for usize {
62+
type Output = usize;
63+
fn shr(self, rhs: BitLen) -> usize {
64+
self >> Into::<u8>::into(rhs)
65+
}
66+
}
4667

4768
/// The largest acceptable length for a key.
4869
///
4970
/// Hardcoded in the format.
5071
const MAX_CODE_BIT_LENGTH: u8 = 20;
5172

52-
// privacy barrier
53-
mod key {
54-
use context::huffman::BitLen;
55-
56-
/// A Huffman key
57-
#[derive(Debug)]
58-
pub struct Key {
59-
/// The bits in the key.
60-
///
61-
/// Note that we only use the `bit_len` lowest-weight bits.
62-
/// Any other bit MUST BE 0.
73+
/// A sequence of bits, read from a bit stream.
74+
///
75+
/// Typically used for lookup of entries in Huffman tables.
76+
#[derive(Clone, Debug, PartialEq, Eq)]
77+
pub struct BitSequence {
6378
bits: u32,
64-
65-
/// The number of bits of `bits` to use.
6679
bit_len: BitLen,
6780
}
68-
impl Key {
69-
/// Create a new Key.
81+
impl BitSequence {
7082
pub fn new(bits: u32, bit_len: BitLen) -> Self {
71-
debug_assert!({let bit_len : u8 = bit_len.into(); bit_len <= 32});
72-
debug_assert!({let bit_len : u8 = bit_len.into(); if bit_len < 32 { bits >> bit_len == 0 } else { true }});
73-
Key {
74-
bits,
75-
bit_len,
83+
Self { bits, bit_len }
84+
}
85+
pub fn bits(&self) -> u32 {
86+
self.bits
87+
}
88+
/// The number of bits of `bits` to use.
89+
pub fn bit_len(&self) -> BitLen {
90+
self.bit_len
91+
}
92+
/// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
93+
/// bits.
94+
///
95+
/// # Failure
96+
///
97+
/// This function panics if `bit_len > self.bit_len`.
98+
pub fn split(&self, bit_len: BitLen) -> (u32, u32) {
99+
let shift = self.bit_len - bit_len;
100+
match shift.into() {
101+
0u8 => (self.bits, 0), // Special case: cannot >> 32
102+
32u8 => (0, self.bits), // Special case: cannot >> 32
103+
shift => (
104+
self.bits >> shift,
105+
self.bits & (std::u32::MAX >> 32 - shift),
106+
),
76107
}
77108
}
109+
pub fn pad_lowest_to(&self, total_bit_len: BitLen) -> Cow<BitSequence> {
110+
assert!(total_bit_len.0 <= 32u8);
111+
if total_bit_len <= self.bit_len {
112+
return Cow::Borrowed(self);
113+
}
114+
let shift = total_bit_len - self.bit_len;
115+
if shift.0 == 32u8 {
116+
return Cow::Owned(BitSequence::new(0, BitLen(32)));
117+
}
118+
Cow::Owned(BitSequence::new(self.bits << shift, total_bit_len))
119+
}
120+
}
78121

79-
/// The bits in the key.
122+
#[test]
123+
fn test_bit_sequence_split() {
124+
let bits = 0b11111111_11111111_00000000_00000000;
125+
let key = BitSequence::new(bits, BitLen(32));
126+
assert_eq!(key.split(BitLen(0)), (0, bits));
127+
assert_eq!(key.split(BitLen(32)), (bits, 0));
128+
assert_eq!(key.split(BitLen(16)), (0b11111111_11111111, 0));
129+
130+
let bits = 0b00000000_00000000_00000000_11111111;
131+
let key = BitSequence::new(bits, BitLen(16));
132+
assert_eq!(key.split(BitLen(0)), (0, bits));
133+
assert_eq!(key.split(BitLen(16)), (bits, 0));
134+
assert_eq!(key.split(BitLen(8)), (0, 0b11111111));
135+
}
136+
137+
/// A Huffman key
138+
#[derive(Clone, Debug, PartialEq, Eq)]
139+
pub struct Key(BitSequence);
140+
141+
impl Key {
142+
/// Create a new Key.
80143
///
81144
/// Note that we only use the `bit_len` lowest-weight bits.
82-
/// Any other bit is guaranteed to be 0.
145+
/// Any other bit MUST BE 0.
146+
pub fn new(bits: u32, bit_len: BitLen) -> Self {
147+
debug_assert!({
148+
let bit_len: u8 = bit_len.into();
149+
bit_len <= 32
150+
});
151+
debug_assert!({
152+
let bit_len: u8 = bit_len.into();
153+
if bit_len < 32 {
154+
bits >> bit_len == 0
155+
} else {
156+
true
157+
}
158+
});
159+
Key(BitSequence { bits, bit_len })
160+
}
161+
162+
/// The bits in this Key.
163+
///
164+
/// # Invariant
165+
///
166+
/// Only the `self.bit_len()` lowest-weight bits may be non-0.
83167
pub fn bits(&self) -> u32 {
84-
self.bits
168+
self.0.bits
85169
}
86170

87171
/// The number of bits of `bits` to use.
88172
pub fn bit_len(&self) -> BitLen {
89-
self.bit_len
173+
self.0.bit_len
90174
}
91-
}
92-
93-
} // mod key
94175

95-
use self::key::Key;
176+
pub fn as_bit_sequence(&self) -> &BitSequence {
177+
&self.0
178+
}
179+
}
96180

97181
/// A node in the Huffman tree.
98182
struct Node<T> {
@@ -136,17 +220,34 @@ impl<T> PartialEq for Node<T> {
136220
impl<T> Eq for Node<T> {}
137221

138222
/// Keys associated to a sequence of values.
139-
#[derive(Debug)]
140-
pub struct Keys<T>
141-
where
142-
T: Ord + Clone,
143-
{
223+
#[derive(Clone, Debug)]
224+
pub struct Keys<T> {
225+
/// The longest bit length that actually appears in `keys`.
226+
highest_bit_len: BitLen,
227+
144228
/// The sequence of keys.
145229
///
146230
/// Order is meaningful.
147231
keys: Vec<(T, Key)>,
148232
}
149233

234+
impl<T> Keys<T> {
235+
pub fn len(&self) -> usize {
236+
self.keys.len()
237+
}
238+
pub fn highest_bit_len(&self) -> BitLen {
239+
self.highest_bit_len
240+
}
241+
}
242+
243+
impl<T> IntoIterator for Keys<T> {
244+
type Item = (T, Key);
245+
type IntoIter = std::vec::IntoIter<(T, Key)>;
246+
fn into_iter(self) -> Self::IntoIter {
247+
self.keys.into_iter()
248+
}
249+
}
250+
150251
impl<T> Keys<T>
151252
where
152253
T: Ord + Clone,
@@ -155,12 +256,12 @@ where
155256
///
156257
/// Optionally, `max_bit_len` may specify a largest acceptable bit length.
157258
/// If `Keys` cannot be computed without exceeding this bit length,
158-
/// fail with `Err(problemantic_bit_length)`.
259+
/// fail with `Err(problematic_bit_len)`.
159260
///
160261
/// The current implementation only attempts to produce the best compression
161-
/// level. This may cause us to exceed `max_bit_length` even though an
262+
/// level. This may cause us to exceed `max_bit_len` even though an
162263
/// alternative table, with a lower compression level, would let us
163-
/// proceed without exceeding `max_bit_length`.
264+
/// proceed without exceeding `max_bit_len`.
164265
///
165266
/// # Performance
166267
///
@@ -185,9 +286,9 @@ where
185286
/// with a number of instances already attached.
186287
///
187288
/// The current implementation only attempts to produce the best compression
188-
/// level. This may cause us to exceed `max_bit_length` even though an
289+
/// level. This may cause us to exceed `max_bit_len` even though an
189290
/// alternative table, with a lower compression level, would let us
190-
/// proceed without exceeding `max_bit_length`.
291+
/// proceed without exceeding `max_bit_len`.
191292
///
192293
/// # Requirement
193294
///
@@ -197,9 +298,9 @@ where
197298
S: IntoIterator<Item = (T, Instances)>,
198299
{
199300
let mut bit_lengths = Self::compute_bit_lengths(source, max_bit_len)?;
301+
let mut highest_bit_len = BitLen(0);
200302

201303
// Canonicalize order: (BitLen, T)
202-
// As values of `T` are
203304
bit_lengths.sort_unstable_by_key(|&(ref value, ref bit_len)| (*bit_len, value.clone()));
204305

205306
// The bits associated to the next value.
@@ -214,12 +315,18 @@ where
214315
);
215316
keys.push((symbol.clone(), Key::new(bits, bit_len)));
216317
bits = (bits + 1) << (next_bit_len - bit_len);
318+
if bit_len > highest_bit_len {
319+
highest_bit_len = bit_len;
320+
}
217321
}
218322
// Handle the last element.
219323
let (ref symbol, bit_len) = bit_lengths[bit_lengths.len() - 1];
220324
keys.push((symbol.clone(), Key::new(bits, bit_len)));
221325

222-
return Ok(Self { keys });
326+
return Ok(Self {
327+
highest_bit_len,
328+
keys,
329+
});
223330
}
224331

225332
/// Convert a sequence of values labelled by their number of instances

0 commit comments

Comments
 (0)