@@ -10,6 +10,7 @@ pub mod read;
10
10
11
11
/// A newtype for `u8` used to count the length of a key in bits.
12
12
#[ derive(
13
+ Constructor ,
13
14
Debug ,
14
15
Default ,
15
16
Display ,
@@ -82,30 +83,63 @@ impl BitSequence {
82
83
pub fn new ( bits : u32 , bit_len : BitLen ) -> Self {
83
84
Self { bits, bit_len }
84
85
}
86
+
85
87
pub fn bits ( & self ) -> u32 {
86
88
self . bits
87
89
}
90
+
88
91
/// The number of bits of `bits` to use.
89
92
pub fn bit_len ( & self ) -> BitLen {
90
93
self . bit_len
91
94
}
95
+
96
+ /// Split the bits into a prefix of `bit_len` bits and a suffix containing the
97
+ /// remaining bits.
98
+ ///
99
+ /// If `bit_len` is larger than the number of bits, the prefix is padded with
100
+ /// lower-weight bits into `bit_len` bits.
101
+ pub fn split_bits ( & self , bit_len : BitLen ) -> ( u32 , u32 ) {
102
+ debug_assert ! ( bit_len. as_u8( ) <= 32 ) ;
103
+ if self . bit_len <= bit_len {
104
+ let padding = bit_len - self . bit_len ;
105
+ ( self . bits << padding, 0 )
106
+ } else {
107
+ let shift = self . bit_len - bit_len;
108
+ match shift. into ( ) {
109
+ 32u8 => ( 0 , self . bits ) , // Special case: cannot >> 32
110
+ shift => (
111
+ self . bits >> shift,
112
+ self . bits & ( std:: u32:: MAX >> 32 - shift) ,
113
+ ) ,
114
+ }
115
+ }
116
+ }
117
+
92
118
/// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
93
119
/// bits.
94
120
///
95
121
/// # Failure
96
122
///
97
123
/// This function panics if `bit_len > self.bit_len`.
98
- pub fn split ( & self , bit_len : BitLen ) -> ( u32 , u32 ) {
99
- let shift = self . bit_len - bit_len;
100
- match shift. into ( ) {
101
- 0u8 => ( self . bits , 0 ) , // Special case: cannot >> 32
102
- 32u8 => ( 0 , self . bits ) , // Special case: cannot >> 32
103
- shift => (
104
- self . bits >> shift,
105
- self . bits & ( std:: u32:: MAX >> 32 - shift) ,
124
+ pub fn split ( & self , bit_len : BitLen ) -> ( BitSequence , BitSequence ) {
125
+ let ( prefix, suffix) = self . split_bits ( bit_len) ;
126
+ (
127
+ BitSequence :: new ( prefix, bit_len) ,
128
+ BitSequence :: new (
129
+ suffix,
130
+ if self . bit_len >= bit_len {
131
+ self . bit_len - bit_len
132
+ } else {
133
+ BitLen :: new ( 0 )
134
+ } ,
106
135
) ,
107
- }
136
+ )
108
137
}
138
+
139
+ /// Add lowest-weight bits to this bit sequence until it reaches
140
+ /// a sufficient bit length.
141
+ ///
142
+ /// Does nothing if the bit sequence already has a sufficient bitlength.
109
143
pub fn pad_lowest_to ( & self , total_bit_len : BitLen ) -> Cow < BitSequence > {
110
144
assert ! ( total_bit_len. 0 <= 32u8 ) ;
111
145
if total_bit_len <= self . bit_len {
@@ -117,21 +151,93 @@ impl BitSequence {
117
151
}
118
152
Cow :: Owned ( BitSequence :: new ( self . bits << shift, total_bit_len) )
119
153
}
154
+
155
+ /// Prepend a sequence of bits to a sequencce.s
156
+ pub fn with_prefix ( & self , prefix : & BitSequence ) -> Self {
157
+ assert ! ( ( prefix. bit_len( ) + self . bit_len( ) ) . as_u8( ) <= 32 ) ;
158
+ let bits = self . bits | ( prefix. bits ( ) << self . bit_len ) ;
159
+ let bit_len = self . bit_len + prefix. bit_len ;
160
+ BitSequence :: new ( bits, bit_len)
161
+ }
162
+
163
+ /// Return a range representing all possible suffixes of this `BitSequence`
164
+ /// containing exactly `bit_len` bits.
165
+ ///
166
+ /// If this `BitSequence` is already at least `bit_len` bits long, we
167
+ /// truncate the `BitSequence` to `bit_len` bits by removing the
168
+ /// lower-weight bits and there is only one such suffix.
169
+ ///
170
+ /// ```
171
+ /// use binjs_io::context::huffman::{ BitLen, BitSequence };
172
+ ///
173
+ /// let zero = BitSequence::new(0, BitLen::new(0));
174
+ ///
175
+ /// let range = zero.suffixes(BitLen::new(0));
176
+ /// assert_eq!(range, 0..1);
177
+ ///
178
+ /// let range = zero.suffixes(BitLen::new(2));
179
+ /// assert_eq!(range, 0..4);
180
+ ///
181
+ /// let range = zero.suffixes(BitLen::new(3));
182
+ /// assert_eq!(range, 0..8);
183
+ ///
184
+ /// let range = zero.suffixes(BitLen::new(4));
185
+ /// assert_eq!(range, 0..16);
186
+ ///
187
+ /// let sequence = BitSequence::new(0b00000100, BitLen::new(3));
188
+ ///
189
+ /// let range = sequence.suffixes(BitLen::new(0));
190
+ /// assert_eq!(range, 0..1);
191
+ ///
192
+ /// let range = sequence.suffixes(BitLen::new(2));
193
+ /// assert_eq!(range, 2..3);
194
+ ///
195
+ /// let range = sequence.suffixes(BitLen::new(3));
196
+ /// assert_eq!(range, 4..5);
197
+ ///
198
+ /// let range = sequence.suffixes(BitLen::new(4));
199
+ /// assert_eq!(range, 8..10); // 0b000001000 to 0b00001001 included
200
+ /// ```
201
+ pub fn suffixes ( & self , bit_len : BitLen ) -> std:: ops:: Range < u32 > {
202
+ debug_assert ! ( bit_len. as_u8( ) as usize <= 8 * std:: mem:: size_of_val( & self . bits( ) ) ) ;
203
+ debug_assert ! (
204
+ std:: mem:: size_of_val( & self . bits( ) ) == std:: mem:: size_of:: <u32 >( ) ,
205
+ "The arithmetics relies upon the fact that we're only using `u32` for Huffman keys"
206
+ ) ;
207
+ let ( first, last) = if bit_len <= self . bit_len ( ) {
208
+ // We have too many bits, we need to truncate the bits,
209
+ // then return a single element.
210
+ let shearing: u8 = ( self . bit_len ( ) - bit_len) . as_u8 ( ) ;
211
+ let first = if shearing == 32 {
212
+ 0
213
+ } else {
214
+ self . bits ( ) >> shearing
215
+ } ;
216
+ ( first, first)
217
+ } else {
218
+ // We need to pad with lower-weight 0s.
219
+ let padding: u8 = ( bit_len - self . bit_len ( ) ) . as_u8 ( ) ;
220
+ let first = self . bits ( ) << padding;
221
+ let len = std:: u32:: MAX >> ( 8 * std:: mem:: size_of :: < u32 > ( ) as u8 - padding) ;
222
+ ( first, first + len)
223
+ } ;
224
+ first..( last + 1 )
225
+ }
120
226
}
121
227
122
228
#[test]
fn test_bit_sequence_split() {
    // A full 32-bit sequence whose upper half is all ones.
    let bits = 0b11111111_11111111_00000000_00000000;
    let key = BitSequence::new(bits, BitLen(32));
    let expectations = [
        (0, (0, bits)),
        (32, (bits, 0)),
        (16, (0b11111111_11111111, 0)),
    ];
    for &(prefix_len, expected) in expectations.iter() {
        assert_eq!(key.split_bits(BitLen(prefix_len)), expected);
    }

    // A 16-bit sequence whose lower half is all ones.
    let bits = 0b00000000_00000000_00000000_11111111;
    let key = BitSequence::new(bits, BitLen(16));
    let expectations = [
        (0, (0, bits)),
        (16, (bits, 0)),
        (8, (0, 0b11111111)),
    ];
    for &(prefix_len, expected) in expectations.iter() {
        assert_eq!(key.split_bits(BitLen(prefix_len)), expected);
    }
}
136
242
137
243
/// A Huffman key
@@ -159,6 +265,10 @@ impl Key {
159
265
Key ( BitSequence { bits, bit_len } )
160
266
}
161
267
268
+ pub fn from_bit_sequence ( sequence : BitSequence ) -> Self {
269
+ Self :: new ( sequence. bits , sequence. bit_len )
270
+ }
271
+
162
272
/// The bits in this Key.
163
273
///
164
274
/// # Invariant
@@ -176,6 +286,11 @@ impl Key {
176
286
pub fn as_bit_sequence ( & self ) -> & BitSequence {
177
287
& self . 0
178
288
}
289
+
290
+ pub fn with_prefix ( & self , prefix : & BitSequence ) -> Self {
291
+ let sequence = self . 0 . with_prefix ( prefix) ;
292
+ Key :: from_bit_sequence ( sequence)
293
+ }
179
294
}
180
295
181
296
/// A node in the Huffman tree.
@@ -219,43 +334,46 @@ impl<T> PartialEq for Node<T> {
219
334
}
220
335
// `Eq` is a marker trait with no methods: this impl only asserts that the equality
// defined by the `PartialEq` impl for `Node<T>` is a full equivalence relation.
impl < T > Eq for Node < T > { }
221
336
222
- /// Keys associated to a sequence of values.
337
+ /// Codebook associated to a sequence of values.
223
338
#[ derive( Clone , Debug ) ]
224
- pub struct Keys < T > {
225
- /// The longest bit length that actually appears in `keys `.
339
+ pub struct Codebook < T > {
340
+ /// The longest bit length that actually appears in `mappings `.
226
341
highest_bit_len : BitLen ,
227
342
228
343
/// The sequence of keys.
229
344
///
230
345
/// Order is meaningful.
231
- keys : Vec < ( T , Key ) > ,
346
+ mappings : Vec < ( T , Key ) > ,
232
347
}
233
348
234
- impl < T > Keys < T > {
349
+ impl < T > Codebook < T > {
350
+ /// The number of elements in this Codebook.
235
351
pub fn len ( & self ) -> usize {
236
- self . keys . len ( )
352
+ self . mappings . len ( )
237
353
}
354
+
355
+ /// The longest bit length that acctually appears in this Codebook.
238
356
pub fn highest_bit_len ( & self ) -> BitLen {
239
357
self . highest_bit_len
240
358
}
241
359
}
242
360
243
- impl < T > IntoIterator for Keys < T > {
361
+ impl < T > IntoIterator for Codebook < T > {
244
362
type Item = ( T , Key ) ;
245
363
type IntoIter = std:: vec:: IntoIter < ( T , Key ) > ;
246
364
fn into_iter ( self ) -> Self :: IntoIter {
247
- self . keys . into_iter ( )
365
+ self . mappings . into_iter ( )
248
366
}
249
367
}
250
368
251
- impl < T > Keys < T >
369
+ impl < T > Codebook < T >
252
370
where
253
371
T : Ord + Clone ,
254
372
{
255
- /// Compute a `Keys ` from a sequence of values.
373
+ /// Compute a `Codebook ` from a sequence of values.
256
374
///
257
375
/// Optionally, `max_bit_len` may specify a largest acceptable bit length.
258
- /// If `Keys ` may not be computed without exceeding this bit length,
376
+ /// If the `Codebook ` may not be computed without exceeding this bit length,
259
377
/// fail with `Err(problematic_bit_len)`.
260
378
///
261
379
/// The current implementation only attempts to produce the best compression
@@ -278,11 +396,11 @@ where
278
396
let counter = map. entry ( item) . or_insert ( 0 . into ( ) ) ;
279
397
* counter += 1 . into ( ) ;
280
398
}
281
- // Then compute the `Keys `.
399
+ // Then compute the `Codebook `.
282
400
Self :: from_instances ( map, max_bit_len)
283
401
}
284
402
285
- /// Compute a `Keys ` from a sequence of values
403
+ /// Compute a `Codebook ` from a sequence of values
286
404
/// with a number of instances already attached.
287
405
///
288
406
/// The current implementation only attempts to produce the best compression
@@ -305,27 +423,27 @@ where
305
423
306
424
// The bits associated to the next value.
307
425
let mut bits = 0 ;
308
- let mut keys = Vec :: with_capacity ( bit_lengths. len ( ) ) ;
426
+ let mut mappings = Vec :: with_capacity ( bit_lengths. len ( ) ) ;
309
427
310
428
for i in 0 ..bit_lengths. len ( ) - 1 {
311
429
let ( bit_len, symbol, next_bit_len) = (
312
430
bit_lengths[ i] . 1 ,
313
431
bit_lengths[ i] . 0 . clone ( ) ,
314
432
bit_lengths[ i + 1 ] . 1 ,
315
433
) ;
316
- keys . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
434
+ mappings . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
317
435
bits = ( bits + 1 ) << ( next_bit_len - bit_len) ;
318
436
if bit_len > highest_bit_len {
319
437
highest_bit_len = bit_len;
320
438
}
321
439
}
322
440
// Handle the last element.
323
441
let ( ref symbol, bit_len) = bit_lengths[ bit_lengths. len ( ) - 1 ] ;
324
- keys . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
442
+ mappings . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
325
443
326
444
return Ok ( Self {
327
445
highest_bit_len,
328
- keys ,
446
+ mappings ,
329
447
} ) ;
330
448
}
331
449
@@ -412,26 +530,73 @@ where
412
530
#[test]
fn test_coded_from_sequence() {
    let sample = "appl";
    let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();
    let mappings = &coded.mappings;

    // Symbol 'p' appears twice, we should see 3 codes.
    assert_eq!(mappings.len(), 3);
    let (first, second, third) = (&mappings[0], &mappings[1], &mappings[2]);

    // Check order of symbols.
    assert_eq!(first.0, 'p');
    assert_eq!(second.0, 'a');
    assert_eq!(third.0, 'l');

    // Check bit length of symbols.
    assert_eq!(first.1.bit_len(), 1.into());
    assert_eq!(second.1.bit_len(), 2.into());
    assert_eq!(third.1.bit_len(), 2.into());

    // Check code of symbols.
    assert_eq!(first.1.bits(), 0b00);
    assert_eq!(second.1.bits(), 0b10);
    assert_eq!(third.1.bits(), 0b11);

    // Let's try again with a limit to 1 bit paths.
    assert_eq!(Codebook::from_sequence(sample.chars(), 1).unwrap_err(), 2);
}
556
+
557
+ impl < T > Codebook < T > {
558
+ /// Create an empty Codebook
559
+ pub fn new ( ) -> Self {
560
+ Self {
561
+ highest_bit_len : BitLen :: new ( 0 ) ,
562
+ mappings : vec ! [ ] ,
563
+ }
564
+ }
565
+
566
+ /// Create an empty Codebook
567
+ pub fn with_capacity ( len : usize ) -> Self {
568
+ Self {
569
+ highest_bit_len : BitLen :: new ( 0 ) ,
570
+ mappings : Vec :: with_capacity ( len) ,
571
+ }
572
+ }
573
+
574
+ /// Add a mapping to a Codebook.
575
+ ///
576
+ /// This method does **not** check that the resulting Codebook is correct.
577
+ pub unsafe fn add_mapping ( & mut self , value : T , key : Key ) {
578
+ if key. bit_len ( ) > self . highest_bit_len {
579
+ self . highest_bit_len = key. bit_len ( ) ;
580
+ }
581
+ self . mappings . push ( ( value, key) ) ;
582
+ }
583
+
584
+ /// Return the mappings of a Codebook.
585
+ pub fn mappings ( self ) -> Vec < ( T , Key ) > {
586
+ self . mappings
587
+ }
588
+
589
+ pub fn map < F , U > ( self , mut f : F ) -> Codebook < U >
590
+ where
591
+ F : FnMut ( T ) -> U ,
592
+ {
593
+ Codebook {
594
+ highest_bit_len : self . highest_bit_len ,
595
+ mappings : self
596
+ . mappings
597
+ . into_iter ( )
598
+ . map ( |( value, key) | ( f ( value) , key) )
599
+ . collect ( ) ,
600
+ }
601
+ }
437
602
}
0 commit comments