From 42d3f22fb981ca0be5b1cf4d071004241764be03 Mon Sep 17 00:00:00 2001 From: Takeru Ohta Date: Thu, 6 Oct 2016 03:31:31 +0900 Subject: [PATCH] First commit --- .gitignore | 2 + Cargo.toml | 7 ++ README.md | 4 + examples/gz_dec.rs | 69 ++++++++++++++ src/deflate.rs | 188 +++++++++++++++++++++++++++++++++++++ src/gzip.rs | 1 + src/huffman.rs | 224 +++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 12 +++ src/lz77.rs | 1 + 9 files changed, 508 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 examples/gz_dec.rs create mode 100644 src/deflate.rs create mode 100644 src/gzip.rs create mode 100644 src/huffman.rs create mode 100644 src/lib.rs create mode 100644 src/lz77.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a9d37c5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..e596647 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "libflate" +version = "0.1.0" +authors = ["Takeru Ohta "] + +[dependencies] +byteorder = "*" diff --git a/README.md b/README.md new file mode 100644 index 0000000..efdc538 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +libflate +======== + +A Rust implementation of [DEFLATE](https://tools.ietf.org/html/rfc1951) algorithm. diff --git a/examples/gz_dec.rs b/examples/gz_dec.rs new file mode 100644 index 0000000..23510d1 --- /dev/null +++ b/examples/gz_dec.rs @@ -0,0 +1,69 @@ +extern crate byteorder; +extern crate libflate; + +use std::io; +use std::io::Read; +use byteorder::ReadBytesExt; +use byteorder::LittleEndian; + +fn main() { + let mut reader = io::stdin(); + let id1 = reader.read_u8().unwrap(); + let id2 = reader.read_u8().unwrap(); + let mode = reader.read_u8().unwrap(); + let flag = reader.read_u8().unwrap(); + let mtime = reader.read_u32::().unwrap(); + let xfl = reader.read_u8().unwrap(); + let os = reader.read_u8().unwrap(); + if flag & 0b0001 != 0 { + panic!(); + } + if flag & 0b0010 != 0 { + panic!(); + } + if flag & 0b0100 != 0 { + panic!(); + } + if flag & 0b1000 != 0 { + // FNAME + let mut name = String::new(); + loop { + let b = reader.read_u8().unwrap(); + if b == 0 { + break; + } + name.push(b as char); + } + println!("NAME: {}\n", name); + } + if flag & 0b10000 != 0 { + panic!(); + } + + println!(" +# HEADER +- id1: {} +- id2: {} +- mode: {} +- flag: {} +- mtime: {} +- xfl: {} +- os: {} +", + id1, + id2, + mode, + flag, + mtime, + xfl, + os); + + let mut dec = libflate::deflate::Decoder::new(reader); + let mut buf = Vec::new(); + dec.read_to_end(&mut buf).unwrap(); + println!(" +# BODY +{} +", + String::from_utf8_lossy(&buf)); +} diff --git a/src/deflate.rs b/src/deflate.rs new file mode 100644 index 0000000..d39ca19 --- /dev/null +++ b/src/deflate.rs @@ -0,0 +1,188 @@ +use std::io; +use std::io::Read; +use std::cmp; +use std::iter; +use byteorder::ReadBytesExt; + +use huffman; + +pub struct Decoder { + reader: BitReader, + block_buf: Vec, + block_offset: usize, + eos: bool, +} +impl Decoder + where R: Read +{ + pub fn new(reader: R) -> Self { + Decoder { + reader: BitReader::new(reader), + block_buf: Vec::new(), + block_offset: 0, + eos: false, + } + } + pub fn into_reader(self) -> R { + self.reader.into_byte_reader() + } + fn read_non_compressed_block(&mut self) -> io::Result<()> { + let len = try!(self.reader.read_byte_aligned_u16()); + let nlen = try!(self.reader.read_byte_aligned_u16()); + if !len != nlen { + Err(io::Error::new(io::ErrorKind::InvalidData, + format!("LEN={} is not the one's complement of NLEN={}", len, nlen))) + } else { + self.block_buf.resize(len as usize, 0); + self.block_offset = 0; + try!(self.reader.byte_reader.read_exact(&mut self.block_buf)); + Ok(()) + } + } + fn read_compressed_block(&mut self, is_dynamic: bool) -> io::Result<()> { + let mut huffman = if is_dynamic { + let hlit = try!(self.reader.read_bits_u8(5)) as u16 + 257; + let hdist = try!(self.reader.read_bits_u8(5)) + 1; + let hclen = try!(self.reader.read_bits_u8(4)) + 4; + + let mut hc = [0; 19]; + let indices = [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15]; + for &i in indices.iter().take(hclen as usize) { + hc[i] = try!(self.reader.read_bits_u8(3)); + } + println!("{:?}", hc); + let mut code_length_codes = huffman::Decoder2::from_lens(&hc[..]); + + let mut lit_lens = Vec::with_capacity(hlit as usize); + while lit_lens.len() < hlit as usize { + let c = try!(code_length_codes.decode(&mut self.reader)); + match c { + 0...15 => { + lit_lens.push(c as u8); + } + 16 => { + let count = try!(self.reader.read_bits_u8(2)) + 3; + let last = lit_lens.last().cloned().unwrap(); + lit_lens.extend(iter::repeat(last).take(count as usize)); + } + 17 => { + let zeros = try!(self.reader.read_bits_u8(3)) + 3; + lit_lens.extend(iter::repeat(0).take(zeros as usize)); + } + 18 => { + let zeros = try!(self.reader.read_bits_u8(7)) + 11; + lit_lens.extend(iter::repeat(0).take(zeros as usize)); + } + _ => unreachable!(), + } + } + println!("{:?}", lit_lens); + let mut lite_codes = huffman::Decoder2::from_lens(&lit_lens[..]); + + panic!("# {}, {}, {}", hlit, hdist, hclen); + } else { + huffman::Decoder::new_fixed() + }; + loop { + let s = try!(huffman.decode_one(&mut self.reader)); + println!("SYM: {:?}", s); + } + panic!() + } +} +impl Read for Decoder + where R: Read +{ + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.block_offset < self.block_buf.len() { + let copy_size = cmp::min(buf.len(), self.block_buf.len() - self.block_offset); + buf[..copy_size].copy_from_slice(&self.block_buf[self.block_offset..][..copy_size]); + self.block_offset += copy_size; + Ok(copy_size) + } else if self.eos { + Ok(0) + } else { + let bfinal = try!(self.reader.read_bit()); + let btype = try!(self.reader.read_bits_u8(2)); + println!("BFINAL: {}", bfinal); + self.eos = bfinal; + match btype { + 0b00 => { + try!(self.read_non_compressed_block()); + self.read(buf) + } + 0b01 => { + try!(self.read_compressed_block(false)); + self.read(buf) + } + 0b10 => { + try!(self.read_compressed_block(true)); + self.read(buf) + } + 0b11 => { + Err(io::Error::new(io::ErrorKind::InvalidData, + "btype 0x11 of DEFLATE is reserved(error) value")) + } + _ => unreachable!(), + } + } + } +} + +pub struct BitReader { + byte_reader: R, + last_byte: u8, + offset: usize, +} +impl BitReader + where R: Read +{ + pub fn new(byte_reader: R) -> Self { + BitReader { + byte_reader: byte_reader, + last_byte: 0, + offset: 8, + } + } + pub fn into_byte_reader(self) -> R { + self.byte_reader + } + pub fn read_bit(&mut self) -> io::Result { + if self.offset == 8 { + self.last_byte = try!(self.byte_reader.read_u8()); + self.offset = 0; + } + let bit = (self.last_byte & (1 << self.offset)) != 0; + self.offset += 1; + Ok(bit) + } + pub fn read_bits_u8(&mut self, bits: usize) -> io::Result { + assert!(bits <= 8); + // TODO: optimize + let mut n = 0; + for i in 0..bits { + let bit = try!(self.read_bit()); + n |= (bit as u8) << i; + } + Ok(n) + } + pub fn read_bits_u16(&mut self, bits: usize) -> io::Result { + assert!(bits <= 16); + // TODO: optimize + let mut n = 0; + for i in 0..bits { + let bit = try!(self.read_bit()); + n |= (bit as u16) << i; + } + Ok(n) + } + pub fn read_byte_aligned_u16(&mut self) -> io::Result { + if self.offset != 0 { + self.last_byte = try!(self.byte_reader.read_u8()); + } + self.offset = 8; + let low = self.last_byte as u16; + let high = try!(self.byte_reader.read_u8()) as u16; + Ok((high << 8) | low) + } +} diff --git a/src/gzip.rs b/src/gzip.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/gzip.rs @@ -0,0 +1 @@ + diff --git a/src/huffman.rs b/src/huffman.rs new file mode 100644 index 0000000..be0095c --- /dev/null +++ b/src/huffman.rs @@ -0,0 +1,224 @@ +/// Length-limited Huffman Codes +/// +/// Reference: https://www.ics.uci.edu/~dan/pubs/LenLimHuff.pdf +use std::io; + +use deflate::BitReader; // TODO: move + +const CODE_UNDEF: u16 = 0; + +pub struct Codes { + table: [u16; 0x10000], +} +impl Codes { + fn new() -> Self { + Codes { table: [CODE_UNDEF; 0x10000] } + } + fn set_mapping(&mut self, length: u8, from: u16, to: u16) { + self.table[from as usize] = (to << 5) + (length as u16); + } + fn decode(&self, length: u8, code: u16) -> Option { + let x = self.table[code as usize]; + if x & 0b11111 != length as u16 { + return None; + } else { + Some(x >> 5) + } + } +} + +pub fn fixed_literal_length_codes() -> Codes { + let mut codes = Codes::new(); + for i in 0..144 { + codes.set_mapping(8, 0b0011_0000 + i, i); + } + for i in 144..256 { + codes.set_mapping(9, 0b1_1001_0000 + i - 144, i); + } + for i in 256..280 { + codes.set_mapping(7, 0b000_0000 + i - 256, i); + } + for i in 280..287 { + codes.set_mapping(8, 0b1100_0000 + i - 280, i); + } + codes +} + +pub fn fixed_distance_codes() -> Codes { + let mut codes = Codes::new(); + for i in 0..30 { + codes.set_mapping(5, i, i); + } + codes +} + +pub struct Decoder2 { + codes: Codes, +} +impl Decoder2 { + pub fn from_lens(lens: &[u8]) -> Self { + // NOTE: Canonical Huffman Code + let mut codes = Vec::new(); + for (code, count) in lens.iter().cloned().enumerate() { + if count == 0 { + continue; + } + codes.push((code as u16, count)); + } + println!("=> {:?}", codes); + codes.sort_by_key(|x| x.1); + + let mut cs = Codes::new(); + let mut from = 0; + let mut prev_count = 0; + for (code, count) in codes { + if prev_count != count { + from <<= count - prev_count; + prev_count = count; + } + cs.set_mapping(count, from, code); + from += 1; + } + Decoder2 { codes: cs } + } + pub fn decode(&mut self, reader: &mut BitReader) -> io::Result + where R: io::Read + { + let mut code = try!(reader.read_bit()) as u16; + let mut length = 1; + for _ in 0..16 { + if let Some(decoded) = self.codes.decode(length, code) { + return Ok(decoded); + } + code = (code << 1) | (try!(reader.read_bit()) as u16); + length += 1; + } + Err(io::Error::new(io::ErrorKind::InvalidData, "TODO")) + } +} + +pub struct Decoder { + literal_codes: Codes, + distance_codes: Codes, +} +impl Decoder { + pub fn new_fixed() -> Self { + Decoder { + literal_codes: fixed_literal_length_codes(), + distance_codes: fixed_distance_codes(), + } + } + fn decode_literal_or_length(&mut self, reader: &mut BitReader) -> io::Result + where R: io::Read + { + let mut code = try!(reader.read_bit()) as u16; + let mut length = 1; + for _ in 0..16 { + if let Some(decoded) = self.literal_codes.decode(length, code) { + println!("! {}@{0:b}[{}] => {}", code, length, decoded); + let s = match decoded { + 0...255 => Symbol::Literal(decoded as u8), + 256 => Symbol::EndOfBlock, + length_code => { + let (base, extra) = decode_length(length_code); + let length = base + try!(reader.read_bits_u8(extra)) as u16; + Symbol::Share { + length: length, + distance: 0, + } + } + }; + return Ok(s); + } + code = (code << 1) | (try!(reader.read_bit()) as u16); + length += 1; + } + Err(io::Error::new(io::ErrorKind::InvalidData, + "Can not decode literal or length code")) + } + fn decode_distance(&mut self, reader: &mut BitReader) -> io::Result + where R: io::Read + { + let mut code = try!(reader.read_bit()) as u16; + let mut length = 1; + for _ in 0..16 { + if let Some(decoded) = self.distance_codes.decode(length, code) { + println!("@ {} => {}", code, decoded); + let (base, extra) = decode_distance(decoded); + println!("# {}, {}", base, extra); + let distance = base + try!(reader.read_bits_u16(extra)) as u16; + return Ok(distance); + } + code = (code << 1) | (try!(reader.read_bit()) as u16); + length += 1; + } + Err(io::Error::new(io::ErrorKind::InvalidData, "Can not decode distance code")) + } + pub fn decode_one(&mut self, reader: &mut BitReader) -> io::Result + where R: io::Read + { + self.decode_literal_or_length(reader).and_then(|mut s| { + if let Symbol::Share { ref mut distance, .. } = s { + *distance = try!(self.decode_distance(reader)); + } + Ok(s) + }) + } +} + +fn decode_distance(code: u16) -> (u16, usize) { + let table = [(1, 0), + (2, 0), + (3, 0), + (4, 0), + (5, 1), + (7, 1), + (9, 2), + (13, 2), + (17, 3), + (25, 3), + (33, 4), + (49, 4), + (65, 5), + (97, 5), + (129, 6), + (193, 6), + (257, 7), + (385, 7), + (513, 8), + (769, 8), + (1025, 9), + (1537, 9), + (2049, 10), + (3073, 10), + (4097, 11), + (6145, 11), + (8193, 12), + (12289, 12), + (16385, 13), + (24577, 13)]; + table[code as usize] +} +fn decode_length(code: u16) -> (u16, usize) { + let table = [(3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 1), + (13, 1), (15, 1), (17, 1), (19, 2), (23, 2), (27, 2), (31, 2), (35, 3), (43, 3), + (51, 3), (59, 3), (67, 4), (83, 4), (99, 4), (115, 4), (131, 5), (163, 5), + (195, 5), (227, 5), (258, 0)]; + let index = (code - 257) as usize; + table[index] +} + +#[derive(Debug)] +pub enum Symbol { + EndOfBlock, + Literal(u8), + + // TODO: name + Share { length: u16, distance: u16 }, +} + +#[cfg(test)] +mod test { + #[test] + fn it_works() {} +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..65c80b4 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,12 @@ +extern crate byteorder; + +pub mod lz77; +pub mod huffman; + +pub mod deflate; + +#[cfg(test)] +mod tests { + #[test] + fn it_works() {} +} diff --git a/src/lz77.rs b/src/lz77.rs new file mode 100644 index 0000000..fc0e9ae --- /dev/null +++ b/src/lz77.rs @@ -0,0 +1 @@ +pub struct Lz77;