From 81ae67c934e58ba65c37244ccf21f7cd469ade3e Mon Sep 17 00:00:00 2001 From: Charles Date: Fri, 26 Jun 2020 10:17:45 +0200 Subject: Added serialization --- .gitignore | 1 + README.md | 18 ++++++++++++++++-- src/bits.rs | 4 ++-- src/conversion.rs | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 40 +++++++++++++++++++++++++--------------- src/tree.rs | 45 +++++++++++++++++++++++++-------------------- 6 files changed, 117 insertions(+), 39 deletions(-) create mode 100644 src/conversion.rs diff --git a/.gitignore b/.gitignore index eed3cc5..104a166 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /target *.txt +*.huffman diff --git a/README.md b/README.md index 74ef205..75fb0ad 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,23 @@ # huffman -Implementation of [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding?oldformat=true) made for educational purposes. +Implementation of the [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding?oldformat=true) made for educational purposes. ## Usage +* compress: `cargo run < input_file > output_file.huffman` +* decompress: `cargo run d < input_file.huffman > output_file` + `python3 draft.py [file_name]` to run the python draft. -`cargo run [file_name]` for the real thing. +## File format + +Compress to a custom `.huffman` file format which is a header of the huffman coding tree followed by the compressed content. + + +### Header format + +4 byte unsigned int: header size (including this field) +Conversion table where each entry's format is: + 1 byte for the actual byte value + 1 byte for size of representation in bits + the representation aligned on a 8-bit boundary diff --git a/src/bits.rs b/src/bits.rs index b479cc9..0f944ca 100644 --- a/src/bits.rs +++ b/src/bits.rs @@ -1,7 +1,7 @@ #[derive(Clone)] pub struct BitSet { - data: Vec, - len: usize, + pub data: Vec, + pub len: usize, } impl BitSet { diff --git a/src/conversion.rs b/src/conversion.rs new file mode 100644 index 0000000..ad7c320 --- /dev/null +++ b/src/conversion.rs @@ -0,0 +1,48 @@ +use std::collections::HashMap; + +use super::bits::BitSet; +use super::tree::Tree; + +pub struct Table(pub HashMap); + +impl Table { + pub fn from_tree(tree: &Tree) -> Table { + Table(tree.to_hash_map()) + } + + pub fn convert(&self, data: Vec) -> Vec { + let mut bitset = BitSet::new(); + for byte in data { + bitset.concat(&self.0[&byte]); + } + bitset.data + } + + pub fn serialize(&self) -> Vec { + let mut out: Vec = Vec::new(); + let size: u32 = self.0.iter() + .fold(0, |acc, (_, v)| acc + 2 + v.data.len() as u32); + out.push(((size & 0xff000000) >> 24) as u8); + out.push(((size & 0x00ff0000) >> 16) as u8); + out.push(((size & 0x0000ff00) >> 8) as u8); + out.push(((size & 0x000000ff) >> 0) as u8); + + for (k, v) in &self.0 { + out.push(*k); + out.push(v.len as u8); + out.extend(v.data.iter()); + } + out + } +} + +use std::fmt; + +impl fmt::Debug for Table { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for (k, v) in self.0.iter() { + write!(f, "{:4} {:?}\n", format!("{:?}", *k as char), v)?; + } + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs index 920aee2..a54a3bd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,30 +1,40 @@ use std::env; -use std::fs; +// use std::fs; +use std::io; use std::io::Read; +use std::io::Write; pub mod tree; pub mod bits; +pub mod conversion; use tree::Tree; -use bits::BitSet; fn main() { - let file_path = env::args().nth(1).unwrap(); + if let Some(s) = env::args().nth(1) { + if s != "d" { + return + } + // let data: Vec = io::stdin().bytes().map(|x| x.unwrap()).collect(); + // deserialize - let f = fs::File::open(file_path).unwrap(); - let data: Vec = f.bytes().map(|x| x.unwrap()).collect(); + } else { + // let f = fs::File::open(file_path).unwrap(); - let tree = Tree::from_data(&data); - tree.put(); + let data: Vec = io::stdin().bytes().map(|x| x.unwrap()).collect(); - let bitmap = tree.to_bit_map(); - for (k, v) in bitmap.iter() { - println!("{:4} {:?}", format!("{:?}", *k as char), v); - } + let tree = Tree::from_data(&data); + // print!("{:?}", tree); + + let table = conversion::Table::from_tree(&tree); + // print!("{:?}", table); + + let converted_data = table.convert(data); + // println!("{:?}", converted_data); + + let header = table.serialize(); - let mut bitset = BitSet::new(); - for byte in data { - bitset.concat(&bitmap[&byte]); + io::stdout().write_all(&header).unwrap(); + io::stdout().write_all(&converted_data).unwrap(); } - println!("{:?}", bitset); } diff --git a/src/tree.rs b/src/tree.rs index c7e8aae..7d40979 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -1,3 +1,4 @@ +use std::fmt; use std::collections::{BinaryHeap, HashMap}; use super::bits::BitSet; @@ -52,7 +53,7 @@ impl Tree { heap.pop().unwrap() } - pub fn to_bit_map(&self) -> HashMap { + pub fn to_hash_map(&self) -> HashMap { match &self.content { Content::Leaf(b) => { let mut hm = HashMap::with_capacity(1); @@ -60,11 +61,11 @@ impl Tree { hm }, Content::Parent { left, right } => { - let mut left_hm = left.to_bit_map(); + let mut left_hm = left.to_hash_map(); for (_, bits) in left_hm.iter_mut() { bits.push_front_bit(0); } - let mut right_hm = right.to_bit_map(); + let mut right_hm = right.to_hash_map(); for (_, bits) in right_hm.iter_mut() { bits.push_front_bit(1); } @@ -74,31 +75,29 @@ impl Tree { } } - pub fn put(&self) { - self.put_with_level(0); + fn fmt_spaces(f: &mut fmt::Formatter, level: usize) -> fmt::Result { + for _ in 0..level { + write!(f, " ")?; + } + Ok(()) } - fn put_with_level(&self, level: usize) { + fn fmt_with_level(&self, f: &mut fmt::Formatter, level: usize) -> fmt::Result { match &self.content { Content::Leaf(b) => { - print!("{} {}\n", self.occurences, b); + write!(f, "{} {}\n", self.occurences, b)?; }, Content::Parent { left, right } => { - print!("NODE {}\n", self.occurences); - Tree::put_spaces(level); - print!("left: "); - left.put_with_level(level + 1); - Tree::put_spaces(level); - print!("right: "); - right.put_with_level(level + 1); + write!(f, "NODE {}\n", self.occurences)?; + Tree::fmt_spaces(f, level)?; + write!(f, "left: ")?; + left.fmt_with_level(f, level + 1)?; + Tree::fmt_spaces(f, level)?; + write!(f, "right: ")?; + right.fmt_with_level(f, level + 1)?; }, } - } - - fn put_spaces(n: usize) { - for _ in 0..(n * 2) { - print!(" "); - } + Ok(()) } } @@ -124,3 +123,9 @@ impl PartialEq for Node { self.occurences == other.occurences } } + +impl fmt::Debug for Tree { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.fmt_with_level(f, 0) + } +} -- cgit