From 063afb1650160c8973de51603be0acca53b39e5d Mon Sep 17 00:00:00 2001 From: Charles Date: Thu, 25 Jun 2020 18:17:41 +0200 Subject: Initial commit: Rewrite in Rust --- .gitignore | 2 + Cargo.lock | 5 +++ Cargo.toml | 9 +++++ README.md | 9 +++++ draft.py | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 12 ++++++ src/symbols.rs | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/tree.rs | 22 +++++++++++ 8 files changed, 293 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 README.md create mode 100755 draft.py create mode 100644 src/main.rs create mode 100644 src/symbols.rs create mode 100644 src/tree.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..eed3cc5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +*.txt diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..e3b5bc6 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,5 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +[[package]] +name = "huffman" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..192f14f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "huffman" +version = "0.1.0" +authors = ["Charles "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/README.md b/README.md new file mode 100644 index 0000000..74ef205 --- /dev/null +++ b/README.md @@ -0,0 +1,9 @@ +# huffman + +Implementation of [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding?oldformat=true) made for educational purposes. + +## Usage + +`python3 draft.py [file_name]` to run the python draft. + +`cargo run [file_name]` for the real thing. diff --git a/draft.py b/draft.py new file mode 100755 index 0000000..f822362 --- /dev/null +++ b/draft.py @@ -0,0 +1,112 @@ +#!/usr/bin/python3 + +import sys +import heapq +import collections +import copy + + +class BitRepr: + def __init__(self, x: int, length: int): + self.x = x + self.length = length + + def add_zero(self): + self.x = self.x << 1 + self.length += 1 + + def add_one(self): + self.x = self.x << 1 + self.length += 1 + self.x = self.x | 1 + + def __repr__(self): + return "{0:b}".format(self.x).zfill(self.length) + + +class HuffmanNode: + def __init__( + self, occurence: int, is_leaf: bool, + symbol: int = None, left: 'HuffmanNode' = None, right: 'HuffmanNode' = None + ): + self.occurence = occurence + self.is_leaf = is_leaf + if is_leaf: + self.symbol = symbol + else: + self.left = left + self.right = right + + @staticmethod + def leaf(occurence: int, symbol: int): + return HuffmanNode(occurence, is_leaf=True, symbol=symbol) + + @staticmethod + def node(left: 'HuffmanNode', right: 'HuffmanNode'): + return HuffmanNode(left.occurence + right.occurence, is_leaf=False, left=left, right=right) + + def __gt__(self, other: 'HuffmanNode') -> bool: + return self.occurence > other.occurence + + def __lt__(self, other: 'HuffmanNode') -> bool: + return self.occurence < other.occurence + + def __repr__(self) -> str: + if self.is_leaf: + return f"({self.symbol} {self.occurence})" + else: + return f"(NODE {self.occurence})" # left {str(self.left)} right {str(self.right)})" + + def put_tree(self, level: int = 0): + if self.is_leaf: + print(f"({self.symbol} {self.occurence})") + return + print(f"NODE {self.occurence}") + print(" " * level * 4, "left ", end="") + self.left.put_tree(level + 1) + print(" " * level * 4, "right ", end="") + self.right.put_tree(level + 1) + + def to_bit_repr(self, parent_bits: 'BitRepr' = BitRepr(0, 0)) -> {int: 'BitRepr'}: + if self.is_leaf: + return {self.symbol: parent_bits} + left_bits = copy.deepcopy(parent_bits) + left_bits.add_zero() + right_bits = copy.deepcopy(parent_bits) + right_bits.add_one() + return { + **self.left.to_bit_repr(left_bits), + **self.right.to_bit_repr(right_bits) + } + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("missing file name") + sys.exit(1) + + file_name = sys.argv[1] + # print(file_name) + + content = None + with open(file_name, "rb") as f: + content = f.read() + counter = collections.Counter(content) + + huffman_nodes = [HuffmanNode.leaf(v, k) for k, v in dict(counter).items()] + heapq.heapify(huffman_nodes) + + while len(huffman_nodes) >= 2: + # print(huffman_nodes) + left = heapq.heappop(huffman_nodes) + right = heapq.heappop(huffman_nodes) + node = HuffmanNode.node(left, right) + heapq.heappush(huffman_nodes, node) + # print(huffman_nodes) + + huffman_tree = huffman_nodes[0] + huffman_tree.put_tree() + bit_repr = huffman_tree.to_bit_repr() + print(bit_repr) + + huffman_content = ''.join([str(bit_repr[s]) for s in content]) + print(huffman_content) diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..2b4e6c5 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,12 @@ +use std::env; + +pub mod symbols; +// pub mod tree; +// +use symbols::Tree; + +fn main() { + let file_path = env::args().nth(1).unwrap(); + let tree = Tree::from_file(&file_path).unwrap(); + tree.put(); +} diff --git a/src/symbols.rs b/src/symbols.rs new file mode 100644 index 0000000..cd8fd52 --- /dev/null +++ b/src/symbols.rs @@ -0,0 +1,122 @@ +use std::fs; +use std::io; +use std::io::Read; +use std::collections::{BinaryHeap, HashMap}; + +pub struct Node { + occurences: usize, + content: Content, +} + +impl Node { + fn join(left: Node, right: Node) -> Node { + Node { + occurences: left.occurences + right.occurences, + content: Content::Parent { + left: Box::new(left), + right: Box::new(right), + } + } + } +} + +enum Content { + Leaf(u8), + Parent { + left: Box, + right: Box, + }, +} + +pub type Tree = Node; + +impl Tree { + pub fn from_file(file_path: &str) -> io::Result { + let mut symbols = Symbols::from_file(file_path)?; + Ok(Tree::from_symbols(&mut symbols)) + } + + fn from_symbols(symbols: &mut Symbols) -> Tree { + while symbols.0.len() >= 2 { + let first = symbols.0.pop().unwrap(); + let second = symbols.0.pop().unwrap(); + let n = Node::join(first, second); + symbols.0.push(n); + } + let ret = symbols.0.pop().unwrap(); + ret + } + + pub fn put(&self) { + self.put_with_level(0); + } + + fn put_with_level(&self, level: usize) { + match &self.content { + Content::Leaf(b) => { + // Tree::put_spaces(level); + print!("{} {}\n", self.occurences, b); + }, + Content::Parent { left, right } => { + print!("NODE {}\n", self.occurences); + Tree::put_spaces(level); + print!("left: "); + left.put_with_level(level + 1); + Tree::put_spaces(level); + print!("right: "); + right.put_with_level(level + 1); + }, + } + } + + fn put_spaces(n: usize) { + for _ in 0..(n * 2) { + print!(" "); + } + } +} + +struct Symbols(BinaryHeap); + +impl Symbols { + fn from_file(file_path: &str) -> io::Result { + let f = fs::File::open(file_path)?; + let mut counter: HashMap = HashMap::new(); + for b in f.bytes() { + let k = b?; + let v = match counter.get(&k) { + Some(count) => count + 1, + None => 1, + }; + counter.insert(k, v); + } + let heap: BinaryHeap = counter + .iter() + .map(|(k, v)| Node { occurences: *v, content: Content::Leaf(*k) }) + .collect(); + Ok(Symbols(heap)) + } +} + +use std::cmp::Ordering; + +impl Ord for Node { + fn cmp(&self, other: &Self) -> Ordering { + // self.occurences.cmp(&other.occurences) + other.occurences.cmp(&self.occurences) // dirty hack for min-heap + } +} + +impl PartialOrd for Node { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Eq for Node {} + +impl PartialEq for Node { + fn eq(&self, other: &Self) -> bool { + self.occurences == other.occurences + } +} diff --git a/src/tree.rs b/src/tree.rs new file mode 100644 index 0000000..d3cca22 --- /dev/null +++ b/src/tree.rs @@ -0,0 +1,22 @@ +use super::symbols::Symbols; + +struct Node { + occurence: usize, + content: Content, +} + +enum Content { + Leaf(i8), + Parent { + left: Box, + right: Box, + }, +} + +type Tree = Node; + +// impl Tree { +// fn from_symbols(symbols: Symbols) -> Tree { +// symbols.0 +// } +// } -- cgit