aboutsummaryrefslogtreecommitdiff
path: root/draft.py
blob: f8223627e2a70d5e24b2368b9fd3d22c536ae2e6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/python3

import sys
import heapq
import collections
import copy


class BitRepr:
    def __init__(self, x: int, length: int):
        self.x = x
        self.length = length

    def add_zero(self):
        self.x = self.x << 1
        self.length += 1

    def add_one(self):
        self.x = self.x << 1
        self.length += 1
        self.x = self.x | 1

    def __repr__(self):
        return "{0:b}".format(self.x).zfill(self.length)


class HuffmanNode:
    def __init__(
        self, occurence: int, is_leaf: bool,
        symbol: int = None, left: 'HuffmanNode' = None, right: 'HuffmanNode' = None
    ):
        self.occurence = occurence
        self.is_leaf = is_leaf
        if is_leaf:
            self.symbol = symbol
        else:
            self.left = left
            self.right = right

    @staticmethod
    def leaf(occurence: int, symbol: int):
        return HuffmanNode(occurence, is_leaf=True, symbol=symbol)

    @staticmethod
    def node(left: 'HuffmanNode', right: 'HuffmanNode'):
        return HuffmanNode(left.occurence + right.occurence, is_leaf=False, left=left, right=right)

    def __gt__(self, other: 'HuffmanNode') -> bool:
        return self.occurence > other.occurence

    def __lt__(self, other: 'HuffmanNode') -> bool:
        return self.occurence < other.occurence

    def __repr__(self) -> str:
        if self.is_leaf:
            return f"({self.symbol} {self.occurence})"
        else:
            return f"(NODE {self.occurence})"  # left {str(self.left)} right {str(self.right)})"

    def put_tree(self, level: int = 0):
        if self.is_leaf:
            print(f"({self.symbol} {self.occurence})")
            return
        print(f"NODE {self.occurence}")
        print(" " * level * 4, "left ", end="")
        self.left.put_tree(level + 1)
        print(" " * level * 4, "right ", end="")
        self.right.put_tree(level + 1)

    def to_bit_repr(self, parent_bits: 'BitRepr' = BitRepr(0, 0)) -> {int: 'BitRepr'}:
        if self.is_leaf:
            return {self.symbol: parent_bits}
        left_bits = copy.deepcopy(parent_bits)
        left_bits.add_zero()
        right_bits = copy.deepcopy(parent_bits)
        right_bits.add_one()
        return {
            **self.left.to_bit_repr(left_bits),
            **self.right.to_bit_repr(right_bits)
        }

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("missing file name")
        sys.exit(1)

    file_name = sys.argv[1]
    # print(file_name)

    content = None
    with open(file_name, "rb") as f:
        content = f.read()
    counter = collections.Counter(content)

    huffman_nodes = [HuffmanNode.leaf(v, k) for k, v in dict(counter).items()]
    heapq.heapify(huffman_nodes)

    while len(huffman_nodes) >= 2:
        # print(huffman_nodes)
        left = heapq.heappop(huffman_nodes)
        right = heapq.heappop(huffman_nodes)
        node = HuffmanNode.node(left, right)
        heapq.heappush(huffman_nodes, node)
    # print(huffman_nodes)

    huffman_tree = huffman_nodes[0]
    huffman_tree.put_tree()
    bit_repr = huffman_tree.to_bit_repr()
    print(bit_repr)

    huffman_content = ''.join([str(bit_repr[s]) for s in content])
    print(huffman_content)