forked from tarunsinghofficial/HacktoberFest
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLossless-Text-Compression
164 lines (133 loc) · 5.2 KB
/
Lossless-Text-Compression
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import heapq
import os
class binarytreenode:
    """Node of a Huffman tree, ordered by frequency.

    Leaves carry a symbol in ``value``; internal nodes are created with
    ``value=None`` and get their children attached through ``left`` and
    ``right`` after construction.

    Ordering (``<``) and equality (``==``) compare only ``freq`` so that
    instances can be stored directly in a ``heapq`` priority queue.
    Because ``__eq__`` is overridden without ``__hash__``, instances are
    unhashable.
    """

    def __init__(self, value, freq):
        """Create a childless node holding *value* with weight *freq*."""
        self.value = value
        self.freq = freq
        self.left = None
        self.right = None

    def __lt__(self, other):
        """Return True when this node's frequency is lower than *other*'s."""
        if not isinstance(other, binarytreenode):
            # Let Python raise TypeError instead of an opaque AttributeError.
            return NotImplemented
        return self.freq < other.freq

    def __eq__(self, other):
        """Return True when both nodes have the same frequency.

        Comparing against a non-node (for example ``node == None``) yields
        ``NotImplemented`` so Python falls back to its default comparison
        and the expression evaluates to ``False`` instead of raising
        ``AttributeError`` on the missing ``freq`` attribute.
        """
        if not isinstance(other, binarytreenode):
            return NotImplemented
        return self.freq == other.freq
class HuffmanCoding:
    """Huffman compressor/decompressor for a single text file.

    ``compression()`` reads the file at ``path``, builds a Huffman code
    from its character frequencies and writes the packed bit stream to a
    sibling ``.bin`` file.  ``decompress()`` turns such a file back into
    text and writes it next to ``path`` with a ``_decompressed.txt``
    suffix.

    NOTE: the code table is kept only in memory on this instance; it is
    not stored in the ``.bin`` file.  ``decompress()`` therefore only
    works on the same instance that produced the file via
    ``compression()``.

    NOTE: ``compression()`` strips trailing whitespace from the input
    before encoding, so that whitespace is not restored on decompression.
    """

    def __init__(self, path):
        """Remember the input *path* and start with empty coding state."""
        self.path = path
        self.__heap = []
        self.__codes = {}
        self.__reverseCodes = {}

    def __make_frequency_dict(self, text):
        """Return a dict mapping each character of *text* to its count."""
        freq_dict = {}
        for char in text:
            freq_dict[char] = freq_dict.get(char, 0) + 1
        return freq_dict

    def __buildheap(self, freq_dict):
        """Push one leaf node per symbol of *freq_dict* onto the heap."""
        for key, frequency in freq_dict.items():
            heapq.heappush(self.__heap, binarytreenode(key, frequency))

    def __buildTree(self):
        """Merge the two lightest nodes until a single root remains."""
        while len(self.__heap) > 1:
            binary_tree_node_1 = heapq.heappop(self.__heap)
            binary_tree_node_2 = heapq.heappop(self.__heap)
            freq_sum = binary_tree_node_1.freq + binary_tree_node_2.freq
            # Internal nodes are marked by value=None.
            newNode = binarytreenode(None, freq_sum)
            newNode.left = binary_tree_node_1
            newNode.right = binary_tree_node_2
            heapq.heappush(self.__heap, newNode)

    def __buildCodesHepler(self, root, curr_bits):
        """Walk the tree and record the bit string of every leaf.

        Going left appends ``"0"``, going right appends ``"1"``.
        """
        # Identity check: ``==`` would be routed through the node's
        # frequency-based ``__eq__``.
        if root is None:
            return
        if root.value is not None:
            # A tree consisting of a single leaf would otherwise get the
            # empty code and encode to nothing; give it one bit instead.
            code = curr_bits or "0"
            self.__codes[root.value] = code
            self.__reverseCodes[code] = root.value
            return
        self.__buildCodesHepler(root.left, curr_bits + "0")
        self.__buildCodesHepler(root.right, curr_bits + "1")

    def __buildCodes(self):
        """Pop the tree root off the heap and derive all symbol codes."""
        if not self.__heap:
            # Empty input: there are no symbols and therefore no codes.
            return
        root = heapq.heappop(self.__heap)
        self.__buildCodesHepler(root, "")

    def __getEncodedText(self, text):
        """Return *text* as a string of ``'0'``/``'1'`` characters."""
        return "".join(self.__codes[char] for char in text)

    def __getPaddedEncodedText(self, encoded_text):
        """Pad *encoded_text* to a whole number of bytes.

        The result starts with an 8-bit header holding the number of
        ``'0'`` bits appended at the end (0-7).
        """
        padded_amount = (8 - len(encoded_text) % 8) % 8
        padded_info = "{0:08b}".format(padded_amount)
        return padded_info + encoded_text + "0" * padded_amount

    def __getBytesArray(self, padded_encoded_text):
        """Convert a bit string into a list of integer byte values."""
        return [
            int(padded_encoded_text[i:i + 8], 2)
            for i in range(0, len(padded_encoded_text), 8)
        ]

    def compression(self):
        """Compress the file at ``self.path`` and return the ``.bin`` path.

        The output is written next to the input, with the input's
        extension replaced by ``.bin``.  Prints ``compressed`` on success.
        """
        file_name, _ = os.path.splitext(self.path)
        output_path = file_name + ".bin"

        # Start from a clean slate so a second call on the same instance
        # does not mix in codes from an earlier run.
        self.__heap = []
        self.__codes = {}
        self.__reverseCodes = {}

        # Read the input before touching the output file; read-only mode
        # is enough because the input is never written to.
        with open(self.path, 'r') as file:
            text = file.read()
        # Trailing whitespace is dropped before encoding.
        text = text.rstrip()

        # Frequency table -> heap -> Huffman tree -> code table.
        freq_dict = self.__make_frequency_dict(text)
        self.__buildheap(freq_dict)
        self.__buildTree()
        self.__buildCodes()

        # Encode, pad to whole bytes and pack.
        encoded_text = self.__getEncodedText(text)
        padded_encoded_text = self.__getPaddedEncodedText(encoded_text)
        bytes_array = self.__getBytesArray(padded_encoded_text)
        final_bytes = bytes(bytes_array)

        with open(output_path, 'wb') as output:
            output.write(final_bytes)
        print('compressed')
        return output_path

    def __removePadding(self, text):
        """Strip the 8-bit padding header and the trailing pad bits.

        Raises:
            ValueError: if *text* is shorter than the 8-bit header.
        """
        if len(text) < 8:
            raise ValueError("compressed data is missing its padding header")
        extra_padding = int(text[:8], 2)
        text = text[8:]
        if extra_padding == 0:
            # ``text[:-0]`` would be the empty string, not the whole text.
            return text
        return text[:-extra_padding]

    def __decodeText(self, text):
        """Translate a bit string back to characters via the code table.

        Bits left over at the end that do not form a complete code are
        ignored.
        """
        decoded_chars = []
        current_bit = ""
        for bit in text:
            current_bit += bit
            if current_bit in self.__reverseCodes:
                decoded_chars.append(self.__reverseCodes[current_bit])
                current_bit = ""
        return "".join(decoded_chars)

    def decompress(self, input_path):
        """Decode the ``.bin`` file at *input_path*.

        The text is written to ``<self.path without extension>`` +
        ``_decompressed.txt``.  Uses the code table built by a preceding
        ``compression()`` call on this instance.
        """
        filename, _ = os.path.splitext(self.path)
        output_path = filename + "_decompressed" + ".txt"

        with open(input_path, 'rb') as file:
            data = file.read()
        # Each byte becomes exactly eight bits, keeping leading zeros.
        bit_string = "".join("{0:08b}".format(byte) for byte in data)

        actual_text = self.__removePadding(bit_string)
        decompressed_text = self.__decodeText(actual_text)

        with open(output_path, 'w') as output:
            output.write(decompressed_text)
        return
if __name__ == "__main__":
    # Demo driver: compress the sample file, then decompress the result
    # with the same HuffmanCoding instance.  Guarded so that importing
    # this module does not trigger file I/O on a machine-specific path.
    path = 'D:/python project work/filed.txt'  ##path
    h = HuffmanCoding(path)
    output_path = h.compression()
    h.decompress(output_path)