-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_frequency_analysis.py
80 lines (56 loc) · 1.72 KB
/
word_frequency_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from exercise_13_data_str import skip_gutenberg_header
from string import whitespace
import random
# MARKOV ANALYSIS
# Module-level state used by process_word() and random_text():
#   suffix_map -- maps a prefix (tuple of words) to the list of words that
#                 were seen immediately after that prefix.
#   prefix     -- the sliding window of the most recently processed words.
suffix_map = {}
prefix = ()
def read_file(f_name):
    """Write a cleaned, single-line copy of the text file ``f_name``.

    The file is opened as UTF-8 and passed to ``skip_gutenberg_header`` with
    the marker ``"*END*"``; whatever lines remain afterwards are read.  Each
    of those lines has every ``'-'`` replaced by a space and its surrounding
    whitespace stripped, and is then written, followed by a single space, to
    a file named ``"Output of - <f_name>"``.

    Args:
        f_name: Name of the input text file.  It is also embedded verbatim
            in the output file name.

    Returns:
        None.  The result is the output file on disk.
    """
    # Context managers guarantee both handles are closed even if the header
    # helper, the read, or a write raises.
    # NOTE(review): 'r+' is kept as-is because skip_gutenberg_header is
    # defined in another module; if it only reads, plain 'r' would suffice.
    with open(f_name, 'r+', encoding='UTF8') as fin:
        skip_gutenberg_header(fin, "*END*")
        data = fin.readlines()

    # Write the file on our own terms: one long line, words separated by
    # single spaces at the former line breaks.
    with open("Output of - {}".format(f_name), "w+", encoding='UTF8') as fout:
        for line in data:
            cleaned = line.replace('-', ' ').strip(whitespace)
            fout.write(cleaned + ' ')
def process_file(f_name, order=2):
    """Build the Markov chain from the words of ``f_name``.

    ``read_file(f_name)`` is called first to produce the cleaned file
    ``"Output of - <f_name>"``.  That file is then split into
    whitespace-separated words, and each word is passed, in order, to
    ``process_word(word, order)``.

    Args:
        f_name: Name of the source text file.
        order: Prefix length forwarded to ``process_word``.

    Returns:
        None.
    """
    read_file(f_name)

    # Only reading is needed here, so open read-only and let the context
    # manager close the handle (the original never closed it).
    with open("Output of - {}".format(f_name), "r", encoding='UTF8') as f_obj:
        # str.split() with no argument already discards all surrounding
        # whitespace, so no further stripping of each word is required.
        words = [word for line in f_obj for word in line.split()]

    for word in words:
        process_word(word, order)
def process_word(word, order=2):
    """Feed one word into the Markov chain.

    While the module-level ``prefix`` holds fewer than ``order`` words, the
    word is only appended to it.  After that, the word is recorded in
    ``suffix_map`` as a follower of the current ``prefix``, and ``prefix`` is
    advanced by one word via ``shift``.

    Args:
        word: The next word of the text.
        order: Number of words that make up a prefix.

    Returns:
        None.  Updates the module-level ``prefix`` and ``suffix_map``.
    """
    global prefix

    # The first `order` words just fill the window; nothing is recorded yet.
    if len(prefix) < order:
        prefix = prefix + (word,)
        return

    # Create the suffix list the first time this prefix is seen, then add to it.
    suffix_map.setdefault(prefix, []).append(word)
    prefix = shift(prefix, word)
def shift(t, word):
    """Return a new tuple: ``t`` without its first item, with ``word`` appended.

    Args:
        t: Tuple of words forming the current prefix.
        word: Word to append at the end.

    Returns:
        The shifted tuple; ``t`` itself is not modified.
    """
    tail = t[1:]
    appended = (word,)
    return tail + appended
def random_text(n=100):
    """Generate ``n`` random words by walking the chain in ``suffix_map``.

    The walk starts at a randomly chosen prefix.  At each step a random
    follower of the current prefix is emitted and the prefix is advanced with
    ``shift``.  If the current prefix has no entry in ``suffix_map`` (a dead
    end), the walk restarts from another randomly chosen prefix and keeps the
    words generated so far.

    Args:
        n: Number of words to generate.  Values <= 0 yield an empty string.

    Returns:
        The generated words, each followed by a single space (so a non-empty
        result ends with a trailing space).

    Raises:
        IndexError: If ``suffix_map`` is empty, because ``random.choice`` is
            called on an empty list.
    """
    prefixes = list(suffix_map)
    start = random.choice(prefixes)
    words = []

    while len(words) < n:
        suffixes = suffix_map.get(start)
        if suffixes is None:
            # Dead end.  The original recursed here but threw away both the
            # recursive result and the text built so far, returning None.
            # Restarting from a known prefix always makes progress, because
            # every key in suffix_map has at least one recorded follower.
            start = random.choice(prefixes)
            continue
        word = random.choice(suffixes)
        words.append(word)
        start = shift(start, word)

    # Keep the original output format: every word is followed by one space.
    return ''.join(word + ' ' for word in words)
if __name__ == '__main__':
    # Script entry point: build a chain with 4-word prefixes from emma.txt,
    # then print a 20-word random sample.
    process_file("emma.txt", 4)
    sample = random_text(20)
    print(sample)