-
Notifications
You must be signed in to change notification settings - Fork 0
/
lang.py
161 lines (122 loc) · 6.14 KB
/
lang.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os.path
import re
import sys
from enum import Enum
from pathlib import Path
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from hashies import big_end_int_64
# Machine-specific setup: at import time this adds the author's local NLTK
# data directory to the list of locations NLTK searches for corpora.
# Replace the path below with your own local NLTK data directory.
nltk.data.path.append('C:\\Users\\chris\\AppData\\Local\\databox\\nltk')
# Sample texts keyed by category; 'long_sentences' holds ten long English
# sentences that the disabled demo in main() feeds to normalize_str().
phrases = {
    'long_sentences': [
        "As the sun began to set over the horizon, painting the sky in shades of orange and pink, a gentle breeze rustled the leaves of the ancient oak tree that stood alone in the vast, open field, where generations of families had come to picnic and enjoy the serene beauty of nature.",
        "The old, weathered lighthouse, standing tall and proud on the edge of the rocky coastline, had guided countless ships safely to shore with its unwavering beam of light, even during the fiercest of storms when the waves crashed violently against the cliffs below.",
        "In a small, bustling village nestled in the heart of the mountains, the annual harvest festival brought together people from near and far, celebrating with music, dancing, and feasting on the bountiful produce that the fertile soil had generously provided that year.",
        "With the first light of dawn breaking through the dense canopy of the rainforest, the calls of exotic birds echoed through the trees, awakening the vibrant ecosystem teeming with life, from the smallest insects to the majestic jaguars stealthily prowling the forest floor.",
        "As the train chugged along the winding tracks, weaving through picturesque countryside and charming towns, passengers gazed out the windows at the ever-changing scenery, each lost in their own thoughts and stories, as the journey brought them closer to their destinations.",
        "In the grand, opulent ballroom of the ancient castle, chandeliers glittered with hundreds of candles, illuminating the elegant guests who danced gracefully to the melodies of a live orchestra, their laughter and conversation filling the air with a sense of timeless enchantment.",
        "The scientist, surrounded by a labyrinth of books and papers in her cluttered office, diligently worked late into the night, driven by a relentless curiosity and the hope of making a groundbreaking discovery that could change the world and benefit future generations.",
        "High above the bustling city streets, in a sleek, modern skyscraper, the executive looked out over the sprawling metropolis, contemplating the challenges and opportunities that lay ahead, as the city's lights twinkled like stars in the deepening twilight.",
        "On the arid plains of the savannah, a herd of elephants marched steadily towards a distant watering hole, their massive silhouettes silhouetted against the setting sun, while nearby, a pride of lions lounged in the shade, eyeing the potential prey with lazy interest.",
        "Amidst the chaotic hustle and bustle of the crowded marketplace, vendors shouted their wares, colorful stalls overflowed with exotic goods, and the air was thick with the aromas of spices and street food, creating a vibrant tapestry of sights, sounds, and smells that captivated all who wandered through."
    ]
}
def normalize_arr(arr: list, t_min=-1, t_max=1) -> list:
    """
    bring an array of number values within a bounds constraint

    Values are mapped linearly so that min(arr) lands on t_min and
    max(arr) lands on t_max. The input is not modified.

    :param arr: numeric values to rescale
    :param t_min: lower bound of the target range
    :param t_max: upper bound of the target range
    :return: a normalized list; empty when arr is empty, and the midpoint of
        the target range for every element when all values are equal
    """
    # nothing to scale; min()/max() would raise on an empty sequence
    if len(arr) == 0:
        return []
    # compute the extremes once instead of on every iteration
    lowest = min(arr)
    span = max(arr) - lowest
    diff = t_max - t_min
    # all values identical: linear scaling would divide by zero
    if span == 0:
        return [(t_min + t_max) / 2] * len(arr)
    return [(((value - lowest) * diff) / span) + t_min for value in arr]
def normalize_str(s: str, t_min=-1, t_max=1) -> list:
    """
    using a natural language model (NLTK), converting text
    into a list of normalized number values

    The text is lowercased, stripped of digits and punctuation, split into
    words, filtered against NLTK's English stopword list, and each remaining
    word is turned into an integer with big_end_int_64 before the integers
    are rescaled with normalize_arr.

    :param s: the text to convert
    :param t_min: lower bound of the target range
    :param t_max: upper bound of the target range
    :return: a normalized list; empty when no words remain after filtering
    """
    # all to lowercase
    s = s.lower()
    # remove numbers
    s = re.sub(r'\d+', '', s)
    # remove everything except words and spaces
    s = re.sub(r'[^\w\s]', '', s)
    try:
        stopw = set(stopwords.words('english'))
    except LookupError:
        # stopword corpus is not installed yet: download it, then retry
        nltk.download('stopwords', quiet=True)
        stopw = set(stopwords.words('english'))
    # split on any run of whitespace: split(' ') produced empty-string tokens
    # for consecutive spaces and left words joined across newlines/tabs
    words = [w for w in s.split() if w not in stopw]
    if not words:
        return []
    # convert each remaining word to an integer via the imported helper
    s_ls = [big_end_int_64(w.encode('utf8')) for w in words]
    # normalize
    return normalize_arr(s_ls, t_min, t_max)
def plots_2d(arr: list) -> list:
    """
    group a flat list into consecutive pairs, for plotting 2D coordinates

    No padding is applied: when the input has an odd number of items the
    final entry holds a single value.

    :param arr: flat list of values
    :return: a list of 2D coordinates
    """
    # step through the input two items at a time and copy each slice
    return [list(arr[start:start + 2]) for start in range(0, len(arr), 2)]
def plots_3d(arr: list) -> list:
    """
    converts a 1-dimensional list to a 2-dimensional one, for plotting 3D coordinates

    When the input length is not a multiple of 3, the last coordinate is
    padded with zeros. The caller's list is left unmodified.

    :param arr: flat list of values
    :return: a list of 3D coordinates
    """
    # work on a copy: padding with `arr += ...` altered the caller's list
    padded = list(arr)
    # pad list length to be divisible by 3
    remainder = len(padded) % 3
    if remainder:
        padded.extend([0] * (3 - remainder))
    return [padded[start:start + 3] for start in range(0, len(padded), 3)]
def do_lexical_analysis(s: str) -> str:
    """
    clean a text for lexical analysis

    The text is stripped of surrounding whitespace, lowercased, and has its
    digits and every character that is neither a word character nor
    whitespace removed.

    :param s: the raw text
    :return: the cleaned text (previously the result was discarded and the
        function always returned None)
    """
    s = s.strip().lower()
    # remove digits
    s = re.sub(r'\d+', '', s)
    # remove all but words and spaces
    s = re.sub(r'[^\w\s]', '', s)
    return s
def main():
    """
    read the sample text from the user's home directory and pass it to
    do_lexical_analysis; the return value of that call is not used here
    """
    text_path = Path(os.path.expanduser('~'), ".databox", "texts", "nature-rwemerson.txt")
    with open(text_path, 'rt', encoding='utf-8') as handle:
        contents = handle.read()
    do_lexical_analysis(contents)
    # Disabled demo, kept for reference:
    # plot = plots_3d(normalize_str(phrases['long_sentences'][0], t_min=0, t_max=9))
    # plot2 = plots_3d(normalize_str(phrases['long_sentences'][1], t_min=-1, t_max=1))
    # print(plot)
    # print(plot2)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()