-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathformat.py
113 lines (94 loc) · 2.91 KB
/
format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# -*- coding: utf-8 -*-
'''
Author: Tomas Phelan
License Employed: GNU General Public License v3.0
Brief: Includes methods for processing raw tweet data
'''
from string import ascii_lowercase
from nltk.corpus import words as english_words, stopwords
import re
# Lower-cased vocabulary used by format_semantic() to decide which tokens to keep.
english = {word.lower() for word in english_words.words()}
# Lower-cased stop-word set for remove_stopwords().
# NOTE(review): stopwords.words() is called without a language argument --
# confirm whether an English-only list was intended.
stop = {word.lower() for word in stopwords.words()}
def generate_variations(list):
    """Return four surface variations of every phrase in ``list``.

    For each phrase, in input order, the result holds: the phrase padded with
    a space on both sides, the phrase with a single trailing space, the phrase
    followed by a full stop, and the phrase prefixed with ``#``.
    """
    # NOTE: the parameter name shadows the ``list`` builtin; it is kept
    # unchanged so callers passing it by keyword keep working.
    return [
        variant
        for phrase in list
        for variant in (
            " " + phrase + " ",  # space at the beginning and the end
            phrase + " ",        # single space at the end
            phrase + ".",        # full stop
            "#" + phrase,        # hashtag
        )
    ]
def remove_url(text):
    """Delete every run starting with ``http`` up to the next whitespace."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)
def remove_excess_whitespace(text):
    """Collapse each whitespace run into one space and trim both ends."""
    words = text.split()
    return ' '.join(words)
def convert_to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered
def remove_non_alpha_chars(text):
    """Return ``text`` with everything but ASCII lowercase letters and spaces removed.

    Upper-case letters are dropped as well, so callers that want to keep them
    must lower-case the text first (as ``format_syntax`` does).

    The filter runs in a single pass; the previous implementation deleted
    elements from a list inside a loop, which is quadratic on long inputs.
    """
    return ''.join(
        ch for ch in text if ch in ascii_lowercase or ch == ' '
    )
def remove_non_english_words(text, english):
    """Keep only the space-separated tokens of ``text`` that are in ``english``.

    Args:
        text: String to filter; it is split on single spaces.
        english: Container of accepted words (a set gives O(1) lookups).

    Returns:
        The surviving tokens joined by single spaces, in their original order.
        Empty tokens produced by consecutive spaces are dropped unless ``""``
        is itself a member of ``english``.

    The filter runs in a single pass instead of deleting from a list inside a
    loop, which was quadratic on long inputs.
    """
    return ' '.join(word for word in text.split(' ') if word in english)
def remove_stopwords(text, stop):
    """Drop stop words and one-character tokens from ``text``.

    Args:
        text: String to filter; it is split on single spaces.
        stop: Container of words to remove (a set gives O(1) lookups).

    Returns:
        The remaining tokens joined by single spaces, in their original order.
        Empty tokens (from consecutive spaces) have length 0, not 1, so they
        are kept unless ``""`` is in ``stop``; normalise whitespace beforehand
        if that matters.

    The filter runs in a single pass instead of deleting from a list inside a
    loop, which was quadratic on long inputs.
    """
    return ' '.join(
        word for word in text.split(' ')
        if word not in stop and len(word) != 1
    )
def format_syntax(text):
    """Normalise the surface form of ``text``.

    Applies, in order: lower-casing, removal of every character that is not an
    ASCII lowercase letter or a space, and whitespace collapsing.
    """
    return remove_excess_whitespace(
        remove_non_alpha_chars(
            convert_to_lowercase(text)
        )
    )
def format_semantic(text):
    """Filter ``text`` down to tokens found in the module-level ``english`` set."""
    # Stop-word removal is currently switched off to check what difference it
    # makes; to re-enable it, pass the result through
    # remove_stopwords(result, stop) before returning.
    english_only = remove_non_english_words(text, english)
    return english_only
def format_full(text):
    """Run the syntactic clean-up, then the semantic filter, on ``text``."""
    # NOTE(review): an earlier, commented-out return referenced a
    # format_porter step that is not defined in this file; none is applied.
    cleaned = format_syntax(text)
    return format_semantic(cleaned)
def format_test():
    """Print the result of ``format_full`` for two sample inputs."""
    samples = (
        "# ILoveNY bcuz $ $ money",
        "Hello to the world",
    )
    for sample in samples:
        print(format_full(sample))
if __name__ == '__main__':
    # Format raw tweets.
    #
    # Reads 'tweets_raw.txt', where each line holds '||'-separated fields:
    # field 1 is the tweet text and field 2 a timestamp in
    # YYYY-MM-DDTHH:MM:SS form (field 0 is not used here). Writes
    # "<timestamp>||<formatted text>" lines to 'tweets_formatted.txt',
    # emitting each formatted text at most once per calendar hour.
    #
    # Imported here because only the script path needs them.
    import datetime
    import time

    seen = set()
    # Context managers make sure both files are closed, even on error.
    with open('tweets_raw.txt', 'r') as infile, \
            open('tweets_formatted.txt', 'w+') as outfile:
        for line_no, line in enumerate(infile, start=1):
            # Progress indicator: print a wall-clock stamp every 1000 lines.
            if line_no % 1000 == 0:
                print(time.time())

            tw = line.split("||")
            # Skip malformed lines lacking the text or timestamp field, in the
            # same way lines with an unparsable timestamp are skipped below.
            if len(tw) < 3:
                continue

            fmt = format_full(tw[1])
            if not fmt:
                # Nothing survived formatting; there is nothing to write.
                continue

            try:
                tweet_time = datetime.datetime.strptime(
                    tw[2], "%Y-%m-%dT%H:%M:%S")
            except ValueError:
                # Timestamp not in the expected format: skip this line.
                continue

            # Dedup key: (hour bucket, formatted text). A tuple is used rather
            # than concatenating unpadded numbers, which made different dates
            # collide (e.g. 2020-1-11 and 2020-11-1).
            tweet_hash = (tweet_time.year, tweet_time.month,
                          tweet_time.day, tweet_time.hour, fmt)
            if tweet_hash not in seen:
                outfile.write(tw[2] + "||" + fmt + "\n")
                seen.add(tweet_hash)