-
Notifications
You must be signed in to change notification settings - Fork 3
/
crowdSourceEvents.py
executable file
·2036 lines (1597 loc) · 59.1 KB
/
crowdSourceEvents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
'''
Keith Murray
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* "Once men turned their thinking over to machines in the hope *
* that this would set them free. But that only permitted other *
* men with machines to enslave them." *
* " 'Thou shalt not make a machine in the likeness of a man's *
* mind,' " Paul quoted. *
* "Right out of the Butlerian Jihad and the Orange Catholic *
* Bible," she said. "But what the O.C. Bible should've said is: *
* 'Thou shalt not make a machine to counterfeit a human mind.'..." *
* *
* --from Dune, by Frank Herbert *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Using tweepy: http://tweepy.readthedocs.org/
Uses a list of events and a list of locations around the world
Read in a list of key events
Search twitter for events, and search results for location ID's
If event and location are present with enough frequency,
tweet about event
Later have user go back and adjust results, perhaps using something
to 'learn' about actual events occurring.
**********************************************
explanation for setting up the api
http://www.74by2.com/2014/06/easily-get-twitter-api-key-api-secret-access-token-access-secret-pictures/
status Object Structure:
http://tkang.blogspot.com/2011/01/tweepy-twitter-api-status-object.html
'contributors': None,
'truncated': False,
'text': 'My Top Followers in 2010: @tkang1 @serin23 @uhrunland @aliassculptor @kor0307 @yunki62. Find yours @ http://mytopfollowersin2010.com',
'in_reply_to_status_id': None,
'id': 21041793667694593,
'_api': <tweepy.api.api object="" at="" 0x6bebc50="">,
'author': <tweepy.models.user object="" at="" 0x6c16610="">,
'retweeted': False,
'coordinates': None,
'source': 'My Top Followers in 2010',
'in_reply_to_screen_name': None,
'id_str': '21041793667694593',
'retweet_count': 0,
'in_reply_to_user_id': None,
'favorited': False,
'retweeted_status': <tweepy.models.status object="" at="" 0xb2b5190="">,
'source_url': 'http://mytopfollowersin2010.com',
'user': <tweepy.models.user object="" at="" 0x6c16610="">,
'geo': None,
'in_reply_to_user_id_str': None,
'created_at': datetime.datetime(2011, 1, 1, 3, 15, 29),
'in_reply_to_status_id_str': None,
'place': None
}
'''
import tweepy
import time
import math
import numpy as np
import nltk
nltk.data.path.append('/home/pi/nltk_data')
import datetime
import socket
import sys
import os
import re # http://stackoverflow.com/questions/6883049/regex-to-find-urls-in-string-in-python
import threading
from sklearn.neighbors import KernelDensity
import unCorruptFiles
#import getKMKeys # Format of CK, CS, AK, AS
#import getChatBotKeys as getKMKeys
#[CONSUMER_KEY, CONSUMER_SECRET, ACCESS_KEY, ACCESS_SECRET]
'''
Pin Numbers RPi.GPIO Raspberry Pi Name BCM2835 USED AS
P1_01 1 3V3
P1_02 2 5V0
P1_03 3 SDA0 GPIO0
P1_04 4 DNC
P1_05 5 SCL0 GPIO1
P1_06 6 GND GND
P1_07 7 GPIO7 GPIO4
P1_08 8 TXD GPIO14 TXD
P1_09 9 DNC
P1_10 10 RXD GPIO15 RXD
P1_11 11 GPIO0 GPIO17
P1_12 12 GPIO1 GPIO18
P1_13 13 GPIO2 GPIO21
P1_14 14 DNC
P1_15 15 GPIO3 GPIO22
P1_16 16 GPIO4 GPIO23
P1_17 17 DNC
P1_18 18 GPIO5 GPIO24
P1_19 19 SPI_MOSI GPIO10
P1_20 20 DNC
P1_21 21 SPI_MISO GPIO9
P1_22 22 GPIO6 GPIO25
P1_23 23 SPI_SCLK GPIO11
P1_24 24 SPI_CE0_N GPIO8
P1_25 25 DNC
P1_26 26 SPI_CE1_N GPIO7
pin setup on PI
1 2
3 4
5 6 --GND
BUTTON- 7 8
VCC-- 9 10
11 12 --RED
YELLOW- 13 14
BLUE-- 15 16 --HEARTBEAT
17 18 --GREEN
19 20
21 22
23 24
25 26
'''
class heartBeatThread(threading.Thread):
    """Daemon thread that blinks the heartbeat LED via heartBeat()."""
    def __init__(self):
        threading.Thread.__init__(self)
        self.name = "HeartBeatThread"
        self.daemon = True
        # BUG FIX: the original never created a stop flag, so stop() and
        # stopped() raised AttributeError. Use a dedicated Event with a
        # name that cannot clash with threading.Thread internals.
        self._stop_event = threading.Event()
    def run(self):
        print("STARTING THAT SICK BEAT YO")
        heartBeat()
    def stop(self):
        # Request termination (heartBeat() itself never checks this yet).
        self._stop_event.set()
    def stopped(self):
        return self._stop_event.is_set()
class twitterThread(threading.Thread):
    """Daemon thread that runs the twitter bot main loop, main1()."""
    def __init__(self):
        threading.Thread.__init__(self)
        self.name = "TwitterBotThread"
        self.daemon = True
        # BUG FIX: the original never created a stop flag, so stop() and
        # stopped() raised AttributeError. Use a dedicated Event with a
        # name that cannot clash with threading.Thread internals.
        self._stop_event = threading.Event()
    def run(self):
        print("Starting to Tweet")
        main1()
        # main1() is expected to loop forever; reaching here is abnormal.
        print("I ESCAPED")
    def stop(self):
        self._stop_event.set()
    def stopped(self):
        return self._stop_event.is_set()
class restartButtonThread(threading.Thread):
    """Daemon thread that watches the physical reboot/shutdown button."""
    def __init__(self):
        threading.Thread.__init__(self)
        self.name = "restartButtonThread"
        self.daemon = True
        # BUG FIX: the original never created a stop flag, so stop() and
        # stopped() raised AttributeError. Use a dedicated Event with a
        # name that cannot clash with threading.Thread internals.
        self._stop_event = threading.Event()
    def run(self):
        print("Button is active")
        buttonListener()
    def stop(self):
        self._stop_event.set()
    def stopped(self):
        return self._stop_event.is_set()
def startupLogFile():
    # REPLICATED: utils/botHelperFunctions.py startupLogFile()
    """Record the current wall-clock time as the 'Last Startup' log entry."""
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    updateLogFile("Last Startup:\t" + str(stamp))
    return
def updateLogFile(newLine):
    # REPLICATED: utils/botHelperFunctions.py updateLogFile()
    """
    Replace the log entry whose key matches `newLine`, creating the log
    file with default entries first when it is missing or empty.

    Parameters
    ----------
    newLine: str
        A complete log line of the form "<Key>:\t<value>".

    Raises
    ------
    SyntaxError:
        If no existing log entry matches the key in `newLine`. The
        original file contents are restored before raising.

    Notes
    -----
    The log lives at ./crowdSource.log, one "<Key>:\t<value>" per line.
    """
    try:
        with open("crowdSource.log", 'r') as logFile:
            data = logFile.read()
        if data == "":
            # An empty log is as useless as a missing one; rebuild it.
            # (The original raised an undefined name `EmptyFile` here and
            # relied on a bare except catching the NameError.)
            raise ValueError("empty log file")
    except (IOError, OSError, ValueError):
        # Seed the log with defaults. The KDE/weekly timestamps start at
        # the account's very first tweet time (11:55 PM - 15 Apr 2015),
        # parsed with stdlib strptime instead of third-party dateutil.
        parsedTweetTime = datetime.datetime.strptime(
            "11:55 PM 15 Apr 2015", "%I:%M %p %d %b %Y")
        with open("crowdSource.log", 'a') as logFile:
            logFile.write("Last Startup:\t" +
                          str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M")) + '\n')
            logFile.write("Last KDE Update:\t" +
                          str(parsedTweetTime.strftime("%Y-%m-%d %H:%M")) + '\n')
            logFile.write("Last Weekly Summary:\t" +
                          str(parsedTweetTime.strftime("%Y-%m-%d %H:%M")) + '\n')
    lineAdded = False
    lineID = newLine.split('\t')[0]
    with open("crowdSource.log", 'r') as logFile:
        allLog = logFile.read()
    with open("crowdSource.log", 'w') as logFile:
        for line in allLog.split('\n'):
            if line != "":
                key, value = line.split('\t')
                if key in lineID:
                    line = newLine
                    lineAdded = True
                logFile.write(line + '\n')
    if not lineAdded:
        # Put the untouched contents back before reporting the bad key.
        with open("crowdSource.log", 'w') as logFile:
            logFile.write(allLog)
        raise SyntaxError("Given string '%s' did not align with a log entry" % lineID)
    return
def getLoggedData(lineID):
    # REPLICATED: utils/botHelperFunctions.py getLoggedData()
    """
    Look up a value in the log file by its entry name.

    Parameters
    ----------
    lineID: string
        a string with a log name the user wants to find

    Returns
    -------
    theValue: string
        The tab-delimited value stored for the matching log entry
        (the last match wins when several keys match).

    Raises
    ------
    SyntaxError:
        If no log entry matches `lineID`. (Kept for interface
        compatibility; the phrasing predates this fix.)

    Notes
    -----
    BUG FIX: the original failure path referenced an undefined `allLog`
    and re-opened the log in 'w' mode, which would have truncated it.
    A failed lookup now raises without touching the file.
    """
    theValue = None
    lineFound = False
    with open("crowdSource.log", 'r') as logFile:
        for line in logFile:
            line = line.strip()
            if line != "":
                key, value = line.split('\t')
                if key in lineID:
                    theValue = value
                    lineFound = True
    if not lineFound:
        raise SyntaxError("Given string '%s' not found in log entry" % lineID)
    return theValue
def ledCycle():
    # REPLICATED: utils/rpiGPIOFunctions.py ledCycle()
    """
    Flash each LED on the display in turn to verify it still works.

    Cycle order (right to left): White, Red, Yellow, Blue, Green --
    the same order they sit in the perf board.

    Notes
    -----
    This is largely just a power-on self test for the LEDs.
    """
    def flash(pin):
        # Three on/off cycles: toggle six times starting from "on",
        # then force the pin low.
        state = False
        for _ in range(6):
            state = not state
            GPIO.output(pin, state)
            time.sleep(0.15)
        GPIO.output(pin, False)
    # white, red, yellow, blue, green
    for pin in (16, 12, 13, 15, 18):
        flash(pin)
    return
def myLED(theLED):
    # REPLICATED: utils/rpiGPIOFunctions.py myLED()
    """
    Drive the status LEDs to reflect the bot's current state.

    Parameters
    ----------
    theLED: str
        One of "RED", "YELLOW", "GREEN", "EVENT", "SLEEP", "KDEPREP".
        Anything else leaves the pins untouched.

    Notes
    -----
    Only touches the pins when the global `rPI` flag is True.
    NOTE(review): "GREEN" lights the blue pin and "SLEEP" lights the
    green pin -- presumably matching the physical wiring; confirm
    against the board before changing.
    """
    red = 12
    yellow = 13
    blue = 15
    green = 18
    if rPI == True:
        # Static states: pin levels as (red, yellow, blue, green).
        states = {
            "RED":     (True, False, False, False),
            "YELLOW":  (False, True, False, False),
            "GREEN":   (False, False, True, False),
            "SLEEP":   (False, False, False, True),
            "KDEPREP": (False, True, False, True),
        }
        if theLED in states:
            r, y, b, g = states[theLED]
            GPIO.output(red, r)
            GPIO.output(yellow, y)
            GPIO.output(blue, b)
            GPIO.output(green, g)
        elif theLED == "EVENT":
            GPIO.output(red, False)
            GPIO.output(yellow, False)
            GPIO.output(green, False)
            # Triple blink on blue (on/off six times, 0.15s apart),
            # finishing with the LED lit.
            for i in range(6):
                GPIO.output(blue, i % 2 == 0)
                time.sleep(0.15)
            GPIO.output(blue, True)
    return
def heartBeat():
    # REPLICATED: utils/rpiGPIOFunctions.py heartBeat()
    """Blink the heartbeat LED (pin 16) forever, one second on, one off."""
    # A visible "program is still running" indicator so the pi can run
    # headless, without the owner needing a monitor.
    state = True
    while True:
        GPIO.output(16, state)
        time.sleep(1)
        state = not state
def buttonListener():
    # REPLICATED and improved: utils/rpiGPIOFunctions.py buttonListener()
    # Polls the physical button on pin 7 forever:
    #   short press (<= 7s held)  -> reboot the pi
    #   long press  (>  7s held)  -> shut the pi down
    red = 12
    yellow = 13
    blue = 15
    green = 18
    # For restart and shutdown
    # This is super inelegant. But I think it'll work so there we go
    oldState = GPIO.input(7)
    buttonPress = False
    while True:
        curState = GPIO.input(7)
        if curState != oldState:
            # Debounce
            # NOTE(review): curState is not re-read after the sleep, so
            # this second comparison is always true -- confirm intent.
            time.sleep(0.003)
            if curState != oldState:
                if buttonPress == True:
                    # Button released: a press shorter than the 7s
                    # shutdown threshold means "reboot".
                    buttonPress = False
                    duration = time.time() - startTime
                    if duration <= 7:
                        GPIO.output(yellow, True)
                        time.sleep(.1)
                        os.system("sudo reboot")
                else:
                    # Button pressed: start timing the hold.
                    buttonPress = True
                    oldState = curState
                    startTime = time.time()
                    # Turn on "I'm responding" LED Combo
                    GPIO.output(green, True)
                    GPIO.output(blue, True)
        time.sleep(.001)
        if buttonPress:
            # Still held down: once past 7 seconds, shut down.
            duration = time.time() - startTime
            if duration > 7:
                # Shutdown condition
                GPIO.output(red, True)
                time.sleep(.1)
                GPIO.output(red, False)
                time.sleep(.1)
                GPIO.output(red, True)
                time.sleep(.5)
                os.system("sudo shutdown -h now")
    # Unreachable: the loop above never exits normally.
    return
def is_connected():
    # REPLICATED: utils/twitterInteractions.py is_connected()
    """
    Tries to connect to google to see if router is down

    Returns
    -------
    connected: boolean
        True when DNS resolution and a TCP connection to port 80 both
        succeed, False otherwise.
    """
    REMOTE_SERVER = "www.google.com"
    s = None
    try:
        print("Testing Internet Connection")
        # see if we can resolve the host name -- tells us if there is
        # a DNS listening
        host = socket.gethostbyname(REMOTE_SERVER)
        # connect to the host -- tells us if the host is actually
        # reachable (2 second timeout)
        s = socket.create_connection((host, 80), 2)
        return True
    except socket.error:
        # Narrowed from a bare except: only network failures mean
        # "not connected"; anything else should surface.
        return False
    finally:
        # The original leaked the socket on success; always close it.
        if s is not None:
            s.close()
def getLocation(locBestGuess):
    # REPLICATED: utils/nlpTools/locationFromText.py getLocation()
    """
    Pick the best guess for where the event took place.

    Parameters
    ----------
    locBestGuess: list of strings
        Candidate location substrings parsed out of tweets (see
        `extractLocation` / `processLocations`).

    Returns
    -------
    bestGuess: str
        " in X" where X is the most frequent candidate (first seen wins
        ties), or " but I can't find where" when there are no candidates.
    """
    if not locBestGuess:
        return " but I can't find where"
    if len(locBestGuess) == 1:
        return " in " + str(locBestGuess[0])
    # Tally each candidate, then report the most common one.
    counts = {}
    for guess in locBestGuess:
        counts[guess] = counts.get(guess, 0) + 1
    return " in " + str(max(counts, key=counts.get))
def extractLocation(text):
    # REPLICATED: utils/nlpTools/locationFromText.py extractLocation()
    '''
    An NLTK based location extractor.
    Parameters
    ----------
    text : string
        input text from tweets
    Returns
    -------
    locations : list of strings
        A list of strings from `text` that were extracted using the grammar
        outlined
    Notes
    -----
    A really basic and probably bug prone + hard to follow location extractor
    But hey, if it stops washington post from popping up all the damn time..
    Aaaaand at this point I realize that I basically wrote a program to find
    words that are capitalized...
    whoop... whoop.
    :|
    NLTK That bitchaz
    From what I can tell, this will have almost (if not more) false positives
    when compared to true positives. However these grammar rules that extract
    locations were written so their false negatives were minimal. I want to
    miss as little information as possible. From other portions of the code
    (Event Feature Set) the bot should only trigger when an event occurs. From
    there there's an assumption that there will be more similar true positives
    than similar false positives: the location signal can be extracted from the
    background noise.
    This also uses the nltk pos tagger, which assumes proper english rules are
    being followed (as was the case in its training data). That assumption is
    invalid with twitter data. So... yeah. Fun thing about that is that it still
    generally works
    It works under the assumption that locations are grammatically similar to
    state machines when used in english language.
    '''
    # Do the nltk stuff: sentence-split, tokenize, then POS-tag each sentence.
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    locations = []
    for sent in tagged_sentences:
        # Split the (word, tag) pairs into parallel word/tag lists.
        words, pos = map(list, zip(*sent))
        #print "\t"+" ".join(words)
        #print "\t"+" ".join(pos)
        # Tag sets defining the hand-rolled grammar:
        startC= ["NNP", "JJ"]      # tags that may open a sentence-initial location
        validExtend = [ "NNP", ',', '#', "IN", "CC", "DT"]  # tags that may continue one
        afterIN = ["NNP", "JJ"]    # tags that may follow a preposition
        validTerminal = ["NNP"] # CURRENTLY NOT IN USE. REMOVE SOON
        startHere = ["IN"] # Had JJ, JJ requires more grammar rules to be added. This is a bad format for that
        locStart = -1
        locEnd = -1
        # Right, ok, I need to read CH7 of the NLTK book and make a better method.
        # But, this will at least establish a baseline for CH7 performance
        # Pass 1: a location at the very start of the sentence.
        # Only emit it if at least one proper noun (NNP) was seen.
        nnpChar = False
        if pos[0] in startC:
            if pos[0] == "NNP":
                nnpChar = True
            locStart =0
            tail = 1
            if len(pos)>1:
                i = 1
                while (pos[i] in validExtend) and i+1<len(pos): # Double check that, might be off by 1
                    if pos[i] == "NNP":
                        nnpChar = True
                        tail = i+1
                    i += 1
            #print sent[locStart][0], locStart
            #print tail
            #print sent[0:0]
            #print sent[locStart:tail][0]
            if nnpChar == True:
                locations.append(" ".join(words[locStart:tail]))
        # Oh great Off By One Gods, I know you must have a sacrifice of IndexErrors,
        # But I pray you are satisfied with my offerings early on, rather than
        # late
        # into the night
        # when I am furious at the code
        # and delirious from adding and subtracting 1's from all lines
        # Oh great Off By One Gods
        # Hear my plea
        #print len(sent)
        # Pass 2: locations introduced by a preposition ("in Oklahoma City"),
        # trimmed back to the last proper noun seen.
        i = 0
        locStart = -1
        tail = -1
        lastNNPindex = -1
        # Edit to start on nnp so if you have vb something, in, it'll start on nnp
        while i < len(sent):
            if pos[i] in startHere:
                # Sweet! Step Forward one character if possible
                if i+1 >= len(pos):
                    break
                i += 1
                # Check if post startHere is a valid afterIN or validExtend
                if (pos[i] in afterIN) or (pos[i] in validExtend):
                    locStart = i
                    if pos[i] == "NNP":
                        lastNNPindex = i
                    # Woo, it was, that's the start of our location!
                    while len(pos) > i+1:
                        if i+1 >= len(pos):
                            break
                        i+=1
                        if pos[i]=="NNP":
                            lastNNPindex = i
                        if pos[i] not in validExtend:
                            break
                    # ummmm
                    # How about we reverse to find
                    #if i > locStart+1:
                    tail = lastNNPindex+1 # That should be allowed via sent[locStart:tail]
                    #else:
                    #    tail = i
                    if tail > locStart:
                        locations.append(" ".join(words[locStart:tail]))
                    # Reset the span trackers for the next candidate.
                    locStart = -1
                    tail = -1
                    lastNNPindex = -1
                    # *should*
            i += 1
        # You are the worst gods ever OBO gods.
        # A genie would grant my prayer with fewer strings attached...
        #if locStart > -1:
        #print sent[locStart:tail]
    return locations
def processLocations(tweets, event):
# REPLICATED: utils/nlpTools/locationFromText.py processLocations()
# Now we get ready to tweet!! :D
locBestGuess = []
for tweet in tweets:
aTweet = tweet.text.encode('utf-8')
words = aTweet.split(" ")
try:
guessLocation = []
guessLocationTemp = extractLocation(cleanTweetTextofAts(aTweet))
for location in guessLocationTemp:
if event.lower() not in location.lower(): # Stop saying a tornado occured in tornado
guessLocation.append(location)
except UnicodeDecodeError:
print "Got unicodeDecodeError.."
guessLocation = []
if len(guessLocation)>0:
for loc in guessLocation:
locBestGuess.append(loc)
# now we've looked at the tweets and tried to guess a location
#locBestGuess1 = getLocation(locBestGuess)
return locBestGuess
def cleanTweetText(text):
    # REPLICATED: utils/nlpTools/filters/twitterSpecific cleanTweetText()
    """Strip every http/https URL out of `text` and return the result."""
    url_pattern = re.compile(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        flags=re.MULTILINE)
    return url_pattern.sub('', text)
def cleanTweetTextofAts(text):
    # REPLICATED: utils/nlpTools/filters/twitterSpecific cleanTweetTextofAts()
    """
    Strip URLs, @-mentions, and punctuation from `text`, collapsing the
    result to single-space-separated words.

    Imperfect by design: it misses 'there' in @here_there.
    """
    # First drop whole URLs, then blank out mentions and any character
    # that is not alphanumeric/space/tab, then re-join on single spaces.
    no_urls = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text, flags=re.MULTILINE)
    pieces = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", no_urls).split()
    # Consider filtering to string.printable if unicode errors pile up.
    return ' '.join(pieces)
def processSpam(tweet, tweetDict, userDict, event, myHandle):
    # REPLICATED: utils/nlpTools/filters/unspam processSpam()
    '''
    Remove blatantly spam tweets, nothing else
    Parameters
    ----------
    tweet: tweepy tweet structure
        Just the tweet itself, all of its data
    tweetDict: dict
        previous tweet text stored in it
    userDict: dict
        usernames, allowed 5 total appearences
    event: str
        The event we're trying to talk about, removes "@JohnnyTsunami tweeted this"
    myHandle: str
        The twitter handle of this bot
    Returns
    -------
    uniqueTweet: Bool
        Boolean value, True if it's not a retweet (by "RT...")
        the event is mentioned, not just in username
        After URL removal, it is still a tweet not previously seen
        userhandle has been seen less than 5 times in this set
        False otherwise
    tweetDict: dict
        Dictionary of all previous tweets
    userDict: dict
        Dictionary of all previous user handles, and counts of the number of times that
        handle has appeared
    Notes
    -----
    BUG FIX (this revision): the per-account counter was unconditionally
    reset to 1 after being incremented, so the 5-tweet cap never fired.
    The reset now only happens for handles seen for the first time.
    in testEvent/crowdSourceEvents (as of Aug 29 2017) this function is named 'processTweets'
    not processSpam
    '''
    handle = tweet.user.screen_name.encode('utf-8')
    name = tweet.user.name.encode('utf-8')
    text = tweet.text.encode('utf-8')
    # Basic Spam Rejection:
    # Reject Retweets
    if text[0:2] == "RT" or tweet.retweeted:
        return False, tweetDict, userDict
    # And I shouldn't influence myself (feedback isn't useful)
    if handle == myHandle:
        return False, tweetDict, userDict
    # Usernames have caused a problem for me..
    # Keep an eye on this, see if twitter does translation for us in search
    if (event.lower() in handle.lower()) or (event.lower() in name.lower()):
        if event.lower() not in cleanTweetTextofAts(text.lower()):
            # Only risk throwing the tweet out if the twitter handle has
            # the event in their name. That way if twitter does translate
            # queries, they're less likely to be omitted. And if twitter
            # doesn't, nothing changes.
            return False, tweetDict, userDict
    # Usernames in Replies have also been a problem
    if tweet.in_reply_to_screen_name:
        if event.lower() in tweet.in_reply_to_screen_name.lower():
            if event.lower() not in cleanTweetTextofAts(text.lower()):
                return False, tweetDict, userDict
    # re based url stripper: see re import comment
    cleanTweet = cleanTweetText(text)
    # Check Duplicate Tweets, and tweets from same account
    if cleanTweet in tweetDict:
        # Reject Repeats
        return False, tweetDict, userDict
    # This whole handle business is to reduce bot spam (see meteoroid lyrics bots)
    if handle in userDict:
        # Only let a single account have a max of 5 tweets
        # in case it's a user freaking out about an event
        if userDict[handle] > 5:
            return False, tweetDict, userDict
        userDict[handle] += 1
    else:
        # First sighting of this handle.
        userDict[handle] = 1
    tweetDict[cleanTweet] = 1
    return True, tweetDict, userDict
def polysemeFilter(tweets, event):
    # REPLICATED: utils/nlpTools/filters/polysemyFilters polysemyFilter()
    '''
    Remove polysemes of the event (OKC Thunder, Blizzard from DQ, or the gaming company)
    Parameters
    ----------
    tweets: list
        A list of tweets (tweepy tweet struct)
    event: str
        A string for the event
    Returns
    -------
    tweets: list
        A list of all tweets (tweepy tweet object), filtered to remove polysemes
    Notes
    -----
    Currently incomplete: a pass-through stub until the projection matrix
    is built -- the input list is returned unchanged.
    Right. Well. Hmm
    First, I need a collection (for each event) of tweets (I mean strings) With the
    correct data, and some with the polyseme.
    Dairy Queen Ads and Ice cream lists
    Blizzard Gaming Anything. Overwatch, LoL, WoW, whatever.
        Especially Sale talk
    Pokemon--Less of a event happening trigger, but it really messes with Location
        Pokemon go bots tweeting where a pokemon with the move earthquake is happening
        more
    OKC Thunder. Anything with basketball, and probably any sports ball.
    Also hail. Tweets about football Hail Mary's, Hail Varsity, and Hail any body of
    worship, whether serious or sarcastic, is a huge pain. Most hail tweet distributions
    are fairly wide apart unless there's a tornado somewhere
    Lightning: I need soo many filters on that one. Lightning sale, lightning charger, so
    on and so forth. Jan 2018 the bot uses the word lightening, but until the polyseme
    filter is online, I'm not going to fix this.
    Since we're talking about what would be nice, maybe look to see if "No Tsunami" is also
    possible to polyseme filter.
    It might be most useful to build in a notebook page, and then port the final projection
    matrix over here. There will be a lot of testing, but once the projection matrix
    is built, it's just a matter of loading it, projecting, and either running a cluster
    centroid test, or a SVM, or any other method. Either way, it's nothing compared to
    generating the matrix.
    Finally, I don't yet know if I'll be building a projection for each event, or one hyper
    projection which hopefully would work on all
    '''
    return tweets
def negationFilter(tweets, event):
    # REPLICATED: utils/nlpTools/filters/grammarFilters negationFilter()
    '''
    Remove tweets which say the event didn't happen.
    "No tsunami threat present"
    Parameters
    ----------
    tweets: list
        A list of tweets (tweepy tweet struct)
    event: str
        A string for the event
    Returns
    -------
    tweets: list
        A list of all tweets (tweepy tweet object), filtered to remove tweets
        which say the event won't or didn't occur
    Notes
    -----
    Currently incomplete: a pass-through stub -- the input list is
    returned unchanged. Really this is a grammar filter, and the
    polyseme filter is a semantic filter.
    '''
    return tweets
def gotTweepError():
# REPLICATED: utils/twitterInteractions gotTweetError()
'''
Just kill some time. 5 minutes for tweepy, 1 for no internet
Notes
-----
I need to add a condition that reboots if wifi is down for too long
'''
# check if the error is internet connection based on
connected = is_connected()
if connected:
print "I started to annoy twitter, now I have to wait a bit", datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
myLED("YELLOW")
time.sleep(60*5)
else:
print "I'm not connected to the network at the moment, sorry"
myLED("RED")
time.sleep(60*1)
myLED("GREEN")
return
def getTweets(api, event, rppSize=50):
# REPLICATED: utils/twitterInteractions getTweets()
'''
Grabs around 50 tweets (most recent) from twitter
Blantantly spam tweets are removed (exact copies)
Parameters
----------
api: twitter api
this is needed for all twitter communciation
event: str
A string that is the keyword in a twitter search
rppSize: int
Default = 50
the number of tweets you want twitter to return to you (max is 50)
Returns
-------
listOfTweets: list
A list of all tweets (tweepy tweet object), filtered to remove basic spam
Notes
-----
'''
# Get a set of tweets, and filter them
# Tweets are events, filters such that
# Retweets are removed
# URLs are stripped of the tweet
# Accounts with the event in their name are removed from the set (Not yet working)
#
# Grab my handle to avoid grabbing my own tweets
while True:
try:
myHandle = api.me().screen_name.encode('utf-8')
break
except tweepy.TweepError:
gotTweepError()
# Grab Cursor tweets from the API, wrap in try/except for safety
while True:
try:
tweetList = tweepy.Cursor(api.search,
q=event,
rpp=rppSize,
result_type="recent",
include_entities=True).items()
break
except tweepy.TweepError:
gotTweepError()
listOfTweets = []
#for tweet in tweetList:
tweetTracker = 0