pol_model.py
# __future__ imports must come before any other statement in the module.
from __future__ import print_function
# Twitter API client, used to pull senator timelines.
import twitter
# gensim imports for Doc2Vec document embeddings.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import utils
import multiprocessing
import os
import nltk
nltk.download('punkt', quiet=True)  # tokenizer data needed by tokenize_text below
# Build the authenticated API client; tweet_mode="extended" returns full,
# untruncated tweet text.
api = twitter.Api(consumer_key='iTYLP38Z7pGv31kIg6x994IIJ',
                  consumer_secret='wxw4zrKmxhvmOlkwFI4UhVoVKWRCpZsaQH3gF8BIFoyViFHuCV',
                  access_token_key='1071489398317989889-wRuGv0r65uK8RX9wHFTrFbSRNMNgMr',
                  access_token_secret='zGa4BDFxIb3RIa7hoiVtQZLuRc9ehKd3vqMbUkkbqs2pF',
                  tweet_mode="extended")
# Senator handles retrieved from the CSPAN Twitter list: https://twitter.com/cspan/lists/senators?lang=en
# Party affiliations from https://www.senate.gov/senators/index.htm
# 1 = Republican, 0 = Democrat.
party_association = {"SenHydeSmith": 1, "SenDougJones": 0, "SenTinaSmith": 0,
"SenJohnKennedy": 1, "SenCortezMasto": 0, "SenKamalaHarris": 0,
"SenThomTillis": 1, "SenSasse": 1, "SenatorRounds": 1, "SenDanSullivan": 1,
"sendavidperdue": 1, "SenJoniErnst": 1, "SenBrianSchatz": 0, "senatemajldr": 1,
"MartinHeinrich": 0, "SenatorRisch": 1, "SenatorBaldwin": 0, "SenTedCruz": 1,
"SenatorFischer": 1, "SenatorHeitkamp": 0, "SenDuckworth": 0,
"SenWarren": 0, "SenTomCotton": 1, "SenatorHassan": 0, "MikeCrapo": 1,
"SenatorTester": 0, "SenJackReed": 0, "SenFeinstein": 0, "SenCoonsOffice": 0,
"LindseyGrahamSC": 1, "SenJohnHoeven": 1, "McConnellPress": 1,
"SenJohnThune": 1, "PattyMurray": 0, "SenatorEnzi": 1, "SenBlumenthal": 0,
"EnergyDems": 0, "SenDeanHeller": 1, "SenatorWicker": 1,
"senorrinhatch": 1, "RonWyden": 0, "SenatorCarper": 0, "SenatorDurbin": 0,
"SenatorLeahy": 0, "SenWhitehouse": 0, "SenGaryPeters": 0, "SenCoryGardner": 1,
"Sen_JoeManchin": 0, "SenToddYoung": 1, "SenRonJohnson": 1, "SenateAgDems": 0,
"SenatorLankford": 1, "SenBennetCO": 0, "SenToomey": 1, "SenatorTimScott": 1,
"RandPaul": 1, "SenDonnelly": 0, "SenJohnBarrasso": 1, "SenCapito": 1,
"timkaine": 0, "SenBobCasey": 0, "ChrisMurphyCT": 0,
"SenatorCantwell": 0, "SenatorShaheen": 0, "SenatorCardin": 0,
"maziehirono": 0, "SenMikeLee": 1, "EnergyGOP": 1, "SenatorIsakson": 1,
"SenAlexander": 1, "SenStabenow": 0, "SenPatRoberts": 1, "SenBobCorker": 1,
"SenateDems": 0, "SenGillibrand": 0, "SenatorTomUdall": 0, "BillCassidy": 1,
"brianschatz": 0, "TinaSmithMN": 0, "SenSherrodBrown": 0,
"amyklobuchar": 0, "SenJeffMerkley": 0, "SenMarkey": 0,
"RoyBlunt": 1, "SenatorBurr": 1, "SenShelby": 1, "SenBillNelson": 0,
"InhofePress": 1, "stabenow": 0, "SenatorCollins": 1,
"senrobportman": 1, "SenatorMenendez": 0, "JerryMoran": 1,
"ChrisVanHollen": 0, "lisamurkowski": 1, "SenSchumer": 0, "clairecmc": 0,
"CoryBooker": 0, "marcorubio": 1, "ChrisCoons": 0,
"GrahamBlog": 1, "SenateGOP": 1, "JohnCornyn": 1, "SteveDaines": 1,
"ChuckGrassley": 1, "MarkWarner": 0, "JimInhofe": 1, "JohnBoozman": 1}
# Parallel lists: tweet text and its party label ('rep' or 'dem').
tweets = []
tweet_labels = []
# Pull up to 200 recent tweets per senator and record the matching party label.
for senator in party_association.keys():
    for tweet in api.GetUserTimeline(screen_name=senator, count=200):
        t = tweet.full_text
        # Drop everything from the first "https" onward to strip trailing URLs.
        separator = "https"
        t_new = t.split(separator, 1)[0]
        tweets.append(t_new)
        if party_association[senator] == 1:
            tweet_labels.append('rep')
        else:
            tweet_labels.append('dem')
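# The two lists are index-aligned, and the tagging step below relies on that,
# so a cheap sanity check here catches any mismatch early.
assert len(tweets) == len(tweet_labels)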
# Tokenizer: lowercase word tokens, dropping tokens shorter than two characters.
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue  # skip punctuation and one-letter tokens
            tokens.append(word.lower())
    return tokens
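# For example, tokenize_text("The Senate votes today!") returns
# ['the', 'senate', 'votes', 'today'] -- the "!" is dropped by the length check.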
# find number of cores for parallelization.
cores = multiprocessing.cpu_count()
# Tag each tokenized tweet with its party label and a unique integer id.
documents = []
for i, tweet in enumerate(tweets):
    documents.append(TaggedDocument(words=tokenize_text(tweet),
                                    tags=[tweet_labels[i], i + 1]))
# Build the Doc2Vec model (dm=1 selects the distributed-memory PV-DM variant).
model_dbow = Doc2Vec(dm=1, vector_size=45, negative=5, hs=0, min_count=2,
                     sample=0, workers=cores)
model_dbow.build_vocab(documents)
documents = utils.shuffle(documents)  # shuffle before training
model_dbow.train(documents, total_examples=len(documents), epochs=30)
# Ensure the output directory exists, then persist the trained model.
os.makedirs('./out', exist_ok=True)
model_dbow.save('./out/pol_model.d2v')
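# A minimal sketch, beyond the original script, of how the saved model can be
# reused: reload it from the path used above and infer an embedding for an
# unseen tweet. The example sentence is illustrative only.
loaded = Doc2Vec.load('./out/pol_model.d2v')
example_vec = loaded.infer_vector(tokenize_text("Proud to vote for the farm bill today."))
print("inferred vector (first 5 dims):", example_vec[:5])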