-
Notifications
You must be signed in to change notification settings - Fork 2
/
evaluate-on-tweet-level.py
155 lines (138 loc) · 7.93 KB
/
evaluate-on-tweet-level.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import time
from goose3 import Goose
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from tweetfinder import Article
# Manually assembled ground truth: article URL -> list of tweet ids (as strings) expected for
# that article. get_stats_for_all() scores every extractor against these lists.
# NOTE: '1392185890844430342' is listed twice for the edition.cnn.com article; both entries are
# kept on purpose so the total number of expected ids stays unchanged.
answer_dict = {
    'https://www.techradar.com/news/lord-of-the-rings-on-amazon': [
        '1349403885836791808',
        '1422255647106617359',
        '1422618263695941633',
        '1103656820130775050',
        '1410622923421851649',
    ],
    'https://edition.cnn.com/2021/05/12/football/mo-salah-tweet-riyad-mahrez-benjamin-mendy-spt-intl/index.html': [
        '1392185890844430342',
        '1392187739383255051',
        '1391714799717953536',
        '1391436786665246726',
        '1392185890844430342',
    ],
    'https://www.foxnews.com/politics/black-lives-matter-hamas-terrorists-israeli': [
        '1394289672101064704',
        '1394578742920552453',
        '1394667705173688326',
    ],
    'https://www.breitbart.com/economy/2021/03/03/washington-post-editorial-board-15-minimum-wage-will-not-happen/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+breitbart+%28Breitbart+News%29': [
        '1366545099132456961',
        '1366922134057058304',
        '1366837187094978566',
    ],
    'https://www.foxnews.com/us/new-jersey-house-party-shooting-dead-injured-state-police-suspect': [
        '1396471486651768837',
        '1396472421641822209',
    ],
    'https://www.foxnews.com/us/us-seeing-wave-of-textbook-anti-semitism-amid-israel-gaza-tensions': [
        '1395469376849993730',
        '1395094581255966723',
    ],
    'https://www.npr.org/2021/05/25/1000129271/marjorie-taylor-greenes-holocaust-remarks-blasted-by-republicans-leaders': [
        '1397192128598523911',
        '1396805268126720001',
    ],
    'https://www.cbsnews.com/news/alexei-navalny-russia-putin-critic-prison-health-infirmary-arrests/': [
        '1379075608030892032',
        '1379408844984569860',
    ],
    'https://www.foxnews.com/us/fox-news-spots-migrant-group-running-across-southern-border-into-us': [
        '1396528462026924033',
    ],
    'https://www.vice.com/en/article/wx8wm5/arkansas-just-became-the-first-state-to-ban-health-care-for-trans-kids': [
        '1379510072716488715',
    ],
    'https://www.cnn.com/us/live-news/san-jose-ca-shooting-05-26-21/h_41658163e6c6f2416d346adb6c01119f': [
        '1397580228537450510',
    ],
    'https://www.cbsnews.com/news/texas-defund-police-bill-abbott/': [
        '1396643982785105920',
    ],
    'https://newrepublic.com/article/161084/republican-retreat-governance-voter-suppression': [
        '1354556580231077891',
    ],
    'https://www.cnn.com/2021/05/17/investing/bitcoin-price-elon-musk-tesla-intl-hnk/index.html': [
        '1394001894809427971',
    ],
    'https://www.cbsnews.com/news/rand-paul-suspicious-package-white-powder/': [
        '1396973910994915328',
    ],
    'https://boston.cbslocal.com/2021/05/26/boston-dorchester-fire-fayston-street-homes-burning/': [
        '1397609773898600450',
    ],
    'https://www.msnbc.com/opinion/texas-new-abortion-law-isn-t-just-dangerous-it-s-n1267950': [
        '1394730325612404741',
    ],
    'https://www.bbc.com/news/world-europe-57250285': [
        '1396719090321010688',
    ],
    'https://www.npr.org/2021/05/21/999020140/its-now-legal-to-practice-yoga-in-alabamas-public-schools': [
        '1395482631043702787',
    ],
    'https://www.msnbc.com/opinion/republicans-say-court-packing-unconstitutional-they-re-wrong-n1265972': [
        '1387966124797661188',
    ],
}
def getDriver():
    """
    Start a muted, headless Chrome WebDriver that can be re-used for every page load.

    The chromedriver binary is first resolved through webdriver_manager. If Selenium raises a
    WebDriverException while starting Chrome that way, a 'chromedriver.exe' binary is tried
    instead.
    :return: the started Chrome WebDriver; the caller is responsible for calling quit() on it
    """
    chrome_options = Options()
    chrome_options.add_argument('--mute-audio')
    chrome_options.add_argument('--headless')
    try:
        return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    except WebDriverException:
        # Selenium 4 expects the driver binary to be wrapped in a Service object; handing the
        # path over positionally is the old Selenium 3 call style and clashes with `options`.
        return webdriver.Chrome(service=Service('chromedriver.exe'), options=chrome_options)
def _loadViaSelenium(driver, url: str, delay_secs: int = 1):
    """
    Open a page in the browser and wrap its Javascript-rendered markup in an Article.

    :param driver: Selenium WebDriver used to load the page
    :param url: address of the page to load
    :param delay_secs: seconds to wait after navigation so that scripts get a chance to run
    :return: Article built from the innerHTML of the page's <html> element
    """
    driver.get(url)
    # a fixed pause is a best-effort wait: there is no signal here telling us rendering is done
    time.sleep(delay_secs)
    # read the live DOM rather than the page source, so script-inserted embeds are included
    html_element = driver.find_element(By.TAG_NAME, "html")
    return Article(html=html_element.get_attribute('innerHTML'))
def count_tweets_goose(url):
    """
    Extract the article at a URL with goose3 and return the tweets it reports.

    Despite the name, this returns the extracted `tweets` value itself, not a count.
    :param url: address of the article; goose3 fetches it itself
    :return: the `tweets` attribute of the extracted goose3 article
    """
    # the context manager closes the Goose instance once extraction is done, instead of leaving
    # one un-closed instance behind per evaluated url
    with Goose() as g:
        article = g.extract(url=url)
        return article.tweets
def count_tweets_goose_js(html):
    """
    Run goose3 over already-fetched HTML and return the tweets it reports.

    Despite the name, this returns the extracted `tweets` value itself, not a count.
    :param html: raw HTML of the article (e.g. markup captured after Javascript rendering)
    :return: the `tweets` attribute of the extracted goose3 article
    """
    # the context manager closes the Goose instance once extraction is done, instead of leaving
    # one un-closed instance behind per evaluated page
    with Goose() as g:
        article = g.extract(raw_html=html)
        return article.tweets
def _goose_tweet_id(embed):
    """Return the run of digits that follows the first 'status/' in a goose tweet, or None."""
    _, marker, tail = embed.partition('status/')
    if not marker:
        return None
    end = 0
    while end < len(tail) and tail[end] in '0123456789':
        end += 1
    return tail[:end] or None


def _found_tweet_ids(key, found_tweets):
    """Reduce one extractor's raw results to a list of tweet ids (None where no id was found)."""
    if key in ('tweetfinder', 'tweetfinder_js'):
        # tweetfinder results are indexed by 'tweet_id'
        return [tweet['tweet_id'] for tweet in found_tweets]
    # both goose variants ('goose' and 'goose_js') come from the same extractor, so both need
    # the id pulled out of the text following 'status/'
    return [_goose_tweet_id(tweet) for tweet in found_tweets]


def _precision_recall(tp, fp, total_count):
    """Return (precision, recall) for the given counts; (0, 0) if either denominator is zero."""
    fn = total_count - tp
    try:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
    except ZeroDivisionError:
        return 0, 0
    return precision, recall


def get_stats_for_all():
    """
    Pull the total cumulative scores. Notes:
    * True Positive: tweet id in manual set and also in set from the extractor
    * False Positive: tweet id not in manual set but is in set from the extractor
    * False Negative: tweet id in manual set and not in set from the extractor

    Every url in answer_dict is processed four ways: tweetfinder and goose on the url itself,
    and tweetfinder and goose on the HTML rendered through Selenium ('_js' variants). Counts are
    accumulated over all urls, then precision and recall per extractor are written to
    'embeds_tweet_review.csv': one column per extractor, first row precision, second row
    recall (the rows carry no label because the index is not written).
    :return:
    """
    count_dict = {key: {'tp': 0, 'fp': 0}
                  for key in ('tweetfinder', 'tweetfinder_js', 'goose', 'goose_js')}
    driver = getDriver()
    try:
        for url, tweet_id_list in answer_dict.items():
            article_js = _loadViaSelenium(driver, url)
            article = Article(url=url)
            found_tweets_tweetfinder = article.list_embedded_tweets()
            found_tweets_goose = count_tweets_goose(url)
            found_tweets_tweetfinder_js = article_js.list_embedded_tweets()
            found_tweets_goose_js = count_tweets_goose_js(article_js.get_html())
            found_id_dict = {'tweetfinder': found_tweets_tweetfinder,
                             'tweetfinder_js': found_tweets_tweetfinder_js,
                             'goose': found_tweets_goose,
                             'goose_js': found_tweets_goose_js}
            for key, found_tweets in found_id_dict.items():
                key_count_dict = count_dict[key]
                for tweet_id in _found_tweet_ids(key, found_tweets):
                    # a result without a recognisable id (None) can never match: false positive
                    if tweet_id in tweet_id_list:
                        key_count_dict['tp'] += 1
                    else:
                        key_count_dict['fp'] += 1
    finally:
        # always shut the browser down, even when a page load or an extraction fails
        driver.quit()

    # number of expected ids, derived from the ground truth so it cannot drift out of sync
    total_count = sum(len(tweet_id_list) for tweet_id_list in answer_dict.values())
    stats_dict = {}
    for key, key_count_dict in count_dict.items():
        precision, recall = _precision_recall(key_count_dict['tp'], key_count_dict['fp'], total_count)
        stats_dict[key] = [precision, recall]
    eval_df = pd.DataFrame(stats_dict)
    eval_df.to_csv('embeds_tweet_review.csv', index=False)
# Run the full evaluation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    get_stats_for_all()