-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathCollecting Reddit Comments
85 lines (70 loc) · 17.7 KB
/
Collecting Reddit Comments
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
!pip install psaw
!pip install praw
import pandas as pd
import numpy as np
import time
import glob
import json
import plotly.graph_objects as go
import tqdm
from tqdm import trange
from tqdm import tqdm
from psaw import PushshiftAPI
import datetime as dt
import praw
# Settings handed to praw.Reddit. Every "<...>" value is a placeholder that
# has to be replaced with real application credentials before running.
_REDDIT_SETTINGS = {
    "client_id": "<client_id>",
    "client_secret": "<client_secret>",
    "username": "<username>",
    "password": "<password>",
    "user_agent": "<user_agent>",
    "check_for_async": False,
}

# Reddit client, used below to read submissions and comments; it is switched
# to read-only mode because the script never posts anything.
reddit = praw.Reddit(**_REDDIT_SETTINGS)
reddit.read_only = True

# Pushshift client, used below to search for submissions.
api = PushshiftAPI()
# Lower time bound (Unix epoch seconds) passed as `after=` to the Pushshift
# submission search. The datetime is made explicitly UTC: calling
# .timestamp() on a naive datetime interprets it in the local timezone of the
# machine, which would shift the cutoff by that machine's UTC offset and make
# the collected data set depend on where the script is run.
start_epoch = int(dt.datetime(2022, 1, 1, tzinfo=dt.timezone.utc).timestamp())
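# News categories: each key is a category label and each value is the list of
# site strings for that category. A post is assigned a category when one of
# the category's site strings occurs in the URL of the post's article.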
categories = {
'mainstream': ['bloomberg.com', 'fortune.com', 'theguardian.com', 'npr.org', 'salon.com', 'newsweek.com', 'politico.com', 'pbs.org', 'economist.com', 'nbc.com', 'abcnews.com', 'cbsnews.com', 'cnn.com', 'foxnews.com', 'huffingtonpost.com', 'msnbc.com', 'nytimes.com', 'usatoday.com', 'washingtonpost.com', 'wsj.com', 'aol.com/news', 'Bbc.co.uk', 'bostonglobe.com', 'chicagotribune.com', 'dailymail.co.uk', 'latimes.com', 'news.yahoo.com', 'nydailynews.com', 'nypost.com', 'sfchronicle.com'],
'democrat': ['aquinas.edu', 'bluevirginia.us', 'care2.com', 'thedailybeast.com', 'dailykos.com', 'esquire.com', 'fcnp.com', 'alternet.org', 'arkansasonline.com', 'boingboing.net', 'currentaffairs.org', 'democracynow.org', 'heralddemocrat.com', 'huffingtonpost.com', 'jacobinmag.com', 'mashable.com', 'msnbc.com', 'newrepublic.com', 'nymag.com', 'politicususa.com', 'rollingstone.com', 'sfchronicle.com', 'slate.com', 'socialistalternative.org', 'splinternews.com', 'mediamatters.org', 'motherjones.com', 'nydailynews.com', 'peacock-panache.com', 'progressivevoicesofiowa.com', 'rawstory.com', 'salon.com', 'socialistproject.ca', 'canyoncountryzephyr.com', 'theintercept.com', 'newyorker.com', 'upworthy.com', 'screen.yahoo.com', 'thenation.com', 'thinkprogress.org', 'vice.com', 'vox.com', 'yesmagazine.org'],
'leaning_democrat': ['mystatesman.com', 'buzzfeed.com', 'cbsnews.com', 'publicintegrity.org', 'chicago.suntimes.com', 'dailynorthwestern.com', 'grist.org', 'hbswk.hbs.edu', 'indyweek.com', 'abcnews.go.com', 'ajc.com', 'bustle.com', 'centre-view.com', 'cnn.com', 'countercurrents.org', 'dailytargum.com', 'lasvegassun.com', 'latimes.com', 'miamiherald.com', 'michigandaily.com', 'today.com', 'nytimes.com', 'sfgate.com', 'state-journal.com', 'teenvogue.com', 'centralkynews.com', 'timescall.com', 'mediaite.com', 'mtv.com', 'nbcnews.com', 'newsweek.com', 'psmag.com', 'politico.com', 'politifact.com', 'mercurynews.com', 'skyhidailynews.com', 'spokesman.com', 'bostonglobe.com', 'commercialappeal.com', 'delcotimes.com', 'theguardian.com', 'thejustice.org', 'sacbee.com', 'time.com', 'usnews.com', 'vanityfair.com', 'vtdigger.org', 'washingtonmonthly.com', 'wisconsingazette.com', 'theatlantic.com', 'cadizrecord.com', 'courier-journal.com', 'thedailyshow.com', 'economist.com', 'philly.com', 'theroot.com', 'theverge.com', 'truth-out.org', 'univision.com', 'washingtonpost.com'],
'leaning_republican': ['bostonherald.com', 'deseretnews.com', 'drudgereport.com', 'thefiscaltimes.com', 'foxnews.com', 'dailypress.com', 'hotair.com', 'ijreview.com', 'intellectualconservative.com', 'investors.com', 'ocregister.com', 'post-gazette.com', 'judicialwatch.org', 'leesburgtoday.com', 'liveactionnews.org', 'oann.com', 'quillette.com', 'reason.org', 'richmond.com', 'theamericanconservative.com', 'theepochtimes.com', 'thelibertarianrepublic.com', 'telegraph.co.uk', 'washingtonexaminer.com', 'washingtontimes.com', 'watchdog.org'],
'republican': ['spectator.org', 'bearingdrift.com', 'cnsnews.com', 'conservativehq.com', 'inacow.com', 'americanthinker.com', 'breitbart.com', 'cbn.com', 'city-journal.org', 'commentarymagazine.com', 'dailymail.co.uk', 'frontpagemag.com', 'infowars.com', '28mrc.org', 'newsmax.com', 'rightwingnews.com', 'ksl.com', 'michellemalkin.com', 'nationalreview.com', 'nypost.com', 'pjmedia.com', 'redstate.com', 'rightsidenews.com', 'dailycaller.com', 'dailysignal.com', 'theblaze.com', 'westernjournalism.com', 'thecollegefix.com', 'dailywire.com', 'thefederalist.com', 'thegatewaypundit.com', 'weeklystandard.com', 'townhall.com', 'freebeacon.com', 'whatfinger.com', 'wnd.com'],
'fake_news': ['angrypatriotmovement.com', 'yesimright.com', 'worldnewsdailyreport.com', 'conservative101.com', 'usanewsflash.com', 'firstpost.com', 'dailysnark.com', 'postcard.news', 'higherperspectives.com', 'dailypost.ng', 'celebtricity.com', 'learnprogress.org', 'viralliberty.com', 'noticias-frescas.com', 'tmzworldnews.com', 'healthnut-23news.com', 'bb4sp.com', 'freedomsfinalstand.com', 'awarenessact.com', 'nation.com.', 'actualidadpanamericana.com', 'tmzhiphop.com', 'butthatsnoneofmybusiness.com', 'supremepatriot.com', 'usasupreme.com', 'patriotcrier.com', 'actualite.co', 'consciouslyenlightened.com', 'truthkings.com', 'now8news.com', 'activistpost.com', 'newzmagazine.com', '12minutos.com', 'reflectionofmind.org', 'ladylibertysnews.com', 'wearechange.org', 'teddystick.com', 'everynewshere.com', 'liberalplug.com', 'thevalleyreport.com', 'usherald.com', 'newsrescue.com', 'badcriminals.com', 'politicalmayhem.news', 'viralmugshot.com', 'conservativepoliticus.com', 'consnation.com', 'greenvillegazette.com', 'redrocktribune.com', 'the-postillon.com', 'ustruthwire.com', 'revolutions2040.com', 'dailynewsposts.info', 'viralstuppid.com', 'therealstrategy.com', 'journaldemourreal.com', 'ncscooper.com', 'donaldtrumpnews.co', 'washingtonfeed.com', 'bizstandardnews.com', 'theeventchronicle.com', 'smhwtfnews.com', 'whatsupic.com', 'thefrt.com', 'fury.news', 'worldpoliticus.com', 'freewoodpost.com', 'nymeta.co', 'tmzbreaking.com', 'politicops.com', 'channel24news.com', 'patriothangout.com', 'spinzon.com', 'thepoliticaltribune.com', 'clear-politics.com', 'enhlive.com', 'medicalkidnap.com', 'zootfeed.com', 'tmzuncut.com', 'nativestuff.us', 'theracketreport.com', 'abcnews.com.co', 'dailynews11.com', 'heaviermetal.net', 'notallowedto.com', 'dailyusaupdate.com', 'friendsofsyria.wordpress.com', 'westernsentinel.com', 'politicalo.com', 'openmagazines.com', 'nationalinsiderpolitics.com', 'viralactions.com', 'president45donaldtrump.com', 'tdnewswire.com', 'morningnewsusa.com', 
'embols.com', 'whydontyoutrythis.com', 'nbc.com.co', 'ideaspots.com', 'adobochronicles.com', 'bignuggetnews.com', 'statenation.co', 'channel23news.com', 'qualitysharing.com', 'uspostman.com', 'cooltobeconservative.com', 'eutimes.net', 'cnoticias.net', 'superstation95.com', 'thefreepatriot.org', 'smag31.com', 'newslo.com', 'usatelevision.com', 'shariaunveiled.wordpress.com', 'jookos.com', 'conservativearmy88.com', 'thepeoplescube.com', 'gummypost.com', 'stateofthenation2012.com', 'cartelreport.com', 'dailyinsidernews.com', 'impulsetoday.com', 'stuppid.com', 'religionmind.com', 'channel28news.com', 'anonews.co', 'viralspeech.com', 'trunews.com', 'thenationalmarijuananews.com', 'choiceandtruth.com', 'channel5000.com', 'voxtribune.com', 'thenewsclub.info', 'hotglobalnews.com', 'folksvideo.com', 'actualites.co', 'mainerepublicemailalert.com', 'kupr7.com', 'gotnews.com', 'wetheproudpatriots.com', 'observatorial.com', 'dailyfinesser.com', 'thenewyorkevening.com', 'dailystormer.com', 'satiratribune.com', 'religionlo.com', 'mrnewswatch.com', 'americantoday.news', 'en-bref.fr', 'thenewsnerd.com', 'readconservatives.news', 'christiantimesnewspaper.com', 'newsexaminer.net', 'channel22news.com', 'politicot.com', 'drugsofficial.com', 'konkonsagh.biz', 'intrendtoday.com', 'newsdaily27.com', '20minutenews.com', 'universepolitics.com', 'bigbluevision.org', 'uspoln.com', 'wmacnews.com', 'theseattletribune.com', 'usadailypost.us', 'newsbreakshere.com', '24ak-24tuelles.com', 'houstonchronicle-tv.com', 'viralcocaine.com', 'daily-sun.com', 'donaldtrumppotus45.com', 'londonwebnews.com', 'conservativeinsider.co', 'whatdoesitmean.com', 'federalistnation.com', 'unitedmediapublishing.com', 'endoftheamericandream.com', 'knp7.com', 'straightstoned.com', 'thesmokersclub.com', 'newsdaily12.com', 'sourceplanet.net', 'flashinfo.org', 'nachrichten.de.com', 'conservativeflashnews.com', 'dailypresser.com', 'sundayinquirer.com', 'channel16news.com', 'therooster.com', 'blackinsurancenews.com', 
'socialeverythings.com', 'libertyalliance.com', 'therightists.com', 'civictribune.com', 'baldwinpost.com', 'thebostontribune.com', 'dailybuzzlive.com', 'theexaminer.site', 'newsuptoday.com', 'baltimoregazette.com', 'urbanimagemagazine.com', 'wleb21.com', 'headlinebrief.com', 'thelastamericanvagabond.com', 'themiamigazette.com', 'channel18news.com', 'km8news.com', 'ourlandofthefree.com', 'usadailytime.com', 'wcpm3.com', 'krbcnews.com', 'thebigriddle.com', 'ushealthyadvisor.com', 'oreillypost.com', 'aldipest.com', 'kspm33.com', 'guerilla.news', 'ky12news.com', 'clancyreport.com', 'teoinfo.com', 'thenochill.com', 'localnews33.com', 'snoopack.com', 'dailynews3.com', 'stillnessinthestorm.com', 'usaconservativereport.com', 'news4ktla.com', 'americanflare.com', 'nnettle.com', 'usa-radio.com', 'mckenziepost.com', 'diyhours.net', 'factrider.com', 'telegraphsun.com', 'theusa-news.com', 'redinfo.us', 'statestv.com', 'toutelinfo.fr', 'everydaybreakingnews.com', 'redpolitics.us', 'tmzcomedy.com', 'mbynews.com', 'newpoliticstoday.com', 'newsbiscuit.com', 'viralpropaganda.com', 'ky6news.com', 'usanewshome.com', 'urdoca.com', 'americafans.com', 'asamericanasapplepie.org', 'city-herald.com', 'kbc14.com', 'christiantoday.info', 'tmzurban.com', 'heightpost.com', 'alynews.com', 'anews24.org', 'undergroundnewsreport.com', 'viralcords.com', 'reaganwasright.com', 'borderherald.com', 'wm21news.com', 'cnnews3.com', 'famousviralstories.com', 'usa360-tv.com', 'buzzfeedusa.com', 'wtoe5news.com', 'usapolitics24hrs.com', 'regated.com', 'politicsusanews.com', 'puppetstringnews.com', 'ushealthylife.com', 'klponews.com', 'fox-news24.com', 'politicono.com', 'conservativestudio.com', 'getoffthebs.com', 'conservative7.com', 'alertchild.com', 'libertycourier.com', 'local31news.com', 'forfreedomworld.com', 'net-infosnews.com', 'thesolexchange.com', 'conservativespirit.com', 'worldinformation24.info', 'landrypost.com', 'success-street.com', 'dailynews10.com', 'bluevisionpost.com', 'cartelpress.com', 
'stgeorgegazette.com', 'coffeebreakforyou.com', 'usafirstinformation.com', 'dailynews5.com', 'usasnich.com', 'fanzinger.com', 'thereporterz.com', 'usatodaynews.me', 'tdtalliance.com', 'theinternationalreporter.org', 'newsdaily10.com', 'worldnewscircle.com', 'usinfonews.com', 'focusnews.info', 'newsfeedhunter.com', 'maywoodpost.com', 'pressunion.org', 'dailyheadlines.net', '24x365live.com', 'thenationalpatriot.com', 'usaworldbox.com', 'thenationalsun.com', '24wpn.com', 'flashnewscorner.com', 'usapoliticszone.com', 'potatriotpost.com', 'chuckcallesto.blogspot.com', 'battypost.com', 'thewashingtonpress.com', 'kcst7.com', 'lastdeplorables.com', 'floridasunpost.com', 'trueamericans.me', 'unitednews.org', 'the-insider.co', 'healthyworldhouse.com', 'ilovenativeamericans.us', 'areyousleep.com', 'kata33.com', 'onlysimchas.com', 'rilenews.com', '247newsmedia.com', 'mentor2day.com', 'breakingtop.world', 'politicass.com', 'americantoday.us', 'isthatlegit.com', 'dineal.com', 'thelastlineofdefense.online', 'bestthings.us', 'rightsidenews.com', 'healthycareandbeauty.com', 'persecutes.com', 'usatodaypolitics.com', 'newsoftrump.com', 'usanews4u.us', 'globalpoliticsnow.com', 'proudleader.com', '365usnews.com', 'observeronline.news', 'ladiesofliberty.net', 'tmzworldstarnews.com', 'tmzworldstar.com', 'newsleak.co', 'myfreshnews.com', 'fedsalert.com', 'newsformetoday.com', 'thepremiumnews.com', 'anews-24.com', 'theusaconservative.com', 'uspoliticsinfo.com', 'the-global-news.com', 'bluevision.news', 'kypo6.com', 'dailyfeed.news', 'dailysidnews.com', 'newshubs.info', 'newsfeedobserver.com', 'weconservative.com', 'thirdestatenewsgroup.com', 'halfwaypost.com', 'freedomcrossroads.us', '16wmpo.com', 'damnleaks.com', 'wrpt16.com', 'macedoniaonline.eu', 'mississippiherald.com', 'alabamaobserver.com', 'newsconservative.com', 'freeinfomedia.com', 'wazanews.tk', 'halturnershow.com', 'conservativeview.info', 'thetrumppers.com', 'armyusanews.com', '24usainfo.com', 'channel17news.com', 
'wy21news.com', 'usadosenews.com', 'cnewsgo.com', 'worldpoliticsnow.com', 'bostonleader.com', 'weekendpoliticalnews.com', 'morningherald.com', 'westfieldpost.com', 'newsjustforyou1.blogspot.com', 'weekendherald.com', 'aurora-news.us', 'washingtonevening.com', 'americanprides.com', 'empiresports.co', 'chicksontheright.com', 'majorthoughts.com', 'dailyinfobox.com', 'kty24news.com', 'nunadisbereel.com', 'metropolitanworlds.com', 'americasnewest.com', 'dailyworldupdate.com', 'interestingdailynews.com', 'conservativepaper.com', 'nephef.com', 'wrpm33.com', 'americanflavor.news', 'politicspaper.com', 'powerdaily.us', 'nydailynews-tv.com', 'blog.veterantv.net', 'rumorjournal.com', 'ihavethetruth.com', 'xbn-news.com', 'givemeliberty01.com', 'usa-conservative.com', 'americanpeoplenetwork.com', 'consinfo.us', 'amposts.com', 'jacksontelegraph.com', 'viraldevil.com', 'channel34news.com', 'channel40news.com', 'benjaminfulford.typepad.com', 'positivedaily.com', 'thenet24h.com', 'globalassociatednews.com', 'themoralofthestory.us', 'theinternetpost.net', 'departedmedia.com', 'newjournal.us', 'americanpresident.co', 'vesselnews.io', 'rogue-nation3.com', 'heatst.com', 'prntly.com', 'dailythings.world', 'freedomdaily.com', 'witscience.org', 'worldpoliticus.com', 'proudcons.com', 'truetrumpers.com', '24online.news', 'usapoliticstoday.com', 'usapoliticsnow.com', 'breakingnewsblast.com', 'libertywritersnews.com', 'defenseusa.club', 'msfanpage.link', 'departed.co', 'scrapetv.com', 'worldnewspolitics.com', 'empireherald.com', 'nevo.news', 'a-news24.com', 'abcnews-us.com', 'albertatimes.com', 'americanpoliticnews.co', 'anonjekloy.tk', 'antinews.com', 'areyouasleep.com', 'associatedmediacoverage.com', 'belgique.actualites.co', '26bients.com', 'bluelinestrong.net', 'bostontribune.com', 'channel-7-news.com', 'channel33news.com', 'channel46news.com', 'channel55news.com', 'channel56news.com', 'channel59news.com', 'channel60news.com', 'channel62news.com', 'channel63news.com', 'channel65news.com', 
'channel66news.com', 'channel68news.com', 'channel77news.com', 'cnn-business-news.ga', 'cnn-globalnews.com', 'cnn-internationaledition.com', 'cnn.com.de', 'conservativeinfocorner.com', 'countyweekly.com', 'dailynews33.com', 'dailynewsbin.com', 'dallastimes.us', 'denverguardian.com', 'denverinquirer.com', 'departedme.com', 'dublintribune.com', 'endingthefed.com', 'federalisttribune.com', 'france.actualites.co', 'freddymag.com', 'freedomjunkshun.com', 'freeinquirer.com', 'freshdailyreport.com', 'globalrevolutionnetwork.com', 'guerillanews.com', 'houstonleader.com', 'jokerviral.com', 'kf13.com', 'kmt11.com', 'krb7.com', 'lopezreport.com', 'mediamaxzone.com', 'mediazone.news', 'msnbc.website', 'nachrichten365.com', 'nativeamericans.us', 'newcenturytimes.com', 'news14kgpn.com', 'news14now.com', 'newsbreakhere.com', 'newsbreakingspipe.com', 'newsbuzzdaily.com', 'newsbysquad.com', 'newsnow17.com', 'newsphd.com', 'newswatch33.com', 'noticias365.info', 'notizzia.com', 'paris.actualites.co', 'patriotusa.website', 'redcountry.us', 'religiousmind.com', 'scaryhours.com', 'southernconservativeextra.com', 'sundaypost.org', 'surreytelegraph.com', 'sydneybulletin.com', 'thatviralfeed.net', 'theavocadonews.com', 'thebreakingnews.co', 'thelastlineofdefense.org', 'therealshtick.com', 'thetrumpmedia.com', 'theworldupdate.com', 'times.com.mx', 'timesofcambodia.com', 'tmzbusiness.com', 'usadailyinfo.com', 'usadailythings24.com', 'usanewstoday.com', 'usuncut.com', 'vancouverinquirer.com', 'veteransfordonaldtrump.com', 'viralstupid.com', 'washingtonpost.com.co', 'welovenative.com', 'werk35.com', 'wftj8news.com', 'wmb36.com', 'world.politics.com', 'wrejnews.com', 'clashdaily.com', 'downtrend.com', 'conservativedailypost.com', 'onepoliticalplaza.com', 'bb4sp.com', 'beforeitsnews.com', 'whatdoesitmean.com', 'socialeverythings.com', 'angrypatriotmovement.com', 'newsbreakshere.com', 'realnewsrightnow.com', 'notallowedto.com', 'now8news.com', 'react365.com', 'americannews.com', 
'dailybuzzlive.com', 'thenewyorkevening.com', 'freedomdaily.com', 'channel24news.com', 'yournewswire.com', 'newspunch.com']
}
# Ask Pushshift for at most 1000 submissions in the 'politics' subreddit
# after start_epoch, restricted to the 'title' and 'id' fields, and
# materialise the results so they can be iterated (and measured) later.
submission_query = api.search_submissions(
    after=start_epoch,
    subreddit='politics',
    filter=['title', 'id'],
    limit=1000,
)
submissions = list(submission_query)
# Map each submission id to the ids of its comments that are worth keeping.
comments_dict = {}
for submission in tqdm(submissions):
    submission_id = submission.id
    kept_ids = []
    for comment in reddit.submission(submission_id).comments:
        # Entries without a `body` attribute carry no text and are skipped.
        if not hasattr(comment, 'body'):
            continue
        text = comment.body
        # Drop the subreddit's reminder notice and comments whose text has
        # been replaced by a removed/deleted placeholder.
        if 'As a reminder, this subreddit' in text or text in ('[removed]', '[deleted]'):
            continue
        kept_ids.append(comment.id)
    comments_dict[submission_id] = kept_ids
# Create an array of arrays in the form
# [post id, article, news categories, comment text].
# This takes a long time because every comment's text is fetched from Reddit
# individually (the original author reports about 20 hours for a full run).
#
# The accumulator is created here; without this the first append below
# raises NameError.
full_comments_array = []
for post_id, comment_ids in tqdm(comments_dict.items()):
    if not comment_ids:
        # No usable comments: skip the submission lookup altogether.
        continue

    # The article URL and its categories depend only on the post, so they are
    # resolved once per post instead of once per comment (each URL lookup is
    # a network request).
    article = reddit.submission(post_id).url
    article_lower = article.lower()

    # A category is listed at most once, however many of its sites match.
    # Matching is a case-insensitive substring test so that entries such as
    # 'Bbc.co.uk' can match a lowercase URL.
    news_types = [
        category
        for category, sites in categories.items()
        if any(site.lower() in article_lower for site in sites)
    ]
    if not news_types:
        # Posts whose article matches no known site contribute no rows.
        continue

    for comment_id in comment_ids:
        comment_text = reddit.comment(comment_id).body
        # Each row gets its own copy of the category list so rows never share
        # a mutable object.
        full_comments_array.append(
            [post_id, article, list(news_types), comment_text]
        )
# Write the collected rows to 'full_comments.csv', one column per row field.
# Column order follows the position of each field inside a row.
column_names = ['Post ID', 'Article', 'News Categories', 'Comment Text']
full_comments_csv = pd.DataFrame({
    name: [row[position] for row in full_comments_array]
    for position, name in enumerate(column_names)
})
full_comments_csv.to_csv('full_comments.csv')