Add python script to cluster tweet data and store statistical results
grimmmyshini committed Oct 11, 2020
1 parent c50e570 commit 15ae386
Showing 3 changed files with 156 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -325,3 +325,4 @@ tags


db.sqlite3
/scripts/.idea/
5 changes: 5 additions & 0 deletions requirements/base.txt
@@ -2,6 +2,11 @@ pytz==2020.1 # https://github.com/stub42/pytz
python-slugify==4.0.1 # https://github.com/un33k/python-slugify
Pillow==7.2.0 # https://github.com/python-pillow/Pillow
argon2-cffi==20.1.0 # https://github.com/hynek/argon2_cffi
tweepy~=3.9.0 # https://github.com/tweepy/tweepy
python-dotenv~=0.14.0 # https://github.com/theskumar/python-dotenv
nltk~=3.5 # https://github.com/nltk/nltk
pandas~=1.1.3 # https://github.com/pandas-dev/pandas
scikit-learn~=0.23.2 # https://github.com/scikit-learn/scikit-learn

# Flask
# ------------------------------------------------------------------------------
150 changes: 150 additions & 0 deletions scripts/main.py
@@ -0,0 +1,150 @@
import os
from dotenv import load_dotenv
import tweepy as tw
import re
import string
import nltk
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK data/objects used in cleaning and preprocessing
nltk.download("punkt", quiet=True)  # word_tokenize requires the punkt tokenizer data
ps = nltk.PorterStemmer()

# Pandas dataframe for tweets
df = pd.DataFrame(columns=["id", "tweet", "popularity"])


class Results:
def __init__(self):
"""Initialize final results of the analysis"""
self.clusters_count = df.clusters.value_counts()
self.df_results = df.groupby(["clusters"]).max().reset_index()
print("Number of tweets per cluster: \n{}".format(self.clusters_count))
print("Top cluster tweets: \n{}".format(self.df_results.to_string()))


def authorise_api():
    """Authorise access to the Twitter API and return the API handler"""
load_dotenv()
consumer_key = os.getenv("CONSUMER_KEY")
consumer_key_secret = os.getenv("CONSUMER_KEY_SECRET")
access_token = os.getenv("ACCESS_TOKEN")
access_token_secret = os.getenv("ACCESS_TOKEN_SECRET")

auth = tw.OAuthHandler(consumer_key, consumer_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

return api


def get_top_trends(api):
    """Return the top trend at the location closest to the given coordinates, as a one-element list"""
# Current location: New Delhi, India. WOEID = 1 for global trending
latitude = 28.644800
longitude = 77.216721

locations = api.trends_closest(latitude, longitude)
woeid = locations[0]["woeid"]

trends = api.trends_place(woeid)
trends_dict = trends[0]["trends"]

return [trends_dict[0]]


def de_emojify(text):
    """Remove emojis and other pictographic characters from the given text"""
    regex_pattern = re.compile(pattern="["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # box drawing through misc symbols & arrows
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"  # zero-width joiner
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # variation selector
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', text)


def remove_punctuation(text):
    """Remove links, whitespace escapes, and punctuation from text"""
    text = text.replace('\n', '')
    text = text.replace('\t', '')
    text = re.sub(r'http\S+', '', text)  # remove links

    translator = str.maketrans('', '', string.punctuation)
    return text.lower().translate(translator)


def tokenize(text):
    """Stem and tokenize input text; used as the custom tokenizer for tf-idf vectorization"""
tokens = nltk.word_tokenize(text)
stems = []
for item in tokens:
stems.append(ps.stem(item))
return stems


def cluster(eps):
    """Cluster the tweets using DBSCAN with the specified eps value"""
    df["tweet_clean"] = df["tweet"].apply(lambda y: remove_punctuation(y))
    df["tweet_clean"] = df["tweet_clean"].apply(lambda y: de_emojify(y))

    vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=1)
    x = vectorizer.fit_transform(df.loc[:, "tweet_clean"])

    db = DBSCAN(eps=eps, min_samples=20).fit(x)

    df["clusters"] = db.labels_
    print("Number of unique clusters generated: {}".format(df.clusters.nunique()))


def stream(api, find_word):
    """Stream tweets containing the specified word into the dataframe"""
query = (
find_word + " -filter:retweet" + " -filter:media" + " -filter:links"
)
i = 0
limit = 1000
tweet_count = 100

for tweet in tw.Cursor(
api.search, q=query, count=tweet_count, lang="en", result_type="mixed"
).items():
df.loc[i, "id"] = tweet.id
df.loc[i, "tweet"] = tweet.text
df.loc[i, "popularity"] = tweet.favorite_count + tweet.retweet_count
i += 1

        if i > limit:
            break


def main():
api = authorise_api()
top_trend = get_top_trends(api)
stream(api, top_trend[0]["name"])

    eps = 1.29
    cluster(eps)

Results()


if __name__ == "__main__":
main()
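
For local runs, authorise_api expects the four Twitter credentials to be loadable by python-dotenv. A minimal .env sketch with placeholder values (real keys come from a Twitter developer account and should stay out of version control):

CONSUMER_KEY=<your-consumer-key>
CONSUMER_KEY_SECRET=<your-consumer-key-secret>
ACCESS_TOKEN=<your-access-token>
ACCESS_TOKEN_SECRET=<your-access-token-secret>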
