Add python script to cluster tweet data and store statistical results
grimmmyshini committed Oct 11, 2020
1 parent c50e570 commit 15ae386
Showing 3 changed files with 156 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -325,3 +325,4 @@ tags


db.sqlite3
/scripts/.idea/
5 changes: 5 additions & 0 deletions requirements/base.txt
@@ -2,6 +2,11 @@ pytz==2020.1 # https://github.com/stub42/pytz
python-slugify==4.0.1 # https://github.com/un33k/python-slugify
Pillow==7.2.0 # https://github.com/python-pillow/Pillow
argon2-cffi==20.1.0 # https://github.com/hynek/argon2_cffi
tweepy~=3.9.0 # https://github.com/tweepy/tweepy
python-dotenv~=0.14.0 # https://github.com/theskumar/python-dotenv
nltk~=3.5 # https://github.com/nltk/nltk
pandas~=1.1.3 # https://github.com/pandas-dev/pandas
scikit-learn~=0.23.2 # https://github.com/scikit-learn/scikit-learn

# Flask
# ------------------------------------------------------------------------------
150 changes: 150 additions & 0 deletions scripts/main.py
@@ -0,0 +1,150 @@
import os
from dotenv import load_dotenv
import tweepy as tw
import re
import string
import nltk
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK data/objects used in cleaning and preprocessing
nltk.download("punkt", quiet=True)  # word_tokenize requires the punkt tokenizer data
ps = nltk.PorterStemmer()

# Pandas dataframe for tweets
df = pd.DataFrame(columns=["id", "tweet", "popularity"])


class Results:
def __init__(self):
"""Initialize final results of the analysis"""
self.clusters_count = df.clusters.value_counts()
self.df_results = df.groupby(["clusters"]).max().reset_index()
print("Number of tweets per cluster: \n{}".format(self.clusters_count))
print("Top cluster tweets: \n{}".format(self.df_results.to_string()))


def authorise_api():
    """Authorise access to the Twitter API and return the API handler"""
load_dotenv()
consumer_key = os.getenv("CONSUMER_KEY")
consumer_key_secret = os.getenv("CONSUMER_KEY_SECRET")
access_token = os.getenv("ACCESS_TOKEN")
access_token_secret = os.getenv("ACCESS_TOKEN_SECRET")

auth = tw.OAuthHandler(consumer_key, consumer_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

return api


def get_top_trends(api):
    """Return the top trend at the location closest to the given coordinates, as a one-element list"""
# Current location: New Delhi, India. WOEID = 1 for global trending
latitude = 28.644800
longitude = 77.216721

locations = api.trends_closest(latitude, longitude)
woeid = locations[0]["woeid"]

trends = api.trends_place(woeid)
trends_dict = trends[0]["trends"]

return [trends_dict[0]]


def de_emojify(text):
    """Remove emojis and other pictographic characters from the given text"""
    regex_pattern = re.compile(pattern="["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # box drawing through misc symbols & arrows
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"  # zero-width joiner
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # variation selector
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', text)


def remove_punctuation(text):
    """Remove links, whitespace escapes, and punctuation from text"""
    text = text.replace('\n', '')
    text = text.replace('\t', '')
    text = re.sub(r'http\S+', '', text)  # remove links

    translator = str.maketrans('', '', string.punctuation)
    return text.lower().translate(translator)


def tokenize(text):
    """Stem and tokenize input text; used as the custom tokenizer for tf-idf vectorization"""
tokens = nltk.word_tokenize(text)
stems = []
for item in tokens:
stems.append(ps.stem(item))
return stems


def cluster(eps):
    """Cluster the tweets using DBSCAN with the specified eps value"""
    df["tweet_clean"] = df["tweet"].apply(lambda y: remove_punctuation(y))
    df["tweet_clean"] = df["tweet_clean"].apply(lambda y: de_emojify(y))

    vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=1)
    x = vectorizer.fit_transform(df.loc[:, "tweet_clean"])

    db = DBSCAN(eps=eps, min_samples=20).fit(x)

    df["clusters"] = db.labels_
    print("Number of unique clusters generated: {}".format(df.clusters.nunique()))


def stream(api, find_word):
    """Stream tweets containing the specified word into the dataframe"""
query = (
find_word + " -filter:retweet" + " -filter:media" + " -filter:links"
)
i = 0
limit = 1000
tweet_count = 100

for tweet in tw.Cursor(
api.search, q=query, count=tweet_count, lang="en", result_type="mixed"
).items():
df.loc[i, "id"] = tweet.id
df.loc[i, "tweet"] = tweet.text
df.loc[i, "popularity"] = tweet.favorite_count + tweet.retweet_count
i += 1

        if i > limit:
            break


def main():
api = authorise_api()
top_trend = get_top_trends(api)
stream(api, top_trend[0]["name"])

    eps = 1.29
    cluster(eps)

Results()


if __name__ == "__main__":
main()
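
For local runs, authorise_api expects the four Twitter credentials to be loadable by python-dotenv. A minimal .env sketch with placeholder values (real keys come from a Twitter developer account and should stay out of version control):

CONSUMER_KEY=<your-consumer-key>
CONSUMER_KEY_SECRET=<your-consumer-key-secret>
ACCESS_TOKEN=<your-access-token>
ACCESS_TOKEN_SECRET=<your-access-token-secret>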
