
Commit

Merge pull request #1 from ericnost/remove_twitter
Remove twitter
ericnost authored Dec 15, 2023
2 parents 7df8a78 + 1c2e916 commit be7b144
Showing 8 changed files with 6 additions and 159 deletions.
Binary file removed dist/observatory-0.0.2-py3-none-any.whl
Binary file removed dist/observatory-0.0.2.tar.gz
Binary file removed dist/observatory-1.1.0-py3-none-any.whl
Binary file removed dist/observatory-1.1.0.tar.gz
Binary file added dist/observatory-1.2.0-py3-none-any.whl
Binary file added dist/observatory-1.2.0.tar.gz
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"

[project]
name = "observatory"
version = "1.1.0"
version = "1.2.0"
authors = [
{ name="Eric Nost", email="[email protected]" },
]
@@ -24,7 +24,7 @@ dependencies = [
"Requests==2.31.0",
"Scrapy==2.6.1",
"tldextract==3.2.0",
"wayback==0.3.2"
"wayback>=0.3.2"
]

[project.urls]
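The only substantive changes to pyproject.toml are the version bump to 1.2.0 and the loosened wayback pin (== to >=), so any wayback release at or above 0.3.2 now satisfies the dependency. A minimal sketch, not part of this commit, of how a consumer environment could confirm it meets that floor using only the standard library; the helper name and the hard-coded minimum are illustrative assumptions:

from importlib.metadata import PackageNotFoundError, version

def wayback_meets_floor(minimum=(0, 3, 2)):
    """Return True if the installed wayback package is at least `minimum` (assumed floor from pyproject.toml)."""
    try:
        installed = version("wayback")  # e.g. "0.4.5"
    except PackageNotFoundError:
        return False  # wayback is not installed at all
    parts = []
    for piece in installed.split("."):
        if piece.isdigit():
            parts.append(int(piece))
        else:
            break  # stop at pre-release or dev suffixes such as "2b1"
    return tuple(parts) >= minimum

print(wayback_meets_floor())  # expected True in an environment built from this commit's wheel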
161 changes: 4 additions & 157 deletions src/observatory.py
@@ -8,7 +8,7 @@
project = "default"

# Set passwords
credentials = {"twitter":"", "google":{"devkey": "", "cx": ""}, "postgres":{"user":"", "db":"", "password":""}}
credentials = {"google":{"devkey": "", "cx": ""}, "postgres":{"user":"", "db":"", "password":""}}

def start_project(project):
# Create project directory
@@ -26,159 +26,6 @@ def start_project(project):
# Global imports
import pandas

# Twitter search
# Based on https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Full-Archive-Search/full-archive-search.py
def search_twitter(q = None, project = None):
    # Import helper code
    import requests
    import os
    import pandas
    import time

    # Activates permissions
    bearer_token = credentials["twitter"]
    headers = {"Authorization": "Bearer {}".format(bearer_token)}

    search_url = "https://api.twitter.com/2/tweets/search/all"

    # Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
    # expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
    # https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
    # https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all

    query_params = {'query': 'from:ericnost has:links', #default to my tweets!
                    'tweet.fields': 'created_at,entities,public_metrics',
                    'max_results': 100,
                    'start_time': '2006-03-21T00:00:00.00Z', #'2022-01-01T00:00:00.00Z', #2006-03-21T00:00Z = beginning of Twitter time
                    'end_time': '2022-01-01T00:00:00.00Z' # Default to end at the end of 2021
                    }

    # Global params
    results = [] # Full results

    # update query params
    if q != None: query_params["query"] = q
    # Export path
    path = project+"/twitter_search_results_"+query_params["query"]+".csv"

    def connect_to_endpoint(url, headers, params):
        response = requests.request("GET", search_url, headers=headers, params=params)
        #print(response.status_code,response.json()) # Debugging
        if response.status_code != 200:
            raise Exception(response.status_code, response.text)
        return response.json()

    def export(path):
        """
        export what we have so we don't lose it
        """
        dump = pandas.DataFrame(results)
        dump.to_csv(path)
        #print("exported") # Debugging

    def analysis(json_response):
        # pull out created_at, urls - expanded_url, a sum of public metrics, and, if relevant, next_token
        for result in json_response["data"]:
            #print(result) # Debugging
            #for each link in each tweet:
            if "entities" in result and "urls" in result["entities"]: # means we only save tweets with links. unnecessary with -has:links ?
                for url in result["entities"]["urls"]:
                    tweeted_link = {"date": None, "link": None, "tweet_id": None, "metrics": None, "source": "Twitter", "query": query_params["query"]}
                    # pull out created at
                    tweeted_link["date"] = result["created_at"]
                    tweeted_link["link"] = url["expanded_url"]
                    # sum public metrics
                    tweeted_link["metrics"] = sum(result["public_metrics"].values())
                    tweeted_link["tweet_id"] = result["id"]
                    results.append(tweeted_link)
            else:
                pass

    def query():
        """
        limit = 1 request (100 tweets) / 1 second
        also, 300 requests (30,000 tweets) per 15-minute window
        so, really limit = 1 request / 3 second
        slow down!
        """
        time.sleep(5)

        # Return the data
        json_response = connect_to_endpoint(search_url, headers, query_params)
        #print(json.dumps(json_response, sort_keys=True)) # Debugging

        return json_response

    def get_tweets():
        #if we have a next page token from a previous query, get rid of it
        if "next_token" in query_params.keys():
            del query_params["next_token"]
        print(query_params)

        results.clear() # clear results from previous searches

        next = True
        count = 1

        while next:
            try:
                #Get the data
                json_response = query()

                #Parse the data
                if json_response["meta"]["result_count"] > 0:
                    analysis(json_response)

                # export the current state of results every 2000 results...
                if count % 20 == 0:
                    export(path)

                # update query params to next
                if "next_token" in json_response["meta"]:
                    next_token = json_response["meta"]["next_token"]
                    query_params["next_token"] = next_token
                    count += 1
                    print(count, end='\r')
                else:
                    next = False
                    export(path)
                    print("Done!")

            except: # if it breaks, export what we have
                export(path)

    get_tweets()
    return results

"""### Process Twitter search(es)"""

def twitter_process(filespath = None, project = None):
    # Cobble together output from Twitter query CSVs
    import glob
    import csv
    import pandas

    if filespath == None:
        filespath = project+"/*twitter*.csv" # Default path # Colab /content/*.csv

    filenames = glob.glob(filespath)

    if len(filenames) > 1: # if we have multiple files
        combined_csv = pandas.concat( [ pandas.read_csv(f, header=0, encoding='utf-8') for f in filenames ], join="inner" )
    else: # if we have just one
        combined_csv = pandas.read_csv(filenames[0], header=0, encoding='utf-8')
    #combined_csv.columns = ['original_index', 'date', 'link', 'tweet_id', 'metrics', 'source', 'query', 'new_index'] # Rename columns (?)

    twitter = combined_csv
    twitter = twitter[~twitter["link"].astype(str).str.contains("twitter.com")] # Remove twitter.com links. We don't want links to other tweets.
    twitter = twitter.sort_values(by=['date']) # Sorts oldest to top
    twitter = twitter.drop_duplicates(subset='link', keep="first") # Keep the first link, which should be the oldest - the most conservative approach in terms of timeline
    twitter.reset_index(inplace=True, drop=True)
    twitter = twitter[['date','link','metrics','source','query']] # Drop unnecessary columns
    twitter.to_csv(project+"/twitter_search_results_compiled.csv") # Export compiled twitter queries

    return twitter


"""## Google search (Python) v.1
@@ -506,7 +353,7 @@ class NoVersions(Error):
import requests
import fnmatch
from urllib.parse import urljoin, urlparse
from wayback import WaybackClient #This is a hidden requirement - never gets installed
from wayback import WaybackClient

# Set up
pages["full_url"] = None # The actual url and not some bit.ly url
@@ -703,7 +550,7 @@ def scrape(urls):
import psycopg2

conn = None
conn = psycopg2.connect("dbname="+credentials["postgres"]["db"]+" user="+credentials["postgres"]["user"]+" password="+credentials["postgres"]["password"])
conn = psycopg2.connect("dbname="+credentials["postgres"]["db"]+" user="+credentials["postgres"]["user"]+" password="+credentials["postgres"]["password"]+" host=localhost port=5432")
cur = conn.cursor()

def DBinsert(url, contents, date, source, domain, count):
@@ -778,7 +625,7 @@ def query(qtype, terms = None, project = None):
# Connect to db
import psycopg2
conn = None
conn = psycopg2.connect("dbname="+credentials["postgres"]["db"]+" user="+credentials["postgres"]["user"]+" password="+credentials["postgres"]["password"])
conn = psycopg2.connect("dbname="+credentials["postgres"]["db"]+" user="+credentials["postgres"]["user"]+" password="+credentials["postgres"]["password"]+" host=localhost port=5432")
cur = conn.cursor()

if qtype == "copy":
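After this commit, src/observatory.py keeps its Google search, Wayback, scraping, and Postgres pieces while dropping the two Twitter functions: the credentials dict loses its "twitter" key, the stale "hidden requirement" note comes off the wayback import (the package is declared in pyproject.toml), and the psycopg2 connection string now spells out host and port. A hedged sketch of what calling code might look like against the new layout; the placeholder credential values, the local database at localhost:5432, and the example domain are assumptions, not part of the commit:

import observatory
import psycopg2
from wayback import WaybackClient

# Fill in the trimmed-down credentials dict (placeholder values, assumptions only).
observatory.credentials = {
    "google": {"devkey": "YOUR-DEV-KEY", "cx": "YOUR-CX-ID"},
    "postgres": {"user": "observer", "db": "observatory", "password": "secret"},
}

# wayback is a declared dependency, so this import resolves in a clean install.
client = WaybackClient()
snapshots = client.search("www.epa.gov")  # returns a lazy iterator of CDX records

# The commit's connection string adds host and port explicitly; mirror that here.
conn = psycopg2.connect(
    "dbname=" + observatory.credentials["postgres"]["db"]
    + " user=" + observatory.credentials["postgres"]["user"]
    + " password=" + observatory.credentials["postgres"]["password"]
    + " host=localhost port=5432"
)
conn.close()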
