
Commit

Merge pull request #1 from ericnost/remove_twitter
Remove twitter
ericnost authored Dec 15, 2023
2 parents 7df8a78 + 1c2e916 commit be7b144
Showing 8 changed files with 6 additions and 159 deletions.
Binary file removed dist/observatory-0.0.2-py3-none-any.whl
Binary file removed dist/observatory-0.0.2.tar.gz
Binary file removed dist/observatory-1.1.0-py3-none-any.whl
Binary file removed dist/observatory-1.1.0.tar.gz
Binary file added dist/observatory-1.2.0-py3-none-any.whl
Binary file added dist/observatory-1.2.0.tar.gz
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"

[project]
name = "observatory"
version = "1.1.0"
version = "1.2.0"
authors = [
{ name="Eric Nost", email="[email protected]" },
]
@@ -24,7 +24,7 @@ dependencies = [
"Requests==2.31.0",
"Scrapy==2.6.1",
"tldextract==3.2.0",
"wayback==0.3.2"
"wayback>=0.3.2"
]

[project.urls]
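The only substantive changes to pyproject.toml are the version bump to 1.2.0 and the loosened wayback pin (== to >=), so any wayback release at or above 0.3.2 now satisfies the dependency. A minimal sketch, not part of this commit, of how a consumer environment could confirm it meets that floor using only the standard library; the helper name and the hard-coded minimum are illustrative assumptions:

from importlib.metadata import PackageNotFoundError, version

def wayback_meets_floor(minimum=(0, 3, 2)):
    """Return True if the installed wayback package is at least `minimum` (assumed floor from pyproject.toml)."""
    try:
        installed = version("wayback")  # e.g. "0.4.5"
    except PackageNotFoundError:
        return False  # wayback is not installed at all
    parts = []
    for piece in installed.split("."):
        if piece.isdigit():
            parts.append(int(piece))
        else:
            break  # stop at pre-release or dev suffixes such as "2b1"
    return tuple(parts) >= minimum

print(wayback_meets_floor())  # expected True in an environment built from this commit's wheel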
161 changes: 4 additions & 157 deletions src/observatory.py
@@ -8,7 +8,7 @@
project = "default"

# Set passwords
credentials = {"twitter":"", "google":{"devkey": "", "cx": ""}, "postgres":{"user":"", "db":"", "password":""}}
credentials = {"google":{"devkey": "", "cx": ""}, "postgres":{"user":"", "db":"", "password":""}}

def start_project(project):
# Create project directory
@@ -26,159 +26,6 @@ def start_project(project):
# Global imports
import pandas

# Twitter search
# Based on https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Full-Archive-Search/full-archive-search.py
def search_twitter(q = None, project = None):
    # Import helper code
    import requests
    import os
    import pandas
    import time

    # Activates permissions
    bearer_token = credentials["twitter"]
    headers = {"Authorization": "Bearer {}".format(bearer_token)}

    search_url = "https://api.twitter.com/2/tweets/search/all"

    # Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
    # expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
    # https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
    # https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all

    query_params = {'query': 'from:ericnost has:links', #default to my tweets!
                    'tweet.fields': 'created_at,entities,public_metrics',
                    'max_results': 100,
                    'start_time': '2006-03-21T00:00:00.00Z', #'2022-01-01T00:00:00.00Z', #2006-03-21T00:00Z = beginning of Twitter time
                    'end_time': '2022-01-01T00:00:00.00Z' # Default to end at the end of 2021
                    }

    # Global params
    results = [] # Full results

    # update query params
    if q != None: query_params["query"] = q
    # Export path
    path = project+"/twitter_search_results_"+query_params["query"]+".csv"

    def connect_to_endpoint(url, headers, params):
        response = requests.request("GET", search_url, headers=headers, params=params)
        #print(response.status_code,response.json()) # Debugging
        if response.status_code != 200:
            raise Exception(response.status_code, response.text)
        return response.json()

    def export(path):
        """
        export what we have so we don't lose it
        """
        dump = pandas.DataFrame(results)
        dump.to_csv(path)
        #print("exported") # Debugging

    def analysis(json_response):
        # pull out created_at, urls - expanded_url, a sum of public metrics, and, if relevant, next_token
        for result in json_response["data"]:
            #print(result) # Debugging
            #for each link in each tweet:
            if "entities" in result and "urls" in result["entities"]: # means we only save tweets with links. unnecessary with -has:links ?
                for url in result["entities"]["urls"]:
                    tweeted_link = {"date": None, "link": None, "tweet_id": None, "metrics": None, "source": "Twitter", "query": query_params["query"]}
                    # pull out created at
                    tweeted_link["date"] = result["created_at"]
                    tweeted_link["link"] = url["expanded_url"]
                    # sum public metrics
                    tweeted_link["metrics"] = sum(result["public_metrics"].values())
                    tweeted_link["tweet_id"] = result["id"]
                    results.append(tweeted_link)
            else:
                pass

    def query():
        """
        limit = 1 request (100 tweets) / 1 second
        also, 300 requests (30,000 tweets) per 15-minute window
        so, really limit = 1 request / 3 second
        slow down!
        """
        time.sleep(5)

        # Return the data
        json_response = connect_to_endpoint(search_url, headers, query_params)
        #print(json.dumps(json_response, sort_keys=True)) # Debugging

        return json_response

    def get_tweets():
        #if we have a next page token from a previous query, get rid of it
        if "next_token" in query_params.keys():
            del query_params["next_token"]
        print(query_params)

        results.clear() # clear results from previous searches

        next = True
        count = 1

        while next:
            try:
                #Get the data
                json_response = query()

                #Parse the data
                if json_response["meta"]["result_count"] > 0:
                    analysis(json_response)

                # export the current state of results every 2000 results...
                if count % 20 == 0:
                    export(path)

                # update query params to next
                if "next_token" in json_response["meta"]:
                    next_token = json_response["meta"]["next_token"]
                    query_params["next_token"] = next_token
                    count += 1
                    print(count, end='\r')
                else:
                    next = False
                    export(path)
                    print("Done!")

            except: # if it breaks, export what we have
                export(path)

    get_tweets()
    return results

"""### Process Twitter search(es)"""

def twitter_process(filespath = None, project = None):
    # Cobble together output from Twitter query CSVs
    import glob
    import csv
    import pandas

    if filespath == None:
        filespath = project+"/*twitter*.csv" # Default path # Colab /content/*.csv

    filenames = glob.glob(filespath)

    if len(filenames) > 1: # if we have multiple files
        combined_csv = pandas.concat( [ pandas.read_csv(f, header=0, encoding='utf-8') for f in filenames ], join="inner" )
    else: # if we have just one
        combined_csv = pandas.read_csv(filenames[0], header=0, encoding='utf-8')
    #combined_csv.columns = ['original_index', 'date', 'link', 'tweet_id', 'metrics', 'source', 'query', 'new_index'] # Rename columns (?)

    twitter = combined_csv
    twitter = twitter[~twitter["link"].astype(str).str.contains("twitter.com")] # Remove twitter.com links. We don't want links to other tweets.
    twitter = twitter.sort_values(by=['date']) # Sorts oldest to top
    twitter = twitter.drop_duplicates(subset='link', keep="first") # Keep the first link, which should be the oldest - the most conservative approach in terms of timeline
    twitter.reset_index(inplace=True, drop=True)
    twitter = twitter[['date','link','metrics','source','query']] # Drop unnecessary columns
    twitter.to_csv(project+"/twitter_search_results_compiled.csv") # Export compiled twitter queries

    return twitter


"""## Google search (Python) v.1
@@ -506,7 +353,7 @@ class NoVersions(Error):
import requests
import fnmatch
from urllib.parse import urljoin, urlparse
from wayback import WaybackClient #This is a hidden requirement - never gets installed
from wayback import WaybackClient

# Set up
pages["full_url"] = None # The actual url and not some bit.ly url
@@ -703,7 +550,7 @@ def scrape(urls):
import psycopg2

conn = None
conn = psycopg2.connect("dbname="+credentials["postgres"]["db"]+" user="+credentials["postgres"]["user"]+" password="+credentials["postgres"]["password"])
conn = psycopg2.connect("dbname="+credentials["postgres"]["db"]+" user="+credentials["postgres"]["user"]+" password="+credentials["postgres"]["password"]+" host=localhost port=5432")
cur = conn.cursor()

def DBinsert(url, contents, date, source, domain, count):
@@ -778,7 +625,7 @@ def query(qtype, terms = None, project = None):
# Connect to db
import psycopg2
conn = None
conn = psycopg2.connect("dbname="+credentials["postgres"]["db"]+" user="+credentials["postgres"]["user"]+" password="+credentials["postgres"]["password"])
conn = psycopg2.connect("dbname="+credentials["postgres"]["db"]+" user="+credentials["postgres"]["user"]+" password="+credentials["postgres"]["password"]+" host=localhost port=5432")
cur = conn.cursor()

if qtype == "copy":
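After this commit, src/observatory.py keeps its Google search, Wayback, scraping, and Postgres pieces while dropping the two Twitter functions: the credentials dict loses its "twitter" key, the stale "hidden requirement" note comes off the wayback import (the package is declared in pyproject.toml), and the psycopg2 connection string now spells out host and port. A hedged sketch of what calling code might look like against the new layout; the placeholder credential values, the local database at localhost:5432, and the example domain are assumptions, not part of the commit:

import observatory
import psycopg2
from wayback import WaybackClient

# Fill in the trimmed-down credentials dict (placeholder values, assumptions only).
observatory.credentials = {
    "google": {"devkey": "YOUR-DEV-KEY", "cx": "YOUR-CX-ID"},
    "postgres": {"user": "observer", "db": "observatory", "password": "secret"},
}

# wayback is a declared dependency, so this import resolves in a clean install.
client = WaybackClient()
snapshots = client.search("www.epa.gov")  # returns a lazy iterator of CDX records

# The commit's connection string adds host and port explicitly; mirror that here.
conn = psycopg2.connect(
    "dbname=" + observatory.credentials["postgres"]["db"]
    + " user=" + observatory.credentials["postgres"]["user"]
    + " password=" + observatory.credentials["postgres"]["password"]
    + " host=localhost port=5432"
)
conn.close()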
