-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_dates.py
144 lines (122 loc) · 5.91 KB
/
get_dates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import sys
import csv
import matplotlib.pyplot as plt
import pandas as pd
import re
from scipy.signal import argrelextrema
import numpy as np
from datetime import date
# Test that the string lnput is, in its entirety, a date in 20XX-XX-XX format.
# lnput: the string to test.
# -- Returns True when lnput is exactly "20", two digits, a dash, two digits, a dash and two digits; False otherwise.
# Only the shape is checked, not the calendar: "2014-99-99" still passes.
def is_date(lnput):
    # fullmatch anchors both ends of the string, so no separate length check or lookarounds are needed.
    return re.fullmatch(r"20\d\d-\d\d-\d\d", lnput) is not None
# Turn one of our 20XX-XX-XX date strings into an int: the number of days elapsed since January 1st, 2014.
# date_string: the string holding the date. Characters 0-3 are read as the year, 5-6 as the month and 8-9 as ...
# ... the day; anything after the tenth character (a trailing newline, for instance) is ignored.
# -- Returns the day count as an int (negative for dates before 2014).
def days_since(date_string: str):
    epoch = date(2014, 1, 1)
    # Fixed-width slices, so the two separator characters are never inspected.
    year, month, day = (int(date_string[start:stop]) for start, stop in ((0, 4), (5, 7), (8, 10)))
    return (date(year, month, day) - epoch).days
# Cull twitter data for dates and optionally save them to a new file. Used to efficiently iterate through frequency.
# twitter_file: the string describing the filepath to the twitter data.
# savename: the string naming the file to write the extracted dates to, as plain text with one date per line. ...
# ... Default ('') does not save.
# -- Returns the extracted dates in a string, one 20XX-XX-XX date per line, in the order they occur in the file.
def extract_dates(twitter_file, savename = ''):
    with open(twitter_file) as make_dates:
        match = re.findall(r"20\d\d-\d\d-\d\d", make_dates.read())
    # Build the result up front so it is returned whether or not a file is written.
    dates = "".join(row + "\n" for row in match)
    if savename != '':
        with open(savename, "w") as write_dates:
            write_dates.write(dates)
    return dates
# Using the saved filepath of extract_dates(), create a dataframe that stores the # of tweets per day since January 1st, 2014.
# dates_file: the string containing the path to the extracted dates, one 20XX-XX-XX date per line. Blank lines ...
# ... are skipped; any other malformed line makes days_since() raise.
# savename: the string to name the saved dataframe, in .json format. Default ('') does not save.
# -- Returns the pandas dataframe with columns 'days_since' and 'frequency', sorted by 'days_since' and indexed ...
# ... 0..n-1 in that order. Every day from 0 up to the latest date seen has a row (frequency 0 when no tweets). ...
# ... An empty dates file gives an empty dataframe with the same two columns.
def get_table(dates_file, savename = ''):
    dates_lib = {}
    with open(dates_file) as make_dates:
        for row in make_dates:
            row = row.strip()
            if not row:
                continue
            day = days_since(row)
            dates_lib[day] = dates_lib.get(day, 0) + 1
    # Zero-fill the gaps once, up to the latest day, instead of re-walking range(day) for every row.
    if dates_lib:
        for day in range(max(dates_lib)):
            dates_lib.setdefault(day, 0)
    # Sorting before building the frame keeps index labels equal to row positions.
    data = pd.DataFrame(sorted(dates_lib.items()), columns = ["days_since", "frequency"])
    if savename != '':
        data.to_json(savename)
    return data
# Helper for make_bins(): walk the rows in order and close a bin on each day whose tweets push the running count past tweets_per_bin.
def _bins_by_tweet_count(data, tweets_per_bin):
    bin_ends = []
    running = 0
    for day, count in zip(data['days_since'].tolist(), data['frequency'].tolist()):
        if running + count > tweets_per_bin:
            # The day that overflows ends the bin; its tweets are not carried into the next one.
            bin_ends.append(day)
            running = 0
        else:
            running += count
    return bin_ends


# Using the output of get_table(), find the dates around which the data should be split to conduct a time series analysis.
# data: a pandas dataframe with columns 'days_since' and 'frequency', generated by get_table() above.
# mode: method by which to bin data:
# - "localmax" returns bins at relative maxima of 'frequency' and can only be augmented with kwargs order and start_day.
# - "time" interprets kwarg bintervals as the number of days in between bins.
# - "tweets" returns bin quantities relative to number of tweets. If use_bin_numbers is True, bintervals is interpreted as # of desired bins. ...
# ... Else, bintervals is interpreted as # of tweets.
# use_bin_numbers: used for tweets as described above.
# bintervals: integer value representing a few possible values:
# - in "time", the number of days in between bin partitions. Must be positive.
# - in "tweets" with use_bin_numbers = True, the number of bins to aim for; the per-bin tweet count becomes ...
# ... total tweets / bintervals, rounded down.
# - in "tweets" with use_bin_numbers = False, the number of tweets a bin may hold before it is closed.
# order: for "localmax", how many neighbouring rows on each side a row's frequency must match or exceed to count as ...
# ... a peak. Larger values report fewer peaks.
# start_day: for "localmax", only peaks later than this day are kept; for "time", the first partition. Default 159, ...
# ... i.e. 2014-06-09. NOTE(review): why day 159 is the cut-off is not documented in this file - confirm.
# -- Returns a list of dates (as 'days_since' values) to use as the end of each bin. Empty data gives an empty list ...
# ... in "localmax" and "time".
# -- Raises Exception for an unknown mode or, in "tweets", when bintervals is 0; ValueError when "time" gets a ...
# ... non-positive bintervals or when more bins are requested than there are tweets.
def make_bins(data, mode, use_bin_numbers = False, bintervals = 0, order = 80, start_day = 159):
    if mode == "localmax":
        if len(data) == 0:
            return []
        # argrelextrema reports row positions, so look the days up by position (iloc), not by index label.
        peaks = argrelextrema(data['frequency'].values, np.greater_equal, order=order)[0]
        peak_days = data['days_since'].iloc[peaks].tolist()
        return [day for day in peak_days if day > start_day]
    if mode == "time":
        if bintervals <= 0:
            # A non-positive step would never reach the last day.
            raise ValueError("Specify a positive number of days please!")
        if len(data) == 0:
            return []
        final_list = []
        last_day = data['days_since'].iloc[-1]
        x = start_day
        while x < last_day:
            final_list.append(x)
            x += bintervals
        return final_list
    if mode == "tweets":
        if use_bin_numbers:
            if bintervals == 0: raise Exception("Specify a number of bins please!")
            tweet_cap = int(data['frequency'].sum() / bintervals)
            if tweet_cap < 1:
                raise ValueError("More bins requested than there are tweets!")
        else:
            if bintervals == 0: raise Exception("Specify a number of tweets please!")
            tweet_cap = bintervals
        return _bins_by_tweet_count(data, tweet_cap)
    raise Exception("Please use 'time', 'tweets', or 'localmax'")
# Makes a plot that shows the frequencies from get_table() and saves it to disk.
# data: output of get_table(); 'days_since' goes on the x axis and 'frequency' on the y axis.
# bins: output of make_bins(). Each entry is drawn as a red vertical line. If None, no partitions shown.
# savename: filename to save as. Default "plot.jpg", or "plot_binned.jpg" when bins are given and the default name ...
# ... was left untouched.
def make_plot(data, bins = None, savename = 'plot.jpg'):
    # Draw on a figure of our own so repeated calls do not pile up on pyplot's implicit current figure.
    figure, axes = plt.subplots()
    try:
        axes.plot(data['days_since'], data['frequency'])
        # "is not None" rather than "!= None": the latter compares element-wise on array-like bins.
        if bins is not None:
            for edge in bins:
                axes.axvline(x = edge, color = 'r', label = edge)
        if bins is not None and savename == 'plot.jpg':
            figure.savefig("plot_binned.jpg")
        else:
            figure.savefig(savename)
    finally:
        # Release the figure even if plotting or saving failed.
        plt.close(figure)
# Command-line entry point: extract the dates from a twitter data file, tabulate them and print the "localmax" bins.
# argv: list of command-line arguments without the program name. Default (None) uses sys.argv[1:]. Only the first ...
# ... argument, the path to the twitter data, is used.
# The extracted dates are written next to the input as <input name without its extension>_dates.csv.
def main(argv = None):
    from pathlib import Path

    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit("usage: get_dates.py <twitter_file>")
    file = args[0]
    source = Path(file)
    # Strip the real extension, whatever its length, instead of assuming the last four characters.
    dates_path = str(source.with_name(source.stem + "_dates.csv"))
    extract_dates(file, savename = dates_path)
    data = get_table(dates_path)
    print(make_bins(data, 'localmax', order = 150))


# Only run when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()