# get_dates.py

import sys
import csv
import matplotlib.pyplot as plt
import pandas as pd
import re
from scipy.signal import argrelextrema
import numpy as np
from datetime import date


# Check whether a string is exactly one date in 20XX-XX-XX format.
#   lnput: the string to test.
#   -- Returns True only when lnput is 10 characters long and matches the date pattern.
def is_date(lnput):
    # Anything that is not exactly 10 characters cannot be a bare date.
    if len(lnput) != 10:
        return False
    found = re.match(r"(?<![\w-])20\d\d-\d\d-\d\d(?![\w-])", lnput)
    return found is not None

# Number of days between January 1st, 2014 and the given date, expressed as an int
# (0 for 2014-01-01 itself, negative for earlier dates).
#   date_string: the string containing the date in 20XX-XX-XX format. Only the first 10 characters
#       are read, so a trailing newline is harmless.
def days_since(date_string: str):
    target = date(
        int(date_string[0:4]),
        int(date_string[5:7]),
        int(date_string[8:10]),
    )
    origin = date(2014, 1, 1)
    # Proleptic Gregorian ordinals differ by exactly the number of days between the two dates.
    return target.toordinal() - origin.toordinal()

# Cull twitter data for dates and optionally save them to a new file. Used to efficiently iterate through frequency.
#   twitter_file: the string describing the filepath to the twitter data
#   savename: the filepath to write the extracted dates to, one date per line. Default ('') does not save.
#   -- Returns the extracted dates as one string, each date followed by a newline, whether or not a file is saved.
def extract_dates(twitter_file, savename = ''):
    with open(twitter_file) as make_dates:
        # Every 20XX-XX-XX occurrence anywhere in the file counts as one date, in file order.
        match = re.findall(r"20\d\d-\d\d-\d\d", make_dates.read())
    # Build the result unconditionally so the default (no savename) call also returns the dates.
    dates = "".join(row + "\n" for row in match)
    if savename != '':
        with open(savename, "w") as write_dates:
            write_dates.write(dates)
    return dates

# Using the saved filepath of extract_dates(), create a dataframe that stores the # of tweets per day since January 1st, 2014.
#   dates_file: the string containing the path to the extracted dates, one 20XX-XX-XX date per line.
#       Blank lines are ignored.
#   savename: the filepath to save the dataframe to, in .json format. Default ('') does not save.
#   -- Returns the pandas dataframe with columns 'days_since' and 'frequency', sorted by 'days_since'.
#       Every day from 0 up to the latest day found has a row; days without tweets have frequency 0.
#       The index is not reset after sorting, so index labels do not follow row order.
def get_table(dates_file, savename = ''):
    dates_lib = dict()
    with open(dates_file) as make_dates:
        for row in make_dates:
            row = row.strip()
            if not row:
                # A blank line would otherwise crash the int() conversion in days_since().
                continue
            day = days_since(row)
            dates_lib[day] = dates_lib.get(day, 0) + 1

    # Fill the gaps up to the latest day seen. Using the maximum (not the last row read) keeps
    # this correct for unsorted input, and the guard keeps an empty file from failing.
    if dates_lib:
        for x in range(max(dates_lib)):
            dates_lib.setdefault(x, 0)

    # Naming the columns at construction also works when there are no rows at all.
    data = pd.DataFrame(list(dates_lib.items()), columns = ["days_since", "frequency"])
    data = data.sort_values(by = ['days_since'])
    if savename != '':
        data.to_json(savename)
    return data


# Using the output of get_table(), find the dates around which the data should be split to conduct a time series analysis.
#   data: a pandas dataframe with columns 'days_since' and 'frequency', generated by get_table() above.
#       Rows are used in the order given (get_table() returns them sorted by 'days_since'); index labels are ignored.
#   mode: method by which to bin data:
#       - "localmax" returns bins created according to relative maxima and can only be augmented with kwarg order.
#       - "time" interprets kwarg bintervals as the number of days in between bins.
#       - "tweets" returns bin quantities relative to number of tweets. If use_bin_numbers is True, bintervals is interpreted as # of desired bins. ...
#           ... Else, bintervals is interpreted as # of tweets.
#   use_bin_numbers: used for tweets as described above.
#   bintervals: positive integer value representing a few possible values:
#       - in "time", the number of days in between bin partitions.
#       - in "tweets" with use_bin_numbers = True, the number of bins to make.
#       - in "tweets" with use_bin_numbers = False, the number of tweets.
#   order: for "localmax", the number of neighbouring rows on each side that a row must be greater than or equal to.
#       Increasing order therefore reports fewer, more prominent maxima.
#   -- Returns a list of 'days_since' values to use as the end of each bin.
#   -- Raises ValueError for an unknown mode, or for a non-positive bintervals in "time" / "tweets" mode.
def make_bins(data, mode, use_bin_numbers = False, bintervals = 0, order = 80):
    # NOTE(review): 159 is hard-coded in both "localmax" and "time"; presumably the first day of
    # interest (counted from 2014-01-01) -- confirm before changing.
    first_day = 159

    if mode not in ("localmax", "time", "tweets"):
        raise ValueError("Please use 'time', 'tweets', or 'localmax'")
    if mode == "time" and bintervals <= 0:
        # A non-positive step would never reach the last day and loop forever.
        raise ValueError("Specify a number of days please!")
    if mode == "tweets" and bintervals <= 0:
        if use_bin_numbers:
            raise ValueError("Specify a number of bins please!")
        raise ValueError("Specify a number of tweets please!")

    if len(data) == 0:
        return []

    days = data['days_since']
    counts = data['frequency']

    if mode == "localmax":
        # argrelextrema reports row positions, so look the days up positionally (iloc), not by label.
        peak_rows = argrelextrema(counts.values, np.greater_equal, order=order)[0].tolist()
        peak_days = days.iloc[peak_rows].tolist()
        print(peak_days)
        return [day for day in peak_days if day > first_day]

    if mode == "time":
        final_list = []
        last_day = days.iloc[-1]
        bin_day = first_day
        while bin_day < last_day:
            final_list.append(bin_day)
            bin_day += bintervals
        return final_list

    # mode == "tweets"
    if use_bin_numbers:
        # Turn "number of bins" into "tweets per bin" and reuse the tweet-count logic.
        tweet_cap = int(counts.sum() / bintervals)
        if tweet_cap <= 0:
            raise ValueError("Cannot make more bins than there are tweets!")
        return make_bins(data, mode, bintervals = tweet_cap)

    bin_rows = []
    exhaust = 0
    for position, count in enumerate(counts.tolist()):
        if exhaust + count > bintervals:
            # This row would overflow the current bin: close the bin here and start counting afresh.
            bin_rows.append(position)
            exhaust = 0
        else:
            exhaust += count
    # Report the days of the closing rows rather than their row positions.
    return days.iloc[bin_rows].tolist()

# Makes a plot that shows the frequencies from get_table() and saves it to disk.
#   data: output of get_table(); its 'days_since' column is the x axis and 'frequency' the y axis.
#   bins: output of make_bins(). Each value is drawn as a red vertical line. If None, no partitions shown.
#   savename: filename to save as. Default "plot.jpg"; when bins are given and savename is left at
#       its default, the plot is saved as "plot_binned.jpg" instead.
#   Draws through the pyplot interface and does not create or clear a figure itself.
def make_plot(data, bins = None, savename = 'plot.jpg'):
    plt.plot(data['days_since'], data['frequency'])
    # Identity check instead of "!= None": the latter is evaluated element-wise for numpy arrays.
    has_bins = bins is not None
    if has_bins:
        for bin_end in bins:
            plt.axvline(x = bin_end, color = 'r', label = bin_end)
    if has_bins and savename == 'plot.jpg':
        savename = "plot_binned.jpg"
    plt.savefig(savename)


# Script entry: the first command-line argument is the path to the twitter data file.
# The extracted dates are written next to it, with the last four characters of the path
# (e.g. ".csv") replaced by "_dates.csv", and the local-maximum bins are printed.
file = sys.argv[1]
dates_path = file[:-4] + "_dates.csv"
extract_dates(file, savename = dates_path)
data = get_table(dates_path)
bin_ends = make_bins(data, 'localmax', order = 150)
print(bin_ends)