Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

time_variation: Code optimization (refer issue #51) #62

Open
wants to merge 13 commits into
base: master
Choose a base branch
from

Conversation

Tanvi-Jain01
Copy link

@Tanvi-Jain01 Tanvi-Jain01 commented Jul 10, 2023

@nipunbatra , @patel-zeel
This PR resolves issue #51.

BEFORE:

CODE:

def timeVariation(df, pollutant):
    """Plot how pollutant levels vary by hour, weekday and month.

    Two matplotlib figures are drawn:

    - Figure 1: seven panels, one per day of the week, each showing the
      mean pollutant level by hour of day for that weekday.
    - Figure 2: three panels showing the mean pollutant level by hour of
      day, by month of the year and by day of the week across all data.

    Every line is accompanied by a shaded band of +/- 0.5 standard
    deviations around the mean.

    Parameters
    ----------
    df: pandas.DataFrame
      Data frame of hourly data. Must include a ``date`` column and one
      column per pollutant to plot. The frame is not modified.
    pollutant: str or list of str
      Name(s) of the column(s) to plot.

    Raises
    ------
    KeyError
      If ``df`` has no ``date`` column or a requested pollutant column
      is missing.
    """
    # Accept a single column name as well as a list of names; drop
    # duplicates (keeping order) so each pollutant is drawn once.
    if isinstance(pollutant, str):
        names = [pollutant]
    else:
        names = list(dict.fromkeys(pollutant))

    # Validate before loading the plotting backend so bad input fails fast
    # with a clear message instead of part-way through drawing.
    if "date" not in df.columns:
        raise KeyError("df must include a 'date' column")
    missing = [name for name in names if name not in df.columns]
    if missing:
        raise KeyError(f"pollutant column(s) not found in df: {missing}")

    import matplotlib.pyplot as plt
    import pandas as pd

    # Work on derived objects only: the caller's frame gets no new columns
    # and its ``date`` column is left as it was passed in.
    dates = pd.to_datetime(df["date"])
    values = df[names]  # aggregate only the requested columns
    hours = dates.dt.hour.to_numpy()
    weekdays = dates.dt.weekday.to_numpy()  # Monday == 0 ... Sunday == 6
    months = dates.dt.month.to_numpy()

    def draw_panel(panel_values, keys, xlabel):
        """Draw mean lines with a +/- 0.5 std band on the current axes."""
        plt.grid()
        plt.xlabel(xlabel)
        if panel_values.empty:
            # No rows (e.g. a weekday absent from the data) or no
            # pollutants requested: leave the labelled panel blank.
            return
        grouped = panel_values.groupby(keys)
        mean = grouped.mean()
        std = grouped.std()
        for name in names:
            plt.plot(mean.index, mean[name], label=name)
            plt.fill_between(
                mean.index,
                mean[name] - 0.5 * std[name],
                mean[name] + 0.5 * std[name],
                alpha=0.2,
            )
        plt.legend()

    day_names = [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]

    # Figure 1: hourly profile for each day of the week.
    plt.figure(1, figsize=(40, 5))
    for position, day_name in enumerate(day_names):
        plt.subplot(1, 7, position + 1)
        on_day = weekdays == position
        draw_panel(values[on_day], hours[on_day], day_name)

    # Figure 2: profiles by hour, month and weekday across all data.
    plt.figure(2, figsize=(35, 5))
    overall_panels = [(hours, "Hour"), (months, "Month"), (weekdays, "WeekDay")]
    for position, (keys, xlabel) in enumerate(overall_panels, start=1):
        plt.subplot(1, 3, position)
        draw_panel(values, keys, xlabel)


# =============================================================================
# df = pd.read_csv("mydata.csv")
# timeVariation(df,['pm10'])
# =============================================================================

AFTER:

CODE:

import datetime as dt
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
from numpy import array

def _group_mean_std(values, keys):
    """Return the per-group mean and standard deviation of ``values``.

    ``keys`` holds one group label per row of ``values``.
    """
    grouped = values.groupby(keys)
    return grouped.mean(), grouped.std()


def _draw_mean_band(values, keys, pollutants, xlabel):
    """Draw each pollutant's group mean with a +/- 0.5 std band on the current axes."""
    plt.grid()
    plt.xlabel(xlabel)
    if values.empty:
        # No rows (e.g. a weekday absent from the data) or no pollutants
        # requested: leave the labelled panel blank.
        return
    mean, std = _group_mean_std(values, keys)
    for name in pollutants:
        plt.plot(mean.index, mean[name], label=name)
        plt.fill_between(
            mean.index,
            mean[name] - 0.5 * std[name],
            mean[name] + 0.5 * std[name],
            alpha=0.2,
        )
    plt.legend()


def time_variation(df: pd.DataFrame, pollutant: list = None):
    """Plot how pollutant levels vary by hour, weekday and month.

    Two matplotlib figures are drawn, saved and then shown:

    - Figure 1 (``TimeVariationPlots1.png``): seven panels, one per day
      of the week, each showing the mean pollutant level by hour of day
      for that weekday.
    - Figure 2 (``TimeVariationPlots2.png``): three panels showing the
      mean pollutant level by hour of day, by month of the year and by
      day of the week across all data.

    Every line is accompanied by a shaded band of +/- 0.5 standard
    deviations around the mean.

    Parameters
    ----------
    df: pandas.DataFrame
      Data frame of hourly data. Must include a ``date`` column and one
      column per pollutant to plot. The frame is not modified.
    pollutant: list
      Names of the columns to plot. A single name given as a string is
      also accepted. Defaults to ``['pm25']`` when omitted.

    Raises
    ------
    KeyError
      If ``df`` has no ``date`` column or a requested pollutant column
      is missing.
    """
    # ``None`` stands in for the default so no list object is shared
    # between calls.
    if pollutant is None:
        pollutant = ["pm25"]
    # Accept a single column name as well as a list of names; drop
    # duplicates (keeping order) so each pollutant is drawn once.
    if isinstance(pollutant, str):
        names = [pollutant]
    else:
        names = list(dict.fromkeys(pollutant))

    # Validate up front so bad input fails with a clear message instead
    # of part-way through drawing.
    if "date" not in df.columns:
        raise KeyError("df must include a 'date' column")
    missing = [name for name in names if name not in df.columns]
    if missing:
        raise KeyError(f"pollutant column(s) not found in df: {missing}")

    # Work on derived objects only: the caller's frame gets no new columns
    # and its ``date`` column is left as it was passed in.
    dates = pd.to_datetime(df["date"])
    values = df[names]  # aggregate only the requested columns
    hours = dates.dt.hour.to_numpy()
    weekdays = dates.dt.weekday.to_numpy()  # Monday == 0 ... Sunday == 6
    months = dates.dt.month.to_numpy()

    day_names = [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]

    # Figure 1: hourly profile for each day of the week.
    plt.figure(1, figsize=(40, 5))
    for position, day_name in enumerate(day_names):
        plt.subplot(1, 7, position + 1)
        on_day = weekdays == position
        _draw_mean_band(values[on_day], hours[on_day], names, day_name)
    plt.savefig("TimeVariationPlots1.png", bbox_inches="tight")

    # Figure 2: profiles by hour, month and weekday across all data.
    plt.figure(2, figsize=(35, 5))
    overall_panels = [(hours, "Hour"), (months, "Month"), (weekdays, "WeekDay")]
    for position, (keys, xlabel) in enumerate(overall_panels, start=1):
        plt.subplot(1, 3, position)
        _draw_mean_band(values, keys, names, xlabel)
    plt.savefig("TimeVariationPlots2.png", bbox_inches="tight")
    print("Your plots has also been saved")
    plt.show()
    

USAGE:

# df = pd.read_csv("mydata.csv")
# time_variation(df, pollutant=['pm25','nh3'])

OUTPUT:
TimeVariationPlots1
TimeVariationPlots2

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant