Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

time_variation: Code optimization (refer issue #51) #62

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
86 changes: 29 additions & 57 deletions vayu/googleMaps.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
def googleMaps(df, lat, long, pollutant, dataLoc):
import folium
import webbrowser
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def google_maps(df: pd.DataFrame, lat: str, long: str, pollutant: str, date: str, markersize: int, zoom: int):
    """Plot the readings of one pollutant on one date as a folium map.

    Every row of ``df`` whose ``'date'`` equals ``date`` becomes a red circle
    marker whose radius is proportional to the pollutant value. The map is
    centred on the first selected row, written to ``googleMaps.html`` in the
    current working directory, and returned.

    Parameters
    ----------
    df: data frame
        Data to plot. Must contain a ``'date'`` column, a ``'station'``
        column (used for the marker popups) and the columns named by
        ``lat``, ``long`` and ``pollutant``.
    lat: str
        Name of column in df of where latitude points are
    long: str
        Name of column in df of where longitude points are
    pollutant: str
        Name of the column in df where the values of the pollutant are stored.
    date: str
        Date to visualise; compared for equality against ``df['date']``.
    markersize: int
        The int by which the value of pollutant will be multiplied to get
        the marker radius.
    zoom: int
        Initial zoom level of the map.

    Returns
    -------
    folium.Map
        The generated map (also saved as ``googleMaps.html``).

    Raises
    ------
    ValueError
        If no row of ``df`` matches ``date``.
    """
    selected = df[df['date'] == date]
    if selected.empty:
        raise ValueError(f"no rows in df with date == {date!r}")

    # Centre the map on the first reading of the selected date.
    centre = [selected[lat].values[0], selected[long].values[0]]
    my_map4 = folium.Map(location=centre, zoom_start=zoom)

    # Iterate over the date-filtered rows and the caller-supplied coordinate
    # columns; looping over the full frame would draw markers for every date.
    for point_lat, point_long, value, station in zip(
        selected[lat], selected[long], selected[pollutant], selected['station']
    ):
        if pd.isna(value):
            # A missing reading would yield a NaN radius, which cannot be drawn.
            continue
        folium.CircleMarker(
            [point_lat, point_long],
            radius=markersize * value,
            popup=(str(station).capitalize() + "<br>" + str(round(value, 3))),
            fill=True,
            fill_opacity=0.7,
            color='red',
        ).add_to(my_map4)

    my_map4.save("googleMaps.html")
    print('your map has been saved')
    return my_map4


#Example:
# df = pd.read_csv('interpolData.csv')
# googleMaps(df,'Latitude','Longitude','Ozone','Arithmetic Mean')
# Call the function and display the map in Jupyter Notebook
# map_obj = google_maps(df, 'latitude', 'longitude', 'pm25', '2022-02-23', 5,10)
# map_obj
46 changes: 13 additions & 33 deletions vayu/scatterPlot.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,48 +19,28 @@ def scatterPlot(df, x, y, **kwargs):
import matplotlib.cm as cm
from math import pi

pm10 = df.pm10
o3 = df.o3
ws = df.ws
wd = df.wd
nox = df.nox
no2 = df.no2


#########################################
# converts wind data to randians
df = pd.DataFrame({"speed": ws, "direction": wd})
df["speed_x"] = df["speed"] * np.sin(df["direction"] * pi / 180.0)
df["speed_y"] = df["speed"] * np.cos(df["direction"] * pi / 180.0)
#df1 = pd.DataFrame({"speed": ws, "direction": wd})
df["speed"+str(x)] = df['ws'] * np.sin(df['wd'] * pi / 180.0)
df["speed"+str(y)] = df['ws'] * np.cos(df['wd'] * pi / 180.0)
fig, ax = plt.subplots(figsize=(8, 8), dpi=80)
x0, x1 = ax.get_xlim()
y0, y1 = ax.get_ylim()
ax.set_aspect("equal")
_ = df.plot(kind="scatter", x="speed_x", y="speed_y", alpha=0.35, ax=ax)
#ax.set_aspect("equal")
_ = df.plot(kind="scatter", x="speed"+str(x), y="speed"+str(y), alpha=0.35, ax=ax)
plt.show()


####################################
# simple seaborn plot that shows how given variables relate with one another
if x == "nox":
x = nox
elif x == "no2":
x = no2
elif x == "o3":
x = o3
elif x == "pm10":
x = pm10
if y == "nox":
y = nox
elif y == "no2":
y = no2
elif y == "o3":
y = o3
elif y == "pm10":
y = pm10

sns.jointplot(x=x, y=y, kind="hex")

sns.jointplot(x=df[x].values, y=df[y].values, kind="hex")
plt.xlabel(x)
plt.ylabel(y)
plt.show()


# =============================================================================
# df = pd.read_csv("mydata.csv")
# scatterPlot(df,'nox','no2')
Expand Down
54 changes: 37 additions & 17 deletions vayu/selectByDate.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,48 @@
def selectByDate(df, year):
"""
Utility function to cut given dataframe by the year
and find the average value of each day
import pandas as pd
import numpy as np

def select_by(df: pd.DataFrame, year: str, group: list = None, time_period: str = 'day'):
    """
    Utility function to cut a given dataframe by year and find the average value
    of each day, month, or year. Optionally, data can be grouped by specified columns.

    The input data frame is not modified.

    Parameters
    ----------
    df: data frame
        A data frame containing a ``'date'`` field and optional grouping columns.
    year: type string
        A year to select and filter the data (anything ``int()`` accepts).
    group: list, optional
        A list of columns to group the data by. Default is None (no grouping).
    time_period: {'day', 'month', 'year'}, optional
        The time period to compute the average value. Default is 'day'.
        The one-letter abbreviations 'd', 'm' and 'y' are accepted too;
        matching is case-insensitive.

    Returns
    -------
    data frame
        A data frame with the average value of each day, month, or year
        (numeric columns only), indexed by the period-end timestamp.
        If group is specified, the data will be grouped accordingly and the
        index additionally carries the grouping columns.

    Raises
    ------
    ValueError
        If ``time_period`` is not one of the supported values.
    """
    # Offset objects are used instead of string aliases ('D', 'M', 'Y')
    # because the spelling of the month/year aliases differs between pandas
    # releases, whereas the offset classes are stable.
    rules = {
        'day': pd.offsets.Day(),
        'd': pd.offsets.Day(),
        'month': pd.offsets.MonthEnd(),
        'm': pd.offsets.MonthEnd(),
        'year': pd.offsets.YearEnd(),
        'y': pd.offsets.YearEnd(),
    }
    try:
        rule = rules[str(time_period).lower()]
    except KeyError:
        raise ValueError(
            f"time_period must be 'day', 'month' or 'year', got {time_period!r}"
        ) from None

    # assign() returns a new frame, so the caller's 'date' column is untouched.
    parsed = df.assign(date=pd.to_datetime(df['date']))
    df_year = parsed[parsed['date'].dt.year == int(year)]

    if group:
        return df_year.groupby(group).resample(rule, on='date').mean(numeric_only=True)

    return df_year.resample(rule, on='date').mean(numeric_only=True)


# =============================================================================
# df = pd.read_csv("mydata.csv")
# selectByDate(df,'2003')
#select_by(df1,'2022',group=['latitude','longitude','station'], time_period='month')
# =============================================================================
130 changes: 130 additions & 0 deletions vayu/summary_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
from numpy import array
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib.pyplot import figure

# Concentration limits per pollutant: (safe_limit, moderate_limit, inclusive).
# ``inclusive`` selects ``<=`` rather than ``<`` for both comparisons.
_POLLUTANT_LIMITS = {
    "pm10": (100, 250, False),
    "pm25": (100, 250, False),
    "sox": (80, 380, True),
    "co": (2, 10, False),
    "o3": (100, 168, False),
    "nox": (80, 180, False),
    "pb": (1, 2, True),
    "nh3": (400, 800, True),
}


def _count_levels(values, pollutant):
    """Count the safe ("s"), moderate ("m") and high ("h") readings of one pollutant."""
    safe_limit, moderate_limit, inclusive = _POLLUTANT_LIMITS[pollutant]
    # Missing readings belong to no category; comparing NaN would otherwise
    # push them into the "high" bucket.
    values = values.dropna()
    if inclusive:
        is_safe = values <= safe_limit
        is_moderate = ~is_safe & (values <= moderate_limit)
    else:
        is_safe = values < safe_limit
        is_moderate = ~is_safe & (values < moderate_limit)
    n_safe = int(is_safe.sum())
    n_moderate = int(is_moderate.sum())
    return {"s": n_safe, "m": n_moderate, "h": len(values) - n_safe - n_moderate}


def summary_plot(df: pd.DataFrame):
    """ Plots important summary of data frame given. Plots a line plot and
    a histogram for each numeric column, a safe/moderate/high pie chart for
    each recognised pollutant, and prints statistics such as
    min, max, mean, median, and 95th percentile.

    The figure is saved as ``summary_plot.png`` in the current working
    directory and then shown. The input data frame is not modified.

    Parameters
    ----------
    df: data frame
        data frame to be summarised. Must contain a ``date`` field
        and at least one other numeric parameter. Recognised pollutant
        columns are pm10, pm25, sox, co, o3, nox, pb and nh3; other numeric
        columns get a line plot and histogram but no pie chart.

    Raises
    ------
    ValueError
        If df has no numeric column besides ``date``.
    """
    categories = ["s", "m", "h"]

    # drop() returns a new frame, so re-indexing it leaves the caller's df alone.
    df_all = df.drop("date", axis=1)
    df_all.index = pd.to_datetime(df["date"])
    # NOTE: the data is forward-filled but not resampled; no daily
    # aggregation is applied to the readings.
    df_all = df_all.ffill()

    columns = list(df_all.select_dtypes(include="number").columns)
    if not columns:
        raise ValueError("df must contain at least one numeric column besides 'date'")

    # squeeze=False keeps ``axes`` two-dimensional even for a single row.
    fig, axes = plt.subplots(len(columns), 3, figsize=(25, 25), squeeze=False)

    labels = ["Safe", "Moderate", "High"]
    explode = [0, 0, 1]

    # Plot line, histogram, and pie charts for each column
    for i, pollutant in enumerate(columns):
        series = df_all[pollutant]
        ax_line, ax_hist, ax_pie = axes[i]

        series.plot.line(ax=ax_line, color="gold")
        ax_line.axes.get_xaxis().set_visible(False)
        ax_line.yaxis.set_label_position("left")
        ax_line.set_ylabel(pollutant, fontsize=30, bbox=dict(facecolor="whitesmoke"))

        # Leading gaps survive the forward fill; drop them for the histogram.
        ax_hist.hist(series.dropna(), bins=50, color="green")

        if pollutant in _POLLUTANT_LIMITS:
            counts = _count_levels(series, pollutant)
            sizes = [counts[category] for category in categories]
        else:
            sizes = [0, 0, 0]

        if sum(sizes) > 0:
            ax_pie.pie(sizes, explode=explode, labels=labels, autopct="%1.1f%%", shadow=False, startangle=90)
            ax_pie.axis("equal")
            ax_pie.set_xlabel("Statistics")
        else:
            # No thresholds known (or no readings): there is nothing to chart.
            ax_pie.axis("off")

        print(f"{pollutant}\nmin = {series.min():.2f}\nmax = {series.max():.2f}\nmissing = {series.isna().sum()}\nmean = {series.mean():.2f}\nmedian = {series.median():.2f}\n95th percentile = {series.quantile(0.95):.2f}\n")

    plt.savefig("summary_plot.png", dpi=300, format="png")
    plt.show()
    print("your plots has also been saved")
    plt.close()


# =============================================================================
# df = pd.read_csv('mydata.csv')
# summary_plot(df)
# =============================================================================
Loading