Commit 8ab318d: "edited for readability"
sofia-frenk committed Nov 11, 2024 (1 parent: af5534d)
1 changed file: code/python_code.py (115 additions and 51 deletions)
@@ -265,23 +265,6 @@

df_modified2 = df_modified.copy()
df_modified2['CO2_Emitted/Hour'] = df_modified2['CO2_Emitted (US Ton)'] / df_modified2['Total_Duration']
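# Derived target: emissions intensity, i.e. CO2 emitted per unit of flight duration
# (US tons per hour, assuming Total_Duration is recorded in hours)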
-df_modified2 = df_modified2.drop(columns=['CO2_Emitted (US Ton)', 'Total_Duration'])
-
-#X_for_corrplot2 = df_modified2.drop(columns=['Fleet_Airbus A320', 'Fleet_Boeing 737s', 'Frequent_Route'])
-#correlation matrix for variables to get a visual on correlation among variables before applying decision trees
-#corr_matrix = X_for_corrplot2.corr()
-
-corr_matrix = df_modified2.corr()
-# Plot the heatmap
-plt.figure(figsize=(12, 8))
-sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
-plt.title("Correlation Matrix of Features 2")
-plt.show()
-
-print("df_modified2 head:")
-df_modified2.head()
-
-print(df_modified2[['CO2_Emitted/Hour', 'Fuel_Consumption_Rate (liters/hr)']].head())

plt.figure(figsize=(10, 5))
plt.hist(df_modified2['CO2_Emitted/Hour'], bins=30, color='skyblue', edgecolor='black')
@@ -297,11 +280,30 @@
plt.title('Distribution of Transformed Fuel Consumption Rate')
plt.show()

-X_df = df_modified2.drop(columns=['CO2_Emitted/Hour'])
+df_modified2 = df_modified2.drop(columns=['CO2_Emitted (US Ton)', 'Total_Duration'])
+
+corr_matrix2 = df_modified2.drop(columns=['Fleet_Airbus A320', 'Fleet_Boeing 737s', 'Frequent_Route'])
+#correlation matrix for variables to get a visual on correlation among variables before applying decision trees
+#corr_matrix = X_for_corrplot2.corr()
+
+corr_matrix2 = corr_matrix2.corr()
+
+# Plot the heatmap
+plt.figure(figsize=(12, 8))
+sns.heatmap(corr_matrix2, annot=True, cmap="coolwarm", fmt=".2f")
+plt.title("Correlation Matrix of Features 2")
+plt.show()
+
+print("df_modified2 head:")
+df_modified2.head()
+
+print(df_modified2[['CO2_Emitted/Hour', 'Fuel_Consumption_Rate (liters/hr)']].head())
+
+X_df2 = df_modified2.drop(columns=['CO2_Emitted/Hour'])
#X_df = X_df.astype(np.float64) #might need to get rid of

#X_df = X_df.fillna(X_df.mean())
-print(X_df.dtypes)
+print(X_df2.dtypes)
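
# Sketch, not part of this commit: if the dtypes printout above still shows object or
# boolean columns, they could be encoded before modelling (assumes pandas is imported
# as pd earlier in the file, which the DataFrame code above implies).
non_numeric_cols = X_df2.select_dtypes(exclude="number").columns
if len(non_numeric_cols) > 0:
    X_df2 = pd.get_dummies(X_df2, columns=list(non_numeric_cols), drop_first=True)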
'''X = X_df.values
std_dev = X.std(axis=0)
zero_std_columns = (std_dev == 0)
@@ -313,12 +315,12 @@
# Remove columns with zero standard deviation from X
X_reduced = X[:, ~zero_std_columns]'''

-y = df_modified2['CO2_Emitted/Hour'].values
+y2 = df_modified2['CO2_Emitted/Hour'].values

#split the data
-X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)
+X_train2, X_test2, y_train2, y_test2 = train_test_split(X_df2, y2, test_size=0.2, random_state=42)
#X_train = X_train.astype(np.float64)
-print(X_train.dtypes)
+print(X_train2.dtypes)

'''
#normalize data
@@ -361,32 +363,32 @@
best_score = np.sqrt(-grid_search.best_score_)
tree_model2 = DecisionTreeRegressor(**best_params, random_state=42)'''
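
# Sketch, not part of this commit: grid_search and best_params are referenced in the
# commented-out block above, but their setup is outside the lines shown in this hunk.
# A typical setup looks roughly like the following (the parameter grid here is
# illustrative, not the one used in the file):
'''from sklearn.model_selection import GridSearchCV
param_grid = {"max_depth": [3, 5, 10, None], "min_samples_leaf": [1, 5, 10]}
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid,
                           scoring="neg_mean_squared_error", cv=5)
grid_search.fit(X_train2, y_train2)
best_params = grid_search.best_params_'''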
-tree_model2.fit(X_train, y_train)
+tree_model2.fit(X_train2, y_train2)

# Generate predictions
-y_pred = tree_model2.predict(X_test)
+y_pred2 = tree_model2.predict(X_test2)

# Calculate and print RMSE and R^2 Score on test data
-rmse = np.sqrt(mean_squared_error(y_test, y_pred))
-r2 = r2_score(y_test, y_pred)
+rmse = np.sqrt(mean_squared_error(y_test2, y_pred2))
+r2 = r2_score(y_test2, y_pred2)
print(f"Tuned RMSE on Test Set: {rmse}")
print(f"Tuned R^2 Score on Test Set: {r2}")

#PLOTS TO VISUALIZE THE OUTCOME

#scatter plot
plt.figure(figsize=(10,6))
-plt.scatter(y_test, y_pred, alpha=0.6)
-plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
+plt.scatter(y_test2, y_pred2, alpha=0.6)
+plt.plot([y_test2.min(), y_test2.max()], [y_test2.min(), y_test2.max()], 'r--')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.show()

#residuals plot
-residuals = y_test - y_pred
+residuals2 = y_test2 - y_pred2
plt.figure(figsize=(10, 6))
-plt.scatter(y_pred, residuals, alpha=0.6)
+plt.scatter(y_pred2, residuals2, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
@@ -395,7 +397,7 @@

#histogram
plt.figure(figsize=(10, 6))
-plt.hist(residuals, bins=30, color='skyblue', edgecolor='black')
+plt.hist(residuals2, bins=30, color='skyblue', edgecolor='black')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
@@ -404,14 +406,6 @@
#EMISSIONS PER FUEL USAGE RATE -- NON BINOMIAL
df_modified3 = df_modified.copy()
df_modified3['CO2_Emitted/Fuel_Usage_Rate'] = df_modified3['CO2_Emitted (US Ton)'] / df_modified3['Fuel_Consumption_Rate (liters/hr)']
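# Derived target: CO2 emitted divided by the fuel-consumption rate (US tons per liter/hr)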
-df_modified2 = df_modified3.drop(columns=['CO2_Emitted (US Ton)', 'Fuel_Consumption_Rate (liters/hr)']) #, 'Total_Duration'
-
-corr_matrix = df_modified3.corr()
-# Plot the heatmap
-plt.figure(figsize=(12, 8))
-sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
-plt.title("Correlation Matrix of Features 3")
-plt.show()

plt.figure(figsize=(10, 5))
plt.hist(df_modified3['CO2_Emitted/Fuel_Usage_Rate'], bins=30, color='skyblue', edgecolor='black')
@@ -427,36 +421,46 @@
plt.title('Distribution of Total_Duration')
plt.show()

-X_df2 = df_modified3.drop(columns=['CO2_Emitted/Fuel_Usage_Rate'])
-y2 = df_modified3['CO2_Emitted/Fuel_Usage_Rate'].values
-X_train2, X_test2, y_train2, y_test2 = train_test_split(X_df2, y2, test_size=0.2, random_state=42)
+df_modified3 = df_modified3.drop(columns=['CO2_Emitted (US Ton)', 'Fuel_Consumption_Rate (liters/hr)']) #, 'Total_Duration'
+
+corr_matrix3 = df_modified3.drop(columns=['Fleet_Airbus A320', 'Fleet_Boeing 737s', 'Frequent_Route'])
+corr_matrix3 = corr_matrix3.corr()
+# Plot the heatmap
+plt.figure(figsize=(12, 8))
+sns.heatmap(corr_matrix3, annot=True, cmap="coolwarm", fmt=".2f")
+plt.title("Correlation Matrix of Features 3")
+plt.show()
+
+X_df3 = df_modified3.drop(columns=['CO2_Emitted/Fuel_Usage_Rate'])
+y3 = df_modified3['CO2_Emitted/Fuel_Usage_Rate'].values
+X_train3, X_test3, y_train3, y_test3 = train_test_split(X_df3, y3, test_size=0.2, random_state=42)
tree_model3 = DecisionTreeRegressor(criterion="squared_error", random_state=42)
-tree_model3.fit(X_train2, y_train2)
+tree_model3.fit(X_train3, y_train3)

# Generate predictions
-y_pred2 = tree_model3.predict(X_test2)
+y_pred3 = tree_model3.predict(X_test3)

# Calculate and print RMSE and R^2 Score on test data
-rmse = np.sqrt(mean_squared_error(y_test2, y_pred2))
-r2 = r2_score(y_test2, y_pred2)
+rmse = np.sqrt(mean_squared_error(y_test3, y_pred3))
+r2 = r2_score(y_test3, y_pred3)
print(f"Tuned RMSE on Test Set: {rmse}")
print(f"Tuned R^2 Score on Test Set: {r2}")

#PLOTS TO VISUALIZE THE OUTCOME

#scatter plot
plt.figure(figsize=(10,6))
-plt.scatter(y_test2, y_pred2, alpha=0.6)
-plt.plot([y_test2.min(), y_test2.max()], [y_test2.min(), y_test2.max()], 'r--')
+plt.scatter(y_test3, y_pred3, alpha=0.6)
+plt.plot([y_test3.min(), y_test3.max()], [y_test3.min(), y_test3.max()], 'r--')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.show()

#residuals plot
-residuals2 = y_test2 - y_pred2
+residuals3 = y_test3 - y_pred3
plt.figure(figsize=(10, 6))
-plt.scatter(y_pred2, residuals, alpha=0.6)
+plt.scatter(y_pred3, residuals3, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
@@ -465,8 +469,68 @@

#histogram
plt.figure(figsize=(10, 6))
-plt.hist(residuals2, bins=30, color='skyblue', edgecolor='black')
+plt.hist(residuals3, bins=30, color='skyblue', edgecolor='black')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.show()


+#FEATURE SELECTION FROM RANDOM FOREST 1
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.feature_selection import RFECV
+
+df_analyse_features1 = df_modified.copy()
+#df_analyse_features1['CO2_Emitted/Fuel_Usage_Rate'] = df_analyse_features1['CO2_Emitted (US Ton)'] / df_analyse_features1['Fuel_Consumption_Rate (liters/hr)']
+df_analyse_features1['CO2_Emitted/Hour'] = df_analyse_features1['CO2_Emitted (US Ton)'] / df_analyse_features1['Total_Duration']
+df_analyse_features1 = df_analyse_features1.drop(columns=['CO2_Emitted (US Ton)', 'Total_Duration'])
+#df_modified4 = df_modified4.drop(columns=['CO2_Emitted (US Ton)', 'Fuel_Consumption_Rate (liters/hr)'])
+
+X_analyse_features1 = df_analyse_features1.drop(columns=['CO2_Emitted/Hour'])
+y_analyse_features1 = df_analyse_features1['CO2_Emitted/Hour'].values
+
+features = X_analyse_features1.columns
+
+X_train_analyse_features1, X_test_analyse_features1, y_train_analyse_features1, y_test_analyse_features1 = train_test_split(X_analyse_features1, y_analyse_features1, test_size=0.2, random_state=42)
+
+rf = RandomForestRegressor(random_state=0)
+
+rf.fit(X_analyse_features1,y_analyse_features1)
+
+f_i = list(zip(features,rf.feature_importances_))
+f_i.sort(key = lambda x : x[1])
+plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
+plt.xlabel("Feature Importance 1")
+plt.ylabel("Feature 1")
+plt.title("Feature Importances from Random Forest 1")
+
+plt.show()
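
# Sketch, not part of this commit: RFECV is imported above but never called. It could be
# used to let cross-validated recursive feature elimination choose a feature subset for
# the same target, e.g.:
rfecv = RFECV(estimator=RandomForestRegressor(random_state=0), step=1, cv=5,
              scoring="neg_mean_squared_error")
rfecv.fit(X_analyse_features1, y_analyse_features1)
print("RFECV-selected features:", list(X_analyse_features1.columns[rfecv.support_]))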


+#FEATURE SELECTION FROM RANDOM FOREST 2
+df_analyse_features2 = df_modified.copy()
+df_analyse_features2['CO2_Emitted/Fuel_Usage_Rate'] = df_analyse_features2['CO2_Emitted (US Ton)'] / df_analyse_features2['Fuel_Consumption_Rate (liters/hr)']
+#df_modified4['CO2_Emitted/Hour'] = df_modified4['CO2_Emitted (US Ton)'] / df_modified4['Total_Duration']
+#df_analyse_features1 = df_analyse_features1.drop(columns=['CO2_Emitted (US Ton)', 'Total_Duration'])
+df_analyse_features2 = df_analyse_features2.drop(columns=['CO2_Emitted (US Ton)', 'Fuel_Consumption_Rate (liters/hr)'])
+
+X_analyse_features2 = df_analyse_features2.drop(columns=['CO2_Emitted/Fuel_Usage_Rate'])
+y_analyse_features2 = df_analyse_features2['CO2_Emitted/Fuel_Usage_Rate'].values
+
+features2 = X_analyse_features2.columns
+
+X_train_analyse_features2, X_test_analyse_features2, y_train_analyse_features2, y_test_analyse_features2 = train_test_split(X_analyse_features2, y_analyse_features2, test_size=0.2, random_state=42)
+
+rf2 = RandomForestRegressor(random_state=0)
+
+rf2.fit(X_analyse_features2,y_analyse_features2)
+
+f_i2 = list(zip(features2,rf2.feature_importances_))
+f_i2.sort(key = lambda x : x[1])
+plt.barh([x[0] for x in f_i2],[x[1] for x in f_i2])
+plt.xlabel("Feature Importance 2")
+plt.ylabel("Feature 2")
+plt.title("Feature Importances from Random Forest 2")
+
+plt.show()
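
# Sketch, not part of this commit: impurity-based importances from random forests can
# favour high-cardinality features, so permutation importance is a common cross-check.
# Note rf2 was fit on the full dataset above, so the test split is not strictly held out.
from sklearn.inspection import permutation_importance
perm = permutation_importance(rf2, X_test_analyse_features2, y_test_analyse_features2,
                              n_repeats=10, random_state=0)
for name, score in sorted(zip(features2, perm.importances_mean), key=lambda t: t[1]):
    print(f"{name}: {score:.4f}")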
