Commit 8ab318d: "edited for readability"
sofia-frenk committed Nov 11, 2024 (1 parent: af5534d)
1 changed file: code/python_code.py (115 additions and 51 deletions)
@@ -265,23 +265,6 @@

df_modified2 = df_modified.copy()
df_modified2['CO2_Emitted/Hour'] = df_modified2['CO2_Emitted (US Ton)'] / df_modified2['Total_Duration']
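# Derived target: emissions intensity, i.e. CO2 emitted per unit of flight duration
# (US tons per hour, assuming Total_Duration is recorded in hours)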
-df_modified2 = df_modified2.drop(columns=['CO2_Emitted (US Ton)', 'Total_Duration'])
-
-#X_for_corrplot2 = df_modified2.drop(columns=['Fleet_Airbus A320', 'Fleet_Boeing 737s', 'Frequent_Route'])
-#correlation matrix for variables to get a visual on correlation among variables before applying decision trees
-#corr_matrix = X_for_corrplot2.corr()
-
-corr_matrix = df_modified2.corr()
-# Plot the heatmap
-plt.figure(figsize=(12, 8))
-sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
-plt.title("Correlation Matrix of Features 2")
-plt.show()
-
-print("df_modified2 head:")
-df_modified2.head()
-
-print(df_modified2[['CO2_Emitted/Hour', 'Fuel_Consumption_Rate (liters/hr)']].head())

plt.figure(figsize=(10, 5))
plt.hist(df_modified2['CO2_Emitted/Hour'], bins=30, color='skyblue', edgecolor='black')
@@ -297,11 +280,30 @@
plt.title('Distribution of Transformed Fuel Consumption Rate')
plt.show()

-X_df = df_modified2.drop(columns=['CO2_Emitted/Hour'])
+df_modified2 = df_modified2.drop(columns=['CO2_Emitted (US Ton)', 'Total_Duration'])
+
+corr_matrix2 = df_modified2.drop(columns=['Fleet_Airbus A320', 'Fleet_Boeing 737s', 'Frequent_Route'])
+#correlation matrix for variables to get a visual on correlation among variables before applying decision trees
+#corr_matrix = X_for_corrplot2.corr()
+
+corr_matrix2 = corr_matrix2.corr()
+
+# Plot the heatmap
+plt.figure(figsize=(12, 8))
+sns.heatmap(corr_matrix2, annot=True, cmap="coolwarm", fmt=".2f")
+plt.title("Correlation Matrix of Features 2")
+plt.show()
+
+print("df_modified2 head:")
+df_modified2.head()
+
+print(df_modified2[['CO2_Emitted/Hour', 'Fuel_Consumption_Rate (liters/hr)']].head())
+
+X_df2 = df_modified2.drop(columns=['CO2_Emitted/Hour'])
#X_df = X_df.astype(np.float64) #might need to get rid of

#X_df = X_df.fillna(X_df.mean())
-print(X_df.dtypes)
+print(X_df2.dtypes)
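
# Sketch, not part of this commit: if the dtypes printout above still shows object or
# boolean columns, they could be encoded before modelling (assumes pandas is imported
# as pd earlier in the file, which the DataFrame code above implies).
non_numeric_cols = X_df2.select_dtypes(exclude="number").columns
if len(non_numeric_cols) > 0:
    X_df2 = pd.get_dummies(X_df2, columns=list(non_numeric_cols), drop_first=True)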
'''X = X_df.values
std_dev = X.std(axis=0)
zero_std_columns = (std_dev == 0)
@@ -313,12 +315,12 @@
# Remove columns with zero standard deviation from X
X_reduced = X[:, ~zero_std_columns]'''

-y = df_modified2['CO2_Emitted/Hour'].values
+y2 = df_modified2['CO2_Emitted/Hour'].values

#split the data
-X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)
+X_train2, X_test2, y_train2, y_test2 = train_test_split(X_df2, y2, test_size=0.2, random_state=42)
#X_train = X_train.astype(np.float64)
-print(X_train.dtypes)
+print(X_train2.dtypes)

'''
#normalize data
@@ -361,32 +363,32 @@
best_score = np.sqrt(-grid_search.best_score_)
tree_model2 = DecisionTreeRegressor(**best_params, random_state=42)'''
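
# Sketch, not part of this commit: grid_search and best_params are referenced in the
# commented-out block above, but their setup is outside the lines shown in this hunk.
# A typical setup looks roughly like the following (the parameter grid here is
# illustrative, not the one used in the file):
'''from sklearn.model_selection import GridSearchCV
param_grid = {"max_depth": [3, 5, 10, None], "min_samples_leaf": [1, 5, 10]}
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid,
                           scoring="neg_mean_squared_error", cv=5)
grid_search.fit(X_train2, y_train2)
best_params = grid_search.best_params_'''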
-tree_model2.fit(X_train, y_train)
+tree_model2.fit(X_train2, y_train2)

# Generate predictions
-y_pred = tree_model2.predict(X_test)
+y_pred2 = tree_model2.predict(X_test2)

# Calculate and print RMSE and R^2 Score on test data
-rmse = np.sqrt(mean_squared_error(y_test, y_pred))
-r2 = r2_score(y_test, y_pred)
+rmse = np.sqrt(mean_squared_error(y_test2, y_pred2))
+r2 = r2_score(y_test2, y_pred2)
print(f"Tuned RMSE on Test Set: {rmse}")
print(f"Tuned R^2 Score on Test Set: {r2}")

#PLOTS TO VISUALIZE THE OUTCOME

#scatter plot
plt.figure(figsize=(10,6))
-plt.scatter(y_test, y_pred, alpha=0.6)
-plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
+plt.scatter(y_test2, y_pred2, alpha=0.6)
+plt.plot([y_test2.min(), y_test2.max()], [y_test2.min(), y_test2.max()], 'r--')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.show()

#residuals plot
-residuals = y_test - y_pred
+residuals2 = y_test2 - y_pred2
plt.figure(figsize=(10, 6))
-plt.scatter(y_pred, residuals, alpha=0.6)
+plt.scatter(y_pred2, residuals2, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
@@ -395,7 +397,7 @@

#histogram
plt.figure(figsize=(10, 6))
-plt.hist(residuals, bins=30, color='skyblue', edgecolor='black')
+plt.hist(residuals2, bins=30, color='skyblue', edgecolor='black')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
@@ -404,14 +406,6 @@
#EMISSIONS PER FUEL USAGE RATE -- NON BINOMIAL
df_modified3 = df_modified.copy()
df_modified3['CO2_Emitted/Fuel_Usage_Rate'] = df_modified3['CO2_Emitted (US Ton)'] / df_modified3['Fuel_Consumption_Rate (liters/hr)']
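# Derived target: CO2 emitted divided by the fuel-consumption rate (US tons per liter/hr)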
-df_modified2 = df_modified3.drop(columns=['CO2_Emitted (US Ton)', 'Fuel_Consumption_Rate (liters/hr)']) #, 'Total_Duration'
-
-corr_matrix = df_modified3.corr()
-# Plot the heatmap
-plt.figure(figsize=(12, 8))
-sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
-plt.title("Correlation Matrix of Features 3")
-plt.show()

plt.figure(figsize=(10, 5))
plt.hist(df_modified3['CO2_Emitted/Fuel_Usage_Rate'], bins=30, color='skyblue', edgecolor='black')
@@ -427,36 +421,46 @@
plt.title('Distribution of Total_Duration')
plt.show()

-X_df2 = df_modified3.drop(columns=['CO2_Emitted/Fuel_Usage_Rate'])
-y2 = df_modified3['CO2_Emitted/Fuel_Usage_Rate'].values
-X_train2, X_test2, y_train2, y_test2 = train_test_split(X_df2, y2, test_size=0.2, random_state=42)
+df_modified3 = df_modified3.drop(columns=['CO2_Emitted (US Ton)', 'Fuel_Consumption_Rate (liters/hr)']) #, 'Total_Duration'
+
+corr_matrix3 = df_modified3.drop(columns=['Fleet_Airbus A320', 'Fleet_Boeing 737s', 'Frequent_Route'])
+corr_matrix3 = corr_matrix3.corr()
+# Plot the heatmap
+plt.figure(figsize=(12, 8))
+sns.heatmap(corr_matrix3, annot=True, cmap="coolwarm", fmt=".2f")
+plt.title("Correlation Matrix of Features 3")
+plt.show()
+
+X_df3 = df_modified3.drop(columns=['CO2_Emitted/Fuel_Usage_Rate'])
+y3 = df_modified3['CO2_Emitted/Fuel_Usage_Rate'].values
+X_train3, X_test3, y_train3, y_test3 = train_test_split(X_df3, y3, test_size=0.2, random_state=42)
tree_model3 = DecisionTreeRegressor(criterion="squared_error", random_state=42)
-tree_model3.fit(X_train2, y_train2)
+tree_model3.fit(X_train3, y_train3)

# Generate predictions
-y_pred2 = tree_model3.predict(X_test2)
+y_pred3 = tree_model3.predict(X_test3)

# Calculate and print RMSE and R^2 Score on test data
-rmse = np.sqrt(mean_squared_error(y_test2, y_pred2))
-r2 = r2_score(y_test2, y_pred2)
+rmse = np.sqrt(mean_squared_error(y_test3, y_pred3))
+r2 = r2_score(y_test3, y_pred3)
print(f"Tuned RMSE on Test Set: {rmse}")
print(f"Tuned R^2 Score on Test Set: {r2}")

#PLOTS TO VISUALIZE THE OUTCOME

#scatter plot
plt.figure(figsize=(10,6))
-plt.scatter(y_test2, y_pred2, alpha=0.6)
-plt.plot([y_test2.min(), y_test2.max()], [y_test2.min(), y_test2.max()], 'r--')
+plt.scatter(y_test3, y_pred3, alpha=0.6)
+plt.plot([y_test3.min(), y_test3.max()], [y_test3.min(), y_test3.max()], 'r--')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.show()

#residuals plot
-residuals2 = y_test2 - y_pred2
+residuals3 = y_test3 - y_pred3
plt.figure(figsize=(10, 6))
-plt.scatter(y_pred2, residuals, alpha=0.6)
+plt.scatter(y_pred3, residuals3, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
@@ -465,8 +469,68 @@

#histogram
plt.figure(figsize=(10, 6))
-plt.hist(residuals2, bins=30, color='skyblue', edgecolor='black')
+plt.hist(residuals3, bins=30, color='skyblue', edgecolor='black')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.show()


+#FEATURE SELECTION FROM RANDOM FOREST 1
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.feature_selection import RFECV
+
+df_analyse_features1 = df_modified.copy()
+#df_analyse_features1['CO2_Emitted/Fuel_Usage_Rate'] = df_analyse_features1['CO2_Emitted (US Ton)'] / df_analyse_features1['Fuel_Consumption_Rate (liters/hr)']
+df_analyse_features1['CO2_Emitted/Hour'] = df_analyse_features1['CO2_Emitted (US Ton)'] / df_analyse_features1['Total_Duration']
+df_analyse_features1 = df_analyse_features1.drop(columns=['CO2_Emitted (US Ton)', 'Total_Duration'])
+#df_modified4 = df_modified4.drop(columns=['CO2_Emitted (US Ton)', 'Fuel_Consumption_Rate (liters/hr)'])
+
+X_analyse_features1 = df_analyse_features1.drop(columns=['CO2_Emitted/Hour'])
+y_analyse_features1 = df_analyse_features1['CO2_Emitted/Hour'].values
+
+features = X_analyse_features1.columns
+
+X_train_analyse_features1, X_test_analyse_features1, y_train_analyse_features1, y_test_analyse_features1 = train_test_split(X_analyse_features1, y_analyse_features1, test_size=0.2, random_state=42)
+
+rf = RandomForestRegressor(random_state=0)
+
+rf.fit(X_analyse_features1,y_analyse_features1)
+
+f_i = list(zip(features,rf.feature_importances_))
+f_i.sort(key = lambda x : x[1])
+plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
+plt.xlabel("Feature Importance 1")
+plt.ylabel("Feature 1")
+plt.title("Feature Importances from Random Forest 1")
+
+plt.show()
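
# Sketch, not part of this commit: RFECV is imported above but never called. It could be
# used to let cross-validated recursive feature elimination choose a feature subset for
# the same target, e.g.:
rfecv = RFECV(estimator=RandomForestRegressor(random_state=0), step=1, cv=5,
              scoring="neg_mean_squared_error")
rfecv.fit(X_analyse_features1, y_analyse_features1)
print("RFECV-selected features:", list(X_analyse_features1.columns[rfecv.support_]))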


+#FEATURE SELECTION FROM RANDOM FOREST 2
+df_analyse_features2 = df_modified.copy()
+df_analyse_features2['CO2_Emitted/Fuel_Usage_Rate'] = df_analyse_features2['CO2_Emitted (US Ton)'] / df_analyse_features2['Fuel_Consumption_Rate (liters/hr)']
+#df_modified4['CO2_Emitted/Hour'] = df_modified4['CO2_Emitted (US Ton)'] / df_modified4['Total_Duration']
+#df_analyse_features1 = df_analyse_features1.drop(columns=['CO2_Emitted (US Ton)', 'Total_Duration'])
+df_analyse_features2 = df_analyse_features2.drop(columns=['CO2_Emitted (US Ton)', 'Fuel_Consumption_Rate (liters/hr)'])
+
+X_analyse_features2 = df_analyse_features2.drop(columns=['CO2_Emitted/Fuel_Usage_Rate'])
+y_analyse_features2 = df_analyse_features2['CO2_Emitted/Fuel_Usage_Rate'].values
+
+features2 = X_analyse_features2.columns
+
+X_train_analyse_features2, X_test_analyse_features2, y_train_analyse_features2, y_test_analyse_features2 = train_test_split(X_analyse_features2, y_analyse_features2, test_size=0.2, random_state=42)
+
+rf2 = RandomForestRegressor(random_state=0)
+
+rf2.fit(X_analyse_features2,y_analyse_features2)
+
+f_i2 = list(zip(features2,rf2.feature_importances_))
+f_i2.sort(key = lambda x : x[1])
+plt.barh([x[0] for x in f_i2],[x[1] for x in f_i2])
+plt.xlabel("Feature Importance 2")
+plt.ylabel("Feature 2")
+plt.title("Feature Importances from Random Forest 2")
+
+plt.show()
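
# Sketch, not part of this commit: impurity-based importances from random forests can
# favour high-cardinality features, so permutation importance is a common cross-check.
# Note rf2 was fit on the full dataset above, so the test split is not strictly held out.
from sklearn.inspection import permutation_importance
perm = permutation_importance(rf2, X_test_analyse_features2, y_test_analyse_features2,
                              n_repeats=10, random_state=0)
for name, score in sorted(zip(features2, perm.importances_mean), key=lambda t: t[1]):
    print(f"{name}: {score:.4f}")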
