-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
e183935
commit ca3ebc4
Showing
3 changed files
with
51 additions
and
211 deletions.
There are no files selected for viewing
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,129 +1,90 @@ | ||
import pandas as pd | ||
import numpy as np | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder | ||
from sklearn.compose import ColumnTransformer | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.linear_model import LogisticRegression | ||
from imblearn.over_sampling import RandomOverSampler | ||
from sklearn.base import BaseEstimator, TransformerMixin | ||
from fairml import audit_model | ||
import matplotlib.pyplot as plt | ||
from fairml import plot_dependencies | ||
from imblearn.over_sampling import RandomOverSampler | ||
from sklearn.linear_model import LogisticRegression | ||
from fairml import audit_model, plot_dependencies | ||
from imblearn.pipeline import Pipeline as ImbPipeline | ||
import matplotlib.pyplot as plt | ||
|
||
|
||
# custom transformer to drop columns | ||
class DropColumns(BaseEstimator, TransformerMixin): | ||
def __init__(self, columns): | ||
self.columns = columns | ||
class DataFrameEnsurer(BaseEstimator, TransformerMixin): | ||
"""Transforms an array back to a dataframe, ensuring proper column names after transformations.""" | ||
def __init__(self, preprocessor): | ||
self.preprocessor = preprocessor | ||
self.feature_names = None | ||
|
||
def fit(self, X, y=None): | ||
self.feature_names = self.preprocessor.get_feature_names_out() | ||
return self | ||
|
||
def transform(self, X): | ||
return X.drop(self.columns, axis=1) | ||
if self.feature_names is None: | ||
raise Exception("The transformer is not yet fitted with feature names.") | ||
return pd.DataFrame(X, columns=self.feature_names) | ||
|
||
# load data | ||
def load_data(filepath): | ||
# Load and preprocess data | ||
def preprocess_data(filepath): | ||
df = pd.read_csv(filepath, delimiter=';') | ||
df['y'] = df['y'].map({'yes': 1, 'no': 0}) | ||
return df | ||
|
||
class DataFrameTransformer(BaseEstimator, TransformerMixin): | ||
def __init__(self, columns): | ||
self.columns = columns # columns should be a list of column names | ||
|
||
def fit(self, X, y=None): | ||
return self | ||
|
||
def transform(self, X): | ||
return pd.DataFrame(X, columns=self.columns) | ||
df = df.drop(['contact', 'poutcome', 'duration'], axis=1) # drop columns that are not informative | ||
|
||
def preprocess_features(df): | ||
# categorical and numerical columns | ||
categorical_cols = [col for col in df.columns if df[col].dtype == 'object'] | ||
numerical_cols = [col for col in df.select_dtypes(include=['int64', 'float64']).columns if col not in ['y']] | ||
|
||
numerical_transformer = Pipeline(steps=[ | ||
('imputer', SimpleImputer(strategy='mean')), | ||
('scaler', MinMaxScaler()), | ||
('to_df', DataFrameTransformer(numerical_cols)) # Convert back to DataFrame | ||
]) | ||
|
||
categorical_transformer = Pipeline(steps=[ | ||
('imputer', SimpleImputer(strategy='constant', fill_value='missing')), | ||
('onehot', OneHotEncoder(handle_unknown='ignore')), | ||
('to_df', DataFrameTransformer(categorical_cols)) # Convert back to DataFrame | ||
]) | ||
|
||
# Combine using ColumnTransformer | ||
preprocessor = ColumnTransformer( | ||
transformers=[ | ||
('num', numerical_transformer, numerical_cols), | ||
('cat', categorical_transformer, categorical_cols) | ||
], remainder='passthrough') # 'passthrough' forwards any columns not listed above without transforming them | ||
numerical_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64'] and col != 'y'] | ||
|
||
return preprocessor | ||
|
||
"""" | ||
# Preprocess features | ||
def preprocess_features(df): | ||
categorical_cols = [col for col in df.columns if df[col].dtype == 'object'] | ||
numerical_cols = [col for col in df.select_dtypes(include=['int64', 'float64']).columns if col not in ['y']] | ||
numerical_transformer = Pipeline(steps=[ | ||
numerical_transformer = Pipeline([ | ||
('imputer', SimpleImputer(strategy='mean')), | ||
('scaler', MinMaxScaler()) | ||
]) | ||
|
||
categorical_transformer = Pipeline(steps=[ | ||
categorical_transformer = Pipeline([ | ||
('imputer', SimpleImputer(strategy='constant', fill_value='missing')), | ||
('onehot', OneHotEncoder(handle_unknown='ignore')) | ||
]) | ||
|
||
preprocessor = ColumnTransformer( | ||
transformers=[ | ||
('num', numerical_transformer, numerical_cols), | ||
('cat', categorical_transformer, categorical_cols) | ||
]) | ||
preprocessor = ColumnTransformer([ | ||
('num', numerical_transformer, numerical_cols), | ||
('cat', categorical_transformer, categorical_cols) | ||
], remainder='passthrough') | ||
|
||
return preprocessor | ||
""" | ||
|
||
# build the full preprocessing and modeling pipeline | ||
def build_pipeline(df, drop_cols): | ||
pipeline = ImbPipeline(steps=[ | ||
('drop_columns', DropColumns(columns=drop_cols)), | ||
('preprocessor', preprocess_features(df)), | ||
# Setup the full pipeline | ||
pipeline = ImbPipeline([ | ||
('preprocessor', preprocessor), | ||
('to_df', DataFrameEnsurer(preprocessor)), | ||
('resampler', RandomOverSampler(random_state=0)), | ||
('classifier', LogisticRegression(random_state=42)) | ||
('classifier', LogisticRegression(random_state=42, max_iter=1000)) | ||
]) | ||
return pipeline | ||
|
||
return df, pipeline | ||
|
||
|
||
def perform_fairness_analysis(filepath): | ||
df, pipeline = preprocess_data(filepath) | ||
X_train, X_test, y_train, y_test = train_test_split(df.drop('y', axis=1), df['y'], test_size=0.2, random_state=42) | ||
pipeline.fit(X_train, y_train) | ||
|
||
filepath = '/Users/sheidamajidi/Desktop/Winter2024/Winter2024-2/INSY695-076/Project/bank-full.csv' | ||
df = load_data(filepath) | ||
X_train, X_test, y_train, y_test = train_test_split(df.drop('y', axis=1), df['y'], test_size=0.2, random_state=42) | ||
model = pipeline.named_steps['classifier'] | ||
preprocessed_data = pipeline.named_steps['to_df'].transform(pipeline.named_steps['preprocessor'].transform(X_train)) | ||
|
||
# Build and train the pipeline | ||
drop_cols = [] # no columns to drop but for future if we want to drop columns | ||
pipeline = build_pipeline(df, drop_cols) | ||
pipeline.fit(X_train, y_train) | ||
# Positive-class probabilities from predict_proba on the preprocessed training data | ||
predictions = model.predict_proba(preprocessed_data)[:, 1] # probabilities for the positive class | ||
|
||
# Audit the model using the probabilities of the positive class | ||
total, _ = audit_model(lambda x: model.predict_proba(x)[:, 1], preprocessed_data) | ||
|
||
|
||
# Fairness analysis | ||
# use the trained classifier for predictions and pass the preprocessed training set (X_train) to the audit | ||
model = pipeline.named_steps['classifier'] | ||
preprocessed_data = pipeline.named_steps['preprocessor'].transform(X_train) | ||
fig = plot_dependencies( | ||
total.median(), | ||
title="FairML feature dependence", | ||
fig_size=(10, 8) | ||
) | ||
plt.savefig('fairness_analysis.png') | ||
|
||
# Audit the model | ||
total, _ = audit_model(model.predict_proba, preprocessed_data) | ||
|
||
# Plotting the results | ||
fig = plot_dependencies( | ||
total.median(), | ||
title="FairML feature dependence", | ||
fig_size=(10, 8) | ||
) | ||
plt.show() | ||
if __name__ == '__main__': | ||
perform_fairness_analysis('/Users/sheidamajidi/Desktop/Winter2024/Winter2024-2/INSY695-076/Project/bank-full.csv') |
This file was deleted.
Oops, something went wrong.