Skip to content

Commit

Permalink
fairml final
Browse files Browse the repository at this point in the history
  • Loading branch information
SheidaMajidi committed Apr 25, 2024
1 parent e183935 commit ca3ebc4
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 211 deletions.
File renamed without changes.
141 changes: 51 additions & 90 deletions fairness_analysis/fairML_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,129 +1,90 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.base import BaseEstimator, TransformerMixin
from fairml import audit_model
import matplotlib.pyplot as plt
from fairml import plot_dependencies
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from fairml import audit_model, plot_dependencies
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt


# custom transformer to drop columns
class DropColumns(BaseEstimator, TransformerMixin):
def __init__(self, columns):
self.columns = columns
class DataFrameEnsurer(BaseEstimator, TransformerMixin):
"""Transforms an array back to a dataframe, ensuring proper column names after transformations."""
def __init__(self, preprocessor):
self.preprocessor = preprocessor
self.feature_names = None

def fit(self, X, y=None):
self.feature_names = self.preprocessor.get_feature_names_out()
return self

def transform(self, X):
return X.drop(self.columns, axis=1)
if self.feature_names is None:
raise Exception("The transformer is not yet fitted with feature names.")
return pd.DataFrame(X, columns=self.feature_names)

# load data
def load_data(filepath):
# Load and preprocess data
def preprocess_data(filepath):
df = pd.read_csv(filepath, delimiter=';')
df['y'] = df['y'].map({'yes': 1, 'no': 0})
return df

class DataFrameTransformer(BaseEstimator, TransformerMixin):
def __init__(self, columns):
self.columns = columns # columns should be a list of column names

def fit(self, X, y=None):
return self

def transform(self, X):
return pd.DataFrame(X, columns=self.columns)
df = df.drop(['contact', 'poutcome', 'duration'], axis=1) # drop columns that are not informative

def preprocess_features(df):
# categorical and numerical columns
categorical_cols = [col for col in df.columns if df[col].dtype == 'object']
numerical_cols = [col for col in df.select_dtypes(include=['int64', 'float64']).columns if col not in ['y']]

numerical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('scaler', MinMaxScaler()),
('to_df', DataFrameTransformer(numerical_cols)) # Convert back to DataFrame
])

categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore')),
('to_df', DataFrameTransformer(categorical_cols)) # Convert back to DataFrame
])

# Combine using ColumnTransformer
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
], remainder='passthrough') # 'passthrough' to keep columns as DataFrame if untouched
numerical_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64'] and col != 'y']

return preprocessor

""""
# Preprocess features
def preprocess_features(df):
categorical_cols = [col for col in df.columns if df[col].dtype == 'object']
numerical_cols = [col for col in df.select_dtypes(include=['int64', 'float64']).columns if col not in ['y']]
numerical_transformer = Pipeline(steps=[
numerical_transformer = Pipeline([
('imputer', SimpleImputer(strategy='mean')),
('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
categorical_transformer = Pipeline([
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
])
preprocessor = ColumnTransformer([
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
], remainder='passthrough')

return preprocessor
"""

# build the full preprocessing and modeling pipeline
def build_pipeline(df, drop_cols):
pipeline = ImbPipeline(steps=[
('drop_columns', DropColumns(columns=drop_cols)),
('preprocessor', preprocess_features(df)),
# Set up the full pipeline
pipeline = ImbPipeline([
('preprocessor', preprocessor),
('to_df', DataFrameEnsurer(preprocessor)),
('resampler', RandomOverSampler(random_state=0)),
('classifier', LogisticRegression(random_state=42))
('classifier', LogisticRegression(random_state=42, max_iter=1000))
])
return pipeline

return df, pipeline


def perform_fairness_analysis(filepath):
df, pipeline = preprocess_data(filepath)
X_train, X_test, y_train, y_test = train_test_split(df.drop('y', axis=1), df['y'], test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

filepath = '/Users/sheidamajidi/Desktop/Winter2024/Winter2024-2/INSY695-076/Project/bank-full.csv'
df = load_data(filepath)
X_train, X_test, y_train, y_test = train_test_split(df.drop('y', axis=1), df['y'], test_size=0.2, random_state=42)
model = pipeline.named_steps['classifier']
preprocessed_data = pipeline.named_steps['to_df'].transform(pipeline.named_steps['preprocessor'].transform(X_train))

# Build and train the pipeline
drop_cols = [] # no columns to drop but for future if we want to drop columns
pipeline = build_pipeline(df, drop_cols)
pipeline.fit(X_train, y_train)
# Predicted probabilities on the preprocessed training data
predictions = model.predict_proba(preprocessed_data)[:, 1] # probabilities for the positive class

# Audit the model using the probabilities of the positive class
total, _ = audit_model(lambda x: model.predict_proba(x)[:, 1], preprocessed_data)


# Fairness analysis
# Use the trained classifier for predictions and audit it on the preprocessed training data (X_train)
model = pipeline.named_steps['classifier']
preprocessed_data = pipeline.named_steps['preprocessor'].transform(X_train)
fig = plot_dependencies(
total.median(),
title="FairML feature dependence",
fig_size=(10, 8)
)
plt.savefig('fairness_analysis.png')

# Audit the model
total, _ = audit_model(model.predict_proba, preprocessed_data)

# Plotting the results
fig = plot_dependencies(
total.median(),
title="FairML feature dependence",
fig_size=(10, 8)
)
plt.show()
if __name__ == '__main__':
perform_fairness_analysis('/Users/sheidamajidi/Desktop/Winter2024/Winter2024-2/INSY695-076/Project/bank-full.csv')
121 changes: 0 additions & 121 deletions fairness_analysis/fairML_preprocessing_.py

This file was deleted.

0 comments on commit ca3ebc4

Please sign in to comment.