def outlier_removal(data):
    """Drop rows whose income or loan amount reaches the 95th percentile.

    A row is kept only when BOTH of these hold:

    * ``ApplicantIncome`` is strictly below that column's 0.95 quantile
    * ``LoanAmount`` is strictly below that column's 0.95 quantile

    Both thresholds are computed from the unfiltered input, using
    ``interpolation='nearest'`` so each threshold is an actual observed value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the numeric columns ``ApplicantIncome`` and
        ``LoanAmount``. Other columns (of any dtype) are carried through
        untouched.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with the original index labels and column order.
        The input frame is not modified.

    Raises
    ------
    KeyError
        If either required column is missing.
    """
    # Look thresholds up by column name rather than by position in a
    # whole-frame quantile: positional lookup silently picks the wrong column
    # when the column order changes, and a whole-frame quantile fails on
    # non-numeric columns in recent pandas releases.
    income_cap = data['ApplicantIncome'].quantile(0.95, interpolation='nearest')
    loan_cap = data['LoanAmount'].quantile(0.95, interpolation='nearest')

    # Comparisons against NaN are False, so rows with a missing ApplicantIncome
    # or LoanAmount are dropped here as well.
    keep = (data['ApplicantIncome'] < income_cap) & (data['LoanAmount'] < loan_cap)
    return data[keep]
def data_cleaning(loan_data):
    """Impute missing values, then split the data into train and test sets.

    Every column that contains at least one missing value is filled:

    * numeric columns with the column mean
    * all other columns with the most frequent value (the first mode)

    The last column is treated as the target ``y``; all preceding columns
    form the feature matrix ``X``.

    Parameters
    ----------
    loan_data : pandas.DataFrame
        Frame whose final column is the target. It is copied before imputing,
        so the caller's object is left unchanged.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)`` where the train/test
        parts come from a 75/25 split of the imputed ``X`` and ``y``.
    """
    # Impute on a private copy and assign each filled column back explicitly.
    # Calling ``fillna(inplace=True)`` on the result of ``.loc[:, col]`` fills a
    # temporary object and is not guaranteed to reach the frame itself.
    loan_data = loan_data.copy()

    for name in loan_data.columns[loan_data.isnull().any()]:
        column = loan_data[name]
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # Column is entirely missing: there is nothing to impute from.
                continue
            fill_value = modes.iloc[0]
        loan_data[name] = column.fillna(fill_value)

    X, y = loan_data.iloc[:, :-1], loan_data.iloc[:, -1]

    # Seed the split explicitly. The enclosing module calls ``random.seed(9)``,
    # but scikit-learn draws from NumPy's generator, not the stdlib ``random``
    # module, so that call alone does not make the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=9)

    return X, y, X_train, X_test, y_train, y_test