Skip to content

Commit 2593084

Browse files
committed
update tests/test_features/test_data_preprocess.py
1 parent 6a79d64 commit 2593084

File tree

1 file changed

+140
-0
lines changed

1 file changed

+140
-0
lines changed

tests/test_features/test_data_preprocess.py

+140
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pytest
22
import pandas as pd
33
import os
4+
import numpy as np
45
import src.features.data_preprocess as dp
56

67

@@ -83,3 +84,142 @@ def test_ordinalencoding_test(mock_model_path):
8384
assert transformed_df_train["EmploymentStatus"].equals(
8485
transformed_df_test["EmploymentStatus"]
8586
)
87+
88+
89+
def test_onehotencoding_train(mock_model_path):
    """Exercise ``dp.onehotencoding`` with ``train=True`` on a small frame.

    Verifies that each source column is absent from the result, that at
    least one ``<column>_``-prefixed column exists for each of them, and
    that ``one_hot_encoder.pkl`` exists under ``mock_model_path`` afterwards.

    ``mock_model_path`` is a fixture defined outside this view; presumably a
    temporary directory -- confirm against the fixture definition.
    """
    encoded_columns = (
        "EducationLevel",
        "MaritalStatus",
        "HomeOwnershipStatus",
        "LoanPurpose",
        "NumberOfDependents",
    )
    sample_dataframe = pd.DataFrame(
        {
            "EducationLevel": [
                "Master", "Associate", "Bachelor", "High School"
            ],
            "MaritalStatus": ["Married", "Single", "Married", "Single"],
            "HomeOwnershipStatus": ["Rent", "Own", "Mortgage", "Rent"],
            "LoanPurpose": ["Auto", "Debt Consolidation", "Home", "Other"],
            "NumberOfDependents": [0, 2, 1, 3],
        }
    )

    transformed_df = dp.onehotencoding(
        sample_dataframe, mock_model_path, train=True
    )

    # The raw columns must no longer be present in the result.
    for name in encoded_columns:
        assert name not in transformed_df.columns

    # Each raw column must have produced at least one prefixed column.
    for name in encoded_columns:
        prefix = name + "_"
        assert any(col.startswith(prefix) for col in transformed_df.columns)

    # After the training-mode call the encoder file must exist on disk.
    model_file = os.path.join(mock_model_path, "one_hot_encoder.pkl")
    assert os.path.exists(model_file)
123+
124+
125+
def test_onehotencoding_test(mock_model_path):
    """Exercise ``dp.onehotencoding`` with ``train=False`` after a train run.

    The function is first called with ``train=True`` on the sample frame
    (result discarded), then with ``train=False`` on the same frame. The
    second result must lack every source column and contain at least one
    ``<column>_``-prefixed column for each of them.

    ``mock_model_path`` is a fixture defined outside this view; presumably a
    temporary directory -- confirm against the fixture definition.
    """
    encoded_columns = (
        "EducationLevel",
        "MaritalStatus",
        "HomeOwnershipStatus",
        "LoanPurpose",
        "NumberOfDependents",
    )
    sample_dataframe = pd.DataFrame(
        {
            "EducationLevel": [
                "Master", "Associate", "Bachelor", "High School"
            ],
            "MaritalStatus": ["Married", "Single", "Married", "Single"],
            "HomeOwnershipStatus": ["Rent", "Own", "Mortgage", "Rent"],
            "LoanPurpose": ["Auto", "Debt Consolidation", "Home", "Other"],
            "NumberOfDependents": [0, 2, 1, 3],
        }
    )
    model_dir = str(mock_model_path)

    # Training-mode call first; its return value is not needed here.
    dp.onehotencoding(sample_dataframe, model_dir, train=True)

    transformed_df_test = dp.onehotencoding(
        sample_dataframe, model_dir, train=False
    )
    result_columns = transformed_df_test.columns

    # The raw columns must no longer be present in the result.
    for name in encoded_columns:
        assert name not in result_columns

    # Each raw column must have produced at least one prefixed column.
    for name in encoded_columns:
        prefix = name + "_"
        assert any(col.startswith(prefix) for col in result_columns)
160+
161+
162+
def test_normalization_train(mock_model_path):
    """Exercise ``dp.normalization`` with ``train=True`` on random data.

    Verifies that ``standardscaler.pkl`` exists under ``mock_model_path``
    after the call and that the returned data has mean close to 0 and
    standard deviation close to 1 (absolute tolerance 0.1).

    ``mock_model_path`` is a fixture defined outside this view; presumably a
    temporary directory -- confirm against the fixture definition.
    """
    num_cols = ['Age', 'AnnualIncome', 'CreditScore',
                'Experience', 'LoanAmount', 'LoanDuration',
                'MonthlyDebtPayments', 'CreditCardUtilizationRate',
                'NumberOfOpenCreditLines', 'NumberOfCreditInquiries',
                'DebtToIncomeRatio', 'PaymentHistory',
                'LengthOfCreditHistory',
                'SavingsAccountBalance', 'CheckingAccountBalance',
                'TotalAssets', 'TotalLiabilities', 'MonthlyIncome',
                'UtilityBillsPaymentHistory', 'JobTenure', 'NetWorth',
                'BaseInterestRate', 'InterestRate', 'MonthlyLoanPayment',
                'TotalDebtToIncomeRatio', 'AnIncomeToAssetsRatio',
                'AnExperienceToAnIncomeRatio', 'LoantoAnIncomeRatio',
                'DependetToAnIncomeRatio', 'LoansToAssetsRatio',
                'LoanPaymentToIncomeRatio', 'AnIncomeToDepts',
                'AssetsToLoan']

    # A locally seeded generator keeps the test data identical on every run
    # (so a failure can be reproduced) without touching the global RNG state.
    rng = np.random.default_rng(0)
    data = rng.random((10, len(num_cols))) * 100
    df_train = pd.DataFrame(data, columns=num_cols)

    df_normalized = dp.normalization(
        df_train, str(mock_model_path), train=True
    )

    # After the training-mode call the scaler file must exist on disk.
    model_file = os.path.join(mock_model_path, 'standardscaler.pkl')
    assert os.path.exists(model_file)

    # NOTE(review): the 0.1 tolerance is loose; presumably it absorbs the
    # sample-vs-population std difference on 10 rows -- confirm against
    # the scaler used by dp.normalization.
    assert np.all(np.isclose(df_normalized.mean(), 0, atol=1e-1))
    assert np.all(np.isclose(df_normalized.std(), 1, atol=1e-1))
194+
195+
196+
def test_normalization_test(mock_model_path):
    """Check ``dp.normalization`` gives equal output for train and test mode.

    The same numeric values are normalized once with ``train=True`` and once
    with ``train=False`` against the same model directory; the two results
    must agree to 5 decimal places.

    ``mock_model_path`` is a fixture defined outside this view; presumably a
    temporary directory -- confirm against the fixture definition.
    """
    num_cols = ['Age', 'AnnualIncome', 'CreditScore',
                'Experience', 'LoanAmount', 'LoanDuration',
                'MonthlyDebtPayments', 'CreditCardUtilizationRate',
                'NumberOfOpenCreditLines', 'NumberOfCreditInquiries',
                'DebtToIncomeRatio', 'PaymentHistory',
                'LengthOfCreditHistory',
                'SavingsAccountBalance', 'CheckingAccountBalance',
                'TotalAssets', 'TotalLiabilities', 'MonthlyIncome',
                'UtilityBillsPaymentHistory', 'JobTenure', 'NetWorth',
                'BaseInterestRate', 'InterestRate', 'MonthlyLoanPayment',
                'TotalDebtToIncomeRatio', 'AnIncomeToAssetsRatio',
                'AnExperienceToAnIncomeRatio', 'LoantoAnIncomeRatio',
                'DependetToAnIncomeRatio', 'LoansToAssetsRatio',
                'LoanPaymentToIncomeRatio', 'AnIncomeToDepts',
                'AssetsToLoan']

    # A locally seeded generator keeps the test data identical on every run
    # (so a failure can be reproduced) without touching the global RNG state.
    rng = np.random.default_rng(0)
    data = rng.random((10, len(num_cols))) * 100

    # Give each frame its own copy of the values: pandas may wrap a 2-D
    # ndarray without copying, so an in-place write during the train call
    # could otherwise leak into the test frame.
    df_train = pd.DataFrame(data.copy(), columns=num_cols)
    df_test = pd.DataFrame(data.copy(), columns=num_cols)

    # Pass the model directory in the same form to both calls.
    model_dir = str(mock_model_path)
    df_train_normalized = dp.normalization(df_train, model_dir, train=True)
    df_test_normalized = dp.normalization(df_test, model_dir, train=False)

    # Compare plain arrays so both results are converted the same way.
    np.testing.assert_array_almost_equal(
        np.asarray(df_train_normalized),
        np.asarray(df_test_normalized),
        decimal=5,
    )

0 commit comments

Comments
 (0)