From bb7807467207ab2ddcd097b136edc99cae244be2 Mon Sep 17 00:00:00 2001 From: Ashkan Ghanavati Date: Wed, 25 Oct 2023 14:21:34 -0400 Subject: [PATCH] final cleaning to follow pylint rules for spacing and docstrings --- src/anomaly_code_handler.py | 27 +++++++++++---------- src/data_loader.py | 32 ++++++++++--------------- src/duplicates_handler.py | 23 +++++++++++------- src/missing_values_handler.py | 21 ++++++++-------- src/transaction_status_handler.py | 25 ++++++++++--------- test/test_anomaly_code_handler.py | 25 ++++++++++--------- test/test_data_loader.py | 16 ++++++------- test/test_duplicates_handler.py | 30 +++++++++++++---------- test/test_missing_values_handler.py | 22 ++++++++++------- test/test_transaction_status_handler.py | 25 ++++++++++--------- 10 files changed, 130 insertions(+), 116 deletions(-) diff --git a/src/anomaly_code_handler.py b/src/anomaly_code_handler.py index 529cf46..82bcf07 100644 --- a/src/anomaly_code_handler.py +++ b/src/anomaly_code_handler.py @@ -1,40 +1,43 @@ -import pandas as pd +""" +A module for removing anomalies from StockCode column if they have 0 or 1 +digit characters since the normal values are 5 or 6 digits. 
+""" + import pickle import os # Determine the absolute path of the project directory PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed','after_transaction_status.pkl') +OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed', 'after_anomaly_code.pkl') -INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_transaction_status.pkl') -OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_anomaly_code.pkl') - -def handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH): +def handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH, + output_pickle_path=OUTPUT_PICKLE_PATH): """ Load the DataFrame from the input pickle, remove rows with stock codes that - have 0 or 1 numeric characters, then save the DataFrame back to a pickle and return its path. + have 0 or 1 numeric characters, + then save the DataFrame back to a pickle and return its path. :param input_pickle_path: Path to the input pickle file. :param output_pickle_path: Path to the output pickle file. :return: Path to the saved pickle file. 
""" - # Load DataFrame from input pickle if os.path.exists(input_pickle_path): with open(input_pickle_path, "rb") as file: df = pickle.load(file) else: raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}") - # Finding the stock codes with 0 and 1 numeric characters unique_stock_codes = df['StockCode'].unique() - anomalous_stock_codes = [code for code in unique_stock_codes if sum(c.isdigit() for c in str(code)) in (0, 1)] - + anomalous_stock_codes = [code for code in unique_stock_codes if + sum(c.isdigit() for c in str(code)) in (0, 1)] # Removing rows with these anomalous stock codes df = df[~df['StockCode'].isin(anomalous_stock_codes)] - # Save the data to output pickle with open(output_pickle_path, "wb") as file: pickle.dump(df, file) - print(f"Data saved to {output_pickle_path}.") return output_pickle_path diff --git a/src/data_loader.py b/src/data_loader.py index 67dc4e4..149f296 100644 --- a/src/data_loader.py +++ b/src/data_loader.py @@ -1,55 +1,47 @@ +""" +Module to handle the loading of e-commerce dataset from either pickle or Excel file format. +""" + +import pickle import os import pandas as pd -import pickle # Determine the absolute path of the project directory PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # Use the project directory to construct paths to other directories -DEFAULT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'raw_data.pkl') +DEFAULT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed', 'raw_data.pkl') DEFAULT_EXCEL_PATH = os.path.join(PROJECT_DIR, 'data', 'Online Retail.xlsx') def load_data(pickle_path=DEFAULT_PICKLE_PATH, excel_path=DEFAULT_EXCEL_PATH): """ - Load the e-commerce dataset. + Load the e-commerce dataset. First, try to load from the pickle file. If it doesn't exist, load from the excel file. - Regardless of the source, save the loaded data as a pickle for future use and return the path to that pickle. 
+ Regardless of the source, save the loaded data as a pickle for future use and + return the path to that pickle. :param pickle_path: Path to the pickle file. - :param csv_path: Path to the Excel file. + :param excel_path: Path to the Excel file. :return: Path to the saved pickle file. """ - # Placeholder for the DataFrame df = None - # Check if pickle file exists if os.path.exists(pickle_path): with open(pickle_path, "rb") as file: df = pickle.load(file) print(f"Data loaded successfully from {pickle_path}.") - - # If pickle doesn't exist, load CSV + # If pickle doesn't exist, load from Excel elif os.path.exists(excel_path): df = pd.read_excel(excel_path) print(f"Data loaded from {excel_path}.") - - else: error_message = f"No data found in the specified paths: {pickle_path} or {excel_path}" print(error_message) raise FileNotFoundError(error_message) - # Save the data to pickle for future use (or re-save it if loaded from existing pickle) with open(pickle_path, "wb") as file: pickle.dump(df, file) - print(f"Data saved to {pickle_path} for future use.") return pickle_path - - - - - - - diff --git a/src/duplicates_handler.py b/src/duplicates_handler.py index 2b03919..82824d8 100644 --- a/src/duplicates_handler.py +++ b/src/duplicates_handler.py @@ -1,12 +1,22 @@ -import pandas as pd +""" +A module for removing duplicates in dataset based on subset of +following columns: +- InvoiceNo +- StockCode +- Description +- CustomerID +- Quantity +""" + import pickle import os # Determine the absolute path of the project directory PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_missing_values.pkl') -OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_duplicates.pkl') +INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed','after_missing_values.pkl') +OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed', 'after_duplicates.pkl') 
 def remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH): """ @@ -17,23 +27,18 @@ def remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OU :param output_pickle_path: Path to the output pickle file. :return: Path to the saved pickle file. """ - # Load DataFrame from input pickle if os.path.exists(input_pickle_path): with open(input_pickle_path, "rb") as file: df = pickle.load(file) else: raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}") - # Columns to check for duplicates columns_to_check = ['InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Quantity'] - # Drop duplicates df = df.drop_duplicates(subset=columns_to_check) - # Save the data to output pickle with open(output_pickle_path, "wb") as file: pickle.dump(df, file) - print(f"Data saved to {output_pickle_path}.") return output_pickle_path diff --git a/src/missing_values_handler.py b/src/missing_values_handler.py index ae6bec4..b4e41d7 100644 --- a/src/missing_values_handler.py +++ b/src/missing_values_handler.py @@ -1,7 +1,10 @@ -import pickle -import pandas as pd +""" +A module for removing missing values from dataset based on CustomerID +and Description column. +""" + import os -# ======================== Missing Values ======================== +import pickle # Determine the absolute path of the project directory PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -11,36 +14,32 @@ def handle_missing(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH): """ - Load the DataFrame from the input pickle, remove rows with missing values in 'CustomerID' and 'Description' columns. + Load the DataFrame from the input pickle, + remove rows with missing values in 'CustomerID' and 'Description' columns. Then, check if there are any missing values left in the dataframe. - If there are, raise a ValueError. Finally, save the DataFrame back to a pickle and return its path. 
+ If there are, raise a ValueError. Finally, + save the DataFrame back to a pickle and return its path. :param input_pickle_path: Path to the input pickle file. :param output_pickle_path: Path to the output pickle file. :return: Path to the saved pickle file. """ - # Load DataFrame from input pickle if os.path.exists(input_pickle_path): with open(input_pickle_path, "rb") as file: df = pickle.load(file) else: raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}") - # Remove rows with missing values in 'CustomerID' and 'Description' df = df.dropna(subset=['CustomerID', 'Description']) - # Check if there are any missing values left if df.isna().sum().sum() != 0: missing_count = df.isna().sum().sum() message = f"There are {missing_count} missing values left in the dataframe." print(message) raise ValueError(message) - # Save the data to output pickle with open(output_pickle_path, "wb") as file: pickle.dump(df, file) - print(f"Data saved to {output_pickle_path}.") return output_pickle_path - diff --git a/src/transaction_status_handler.py b/src/transaction_status_handler.py index 63c90eb..ace6197 100644 --- a/src/transaction_status_handler.py +++ b/src/transaction_status_handler.py @@ -1,15 +1,22 @@ -import pandas as pd +""" +A module for adding a new column named transaction_status based on the +starting character of InvoiceNo column. +transaction_status values are Cancelled or Completed. 
+""" + import pickle import os import numpy as np # Determine the absolute path of the project directory PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed','after_duplicates.pkl') +OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed', 'after_transaction_status.pkl') -INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_duplicates.pkl') -OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl') - -def handle_transaction_status(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH): +def handle_transaction_status(input_pickle_path=INPUT_PICKLE_PATH, + output_pickle_path=OUTPUT_PICKLE_PATH): """ Load the DataFrame from the input pickle, add a 'transaction_status' column to indicate whether the transaction was 'Cancelled' or 'Completed'. @@ -21,24 +28,20 @@ def handle_transaction_status(input_pickle_path=INPUT_PICKLE_PATH, output_pickle :raises KeyError: If the 'InvoiceNo' column doesn't exist in the dataframe. 
""" - # Load DataFrame from input pickle if os.path.exists(input_pickle_path): with open(input_pickle_path, "rb") as file: df = pickle.load(file) else: raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}") - # Check if 'InvoiceNo' column exists if 'InvoiceNo' not in df.columns: raise KeyError("The input dataframe does not contain an 'InvoiceNo' column.") - # Add the 'Transaction_Status' column - df['transaction_status'] = np.where(df['InvoiceNo'].astype(str).str.startswith('C'), 'Cancelled', 'Completed') - + df['transaction_status'] = np.where(df['InvoiceNo'].astype(str).str.startswith('C'), + 'Cancelled', 'Completed') # Save the data to output pickle with open(output_pickle_path, "wb") as file: pickle.dump(df, file) - print(f"Data saved to {output_pickle_path}.") return output_pickle_path diff --git a/test/test_anomaly_code_handler.py b/test/test_anomaly_code_handler.py index 0bf3f9b..8058a71 100644 --- a/test/test_anomaly_code_handler.py +++ b/test/test_anomaly_code_handler.py @@ -1,30 +1,33 @@ -import pytest +""" +A module for testing anomaly_code_handler module. +""" + import os import pickle from src.anomaly_code_handler import handle_anomalous_codes # Determine the absolute path of the project directory PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_transaction_status.pkl') -OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_anomaly_code.pkl') +INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed','after_transaction_status.pkl') +OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed', 'after_anomaly_code.pkl') def test_handle_anomalous_codes(): """ Test that handle_anomalous_codes correctly removes rows with stock codes that have 0 or 1 numeric characters. 
""" - result = handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH) - assert result == OUTPUT_PICKLE_PATH, f"Expected {OUTPUT_PICKLE_PATH}, but got {result}." - + result = handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH, + output_pickle_path=OUTPUT_PICKLE_PATH) + assert result == OUTPUT_PICKLE_PATH,\ + f"Expected {OUTPUT_PICKLE_PATH}, but got {result}." # Load the output pickle file to check the 'StockCode' column with open(OUTPUT_PICKLE_PATH, "rb") as file: df = pickle.load(file) - # Check for stock codes with 0 or 1 numeric characters unique_stock_codes = df['StockCode'].unique() - anomalous_stock_codes = [code for code in unique_stock_codes if sum(c.isdigit() for c in str(code)) in (0, 1)] - + anomalous_stock_codes = [code for code in unique_stock_codes if + sum(c.isdigit() for c in str(code)) in (0, 1)] # Assert that no such anomalous stock codes exist assert len(anomalous_stock_codes) == 0, "Anomalous stock codes found in the dataframe." - diff --git a/test/test_data_loader.py b/test/test_data_loader.py index 8cfba50..b580cc5 100644 --- a/test/test_data_loader.py +++ b/test/test_data_loader.py @@ -1,5 +1,9 @@ -import pytest +""" +Tests for data_loader module. +""" + import os +import pytest from src.data_loader import load_data # Determine the absolute path of the project directory @@ -15,25 +19,22 @@ # """ # # Ensure the pickle file exists for this test # assert os.path.exists(PICKLE_PATH), "Pickle file doesn't exist for testing." - # result = load_data(pickle_path=PICKLE_PATH, csv_path=CSV_PATH) # assert result == PICKLE_PATH, f"Expected {PICKLE_PATH}, but got {result}." def test_load_data_from_excel(): """ - Test that load_data correctly loads data from Excel and saves as pickle when pickle doesn't exist. + Test that load_data correctly loads data from Excel and saves as pickle + when pickle doesn't exist. 
""" # Temporarily rename the pickle to simulate its absence if os.path.exists(PICKLE_PATH): os.rename(PICKLE_PATH, PICKLE_PATH + ".bak") - result = load_data(pickle_path=PICKLE_PATH, excel_path=EXCEL_PATH) assert result == PICKLE_PATH, f"Expected {PICKLE_PATH}, but got {result}." - # Rename pickle back to its original name if os.path.exists(PICKLE_PATH + ".bak"): os.rename(PICKLE_PATH + ".bak", PICKLE_PATH) - def test_load_data_no_files(): """ Test that load_data raises an error when neither pickle nor Excel exists. @@ -43,13 +44,10 @@ def test_load_data_no_files(): os.rename(PICKLE_PATH, PICKLE_PATH + ".bak") if os.path.exists(EXCEL_PATH): os.rename(EXCEL_PATH, EXCEL_PATH + ".bak") - with pytest.raises(FileNotFoundError): load_data(pickle_path=PICKLE_PATH, excel_path=EXCEL_PATH) - # Rename files back to their original names if os.path.exists(PICKLE_PATH + ".bak"): os.rename(PICKLE_PATH + ".bak", PICKLE_PATH) if os.path.exists(EXCEL_PATH + ".bak"): os.rename(EXCEL_PATH + ".bak", EXCEL_PATH) - diff --git a/test/test_duplicates_handler.py b/test/test_duplicates_handler.py index 47d6522..7f66c34 100644 --- a/test/test_duplicates_handler.py +++ b/test/test_duplicates_handler.py @@ -1,14 +1,18 @@ -import pytest +""" +A test for duplicates_handler module. 
+""" + import os -import pandas as pd import pickle +import pytest from src.duplicates_handler import remove_duplicates # Determine the absolute path of the project directory PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_missing_values.pkl') -OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_duplicates.pkl') +INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed','after_missing_values.pkl') +OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed', 'after_duplicates.pkl') def test_remove_duplicates_no_input_file(): """ @@ -17,24 +21,24 @@ def test_remove_duplicates_no_input_file(): # Temporarily rename the input file if os.path.exists(INPUT_PICKLE_PATH): os.rename(INPUT_PICKLE_PATH, INPUT_PICKLE_PATH + ".bak") - with pytest.raises(FileNotFoundError): - remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH) - + remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, + output_pickle_path=OUTPUT_PICKLE_PATH) # Rename input file back to its original name if os.path.exists(INPUT_PICKLE_PATH + ".bak"): os.rename(INPUT_PICKLE_PATH + ".bak", INPUT_PICKLE_PATH) def test_remove_duplicates(): """ - Test that remove_duplicates correctly removes duplicates and saves to the output pickle. + Test that remove_duplicates correctly removes duplicates and + saves to the output pickle. """ - result = remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH) + result = remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, + output_pickle_path=OUTPUT_PICKLE_PATH) assert result == OUTPUT_PICKLE_PATH, f"Expected {OUTPUT_PICKLE_PATH}, but got {result}." 
- # Check if duplicates are truly removed with open(OUTPUT_PICKLE_PATH, "rb") as file: df = pickle.load(file) columns_to_check = ['InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Quantity'] - assert not df.duplicated(subset=columns_to_check).any(), "There are still duplicates in the dataframe." - + assert not df.duplicated(subset=columns_to_check).any(),\ + "There are still duplicates in the dataframe." diff --git a/test/test_missing_values_handler.py b/test/test_missing_values_handler.py index 106f943..fb9f5e2 100644 --- a/test/test_missing_values_handler.py +++ b/test/test_missing_values_handler.py @@ -1,20 +1,25 @@ +""" +A test module for testing missing_values_handler module. +""" + +import os import pytest from src.missing_values_handler import handle_missing -import os # Determine the absolute path of the project directory PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'raw_data.pkl') -OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_missing_values.pkl') +INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed', 'raw_data.pkl') +OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed','after_missing_values.pkl') def test_handle_missing_success(): """ Test successful removal of rows with missing values and saving of the dataframe. """ - result = handle_missing(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH) + result = handle_missing(input_pickle_path=INPUT_PICKLE_PATH, + output_pickle_path=OUTPUT_PICKLE_PATH) assert result == OUTPUT_PICKLE_PATH, f"Expected {OUTPUT_PICKLE_PATH}, but got {result}." - def test_handle_missing_file_not_found(): """ Test that handle_missing raises an error when the input pickle doesn't exist. 
@@ -22,10 +27,9 @@ def test_handle_missing_file_not_found(): # Rename the input pickle temporarily to simulate its absence if os.path.exists(INPUT_PICKLE_PATH): os.rename(INPUT_PICKLE_PATH, INPUT_PICKLE_PATH + ".bak") - with pytest.raises(FileNotFoundError): - handle_missing(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH) - + handle_missing(input_pickle_path=INPUT_PICKLE_PATH, + output_pickle_path=OUTPUT_PICKLE_PATH) # Rename the input pickle back to its original name if os.path.exists(INPUT_PICKLE_PATH + ".bak"): os.rename(INPUT_PICKLE_PATH + ".bak", INPUT_PICKLE_PATH) diff --git a/test/test_transaction_status_handler.py b/test/test_transaction_status_handler.py index aacfb38..83df9cb 100644 --- a/test/test_transaction_status_handler.py +++ b/test/test_transaction_status_handler.py @@ -1,30 +1,33 @@ -import pytest +""" +A module for testing transaction_status_handler module. +""" + import os import pickle from src.transaction_status_handler import handle_transaction_status # Determine the absolute path of the project directory PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_duplicates.pkl') -OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl') +INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed','after_duplicates.pkl') +OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', + 'processed', 'after_transaction_status.pkl') def test_handle_transaction_status(): """ Test that handle_transaction_status correctly adds the 'transaction_status' column based on the 'InvoiceNo' and ensures statuses are 'Cancelled' or 'Completed'. 
""" - result = handle_transaction_status(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH) + result = handle_transaction_status(input_pickle_path=INPUT_PICKLE_PATH, + output_pickle_path=OUTPUT_PICKLE_PATH) assert result == OUTPUT_PICKLE_PATH, f"Expected {OUTPUT_PICKLE_PATH}, but got {result}." - # Load the output pickle file and check the 'transaction_status' column with open(OUTPUT_PICKLE_PATH, "rb") as file: df = pickle.load(file) - # Assert that 'transaction_status' column exists - assert 'transaction_status' in df.columns, "'transaction_status' column not found in the dataframe." - + assert 'transaction_status' in df.columns,\ + "'transaction_status' column not found in the dataframe." # Check if all values in 'transaction_status' are either 'Cancelled' or 'Completed' unique_statuses = df['transaction_status'].unique() - assert set(unique_statuses) == {'Cancelled', 'Completed'}, "Unexpected values found in 'transaction_status' column." - + assert set(unique_statuses) == {'Cancelled', 'Completed'},\ + "Unexpected values found in 'transaction_status' column."