Skip to content

Commit

Permalink
final cleaning to follow pylint rules for spacing and docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
AshyScripts committed Oct 25, 2023
1 parent ac561f8 commit bb78074
Show file tree
Hide file tree
Showing 10 changed files with 130 additions and 116 deletions.
27 changes: 15 additions & 12 deletions src/anomaly_code_handler.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,43 @@
import pandas as pd
"""
A module for removing anomalies from StockCode column if they have 0 or 1
digit characters since the normal values are 5 or 6 digits.
"""

import pickle
import os

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                 'processed', 'after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                  'processed', 'after_anomaly_code.pkl')

def handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                           output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Remove rows whose 'StockCode' has 0 or 1 digit characters.

    Loads the DataFrame from the input pickle, drops the anomalous rows
    (normal stock codes have 5 or 6 digits), saves the DataFrame to the
    output pickle, and returns its path.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    """
    # Load DataFrame from input pickle
    if os.path.exists(input_pickle_path):
        with open(input_pickle_path, "rb") as file:
            df = pickle.load(file)
    else:
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")

    # Stock codes with 0 or 1 numeric characters are anomalies.
    unique_stock_codes = df['StockCode'].unique()
    anomalous_stock_codes = [code for code in unique_stock_codes if
                             sum(c.isdigit() for c in str(code)) in (0, 1)]
    # Removing rows with these anomalous stock codes
    df = df[~df['StockCode'].isin(anomalous_stock_codes)]

    # Save the data to output pickle
    with open(output_pickle_path, "wb") as file:
        pickle.dump(df, file)

    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
32 changes: 12 additions & 20 deletions src/data_loader.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,47 @@
"""
Module to handle the loading of e-commerce dataset from either pickle or Excel file format.
"""

import pickle
import os
import pandas as pd
import pickle

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Use the project directory to construct paths to other directories
DEFAULT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                   'processed', 'raw_data.pkl')
DEFAULT_EXCEL_PATH = os.path.join(PROJECT_DIR, 'data', 'Online Retail.xlsx')

def load_data(pickle_path=DEFAULT_PICKLE_PATH, excel_path=DEFAULT_EXCEL_PATH):
    """
    Load the e-commerce dataset.

    Tries the pickle file first; if it does not exist, falls back to the
    Excel file. Regardless of the source, the loaded data is (re)saved as
    a pickle for future use and the path to that pickle is returned.

    :param pickle_path: Path to the pickle file.
    :param excel_path: Path to the Excel file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If neither source file exists.
    """
    # Check if pickle file exists
    if os.path.exists(pickle_path):
        with open(pickle_path, "rb") as file:
            df = pickle.load(file)
        print(f"Data loaded successfully from {pickle_path}.")

    # If pickle doesn't exist, load from Excel
    elif os.path.exists(excel_path):
        df = pd.read_excel(excel_path)
        print(f"Data loaded from {excel_path}.")

    else:
        error_message = f"No data found in the specified paths: {pickle_path} or {excel_path}"
        print(error_message)
        raise FileNotFoundError(error_message)

    # Re-save so future runs hit the fast pickle path
    with open(pickle_path, "wb") as file:
        pickle.dump(df, file)

    print(f"Data saved to {pickle_path} for future use.")
    return pickle_path







23 changes: 14 additions & 9 deletions src/duplicates_handler.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
import pandas as pd
"""
A module for removing duplicates in dataset based on subset of
following columns:
- InvoiceNo
- StockCode
- Description
- CustomerID
- Quantity
"""

import pickle
import os

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_missing_values.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_duplicates.pkl')
INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
'processed','after_missing_values.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
'processed', 'after_duplicates.pkl')

def remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH):
"""
Expand All @@ -17,23 +27,18 @@ def remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OU
:param output_pickle_path: Path to the output pickle file.
:return: Path to the saved pickle file.
"""

# Load DataFrame from input pickle
if os.path.exists(input_pickle_path):
with open(input_pickle_path, "rb") as file:
df = pickle.load(file)
else:
raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")

# Columns to check for duplicates
columns_to_check = ['InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Quantity']

# Drop duplicates
df = df.drop_duplicates(subset=columns_to_check)

# Save the data to output pickle
with open(output_pickle_path, "wb") as file:
pickle.dump(df, file)

print(f"Data saved to {output_pickle_path}.")
return output_pickle_path
21 changes: 10 additions & 11 deletions src/missing_values_handler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import pickle
import pandas as pd
"""
A module for removing missing values from the dataset based on the CustomerID
and Description columns.
"""

import os
# ======================== Missing Values ========================
import pickle

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
Expand All @@ -11,36 +14,32 @@

def handle_missing(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Drop rows with missing 'CustomerID' or 'Description' values.

    Loads the DataFrame from the input pickle, removes rows that are
    missing either column, verifies that no missing values remain anywhere
    in the frame, then saves the DataFrame to the output pickle and
    returns its path.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    :raises ValueError: If missing values remain after the drop.
    """
    # Load DataFrame from input pickle
    if os.path.exists(input_pickle_path):
        with open(input_pickle_path, "rb") as file:
            df = pickle.load(file)
    else:
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")

    # Remove rows with missing values in 'CustomerID' and 'Description'
    df = df.dropna(subset=['CustomerID', 'Description'])

    # Guard: compute the remaining-missing count once and fail loudly if
    # any other column still contains NaNs.
    missing_count = df.isna().sum().sum()
    if missing_count != 0:
        message = f"There are {missing_count} missing values left in the dataframe."
        print(message)
        raise ValueError(message)

    # Save the data to output pickle
    with open(output_pickle_path, "wb") as file:
        pickle.dump(df, file)

    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path

25 changes: 14 additions & 11 deletions src/transaction_status_handler.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
import pandas as pd
"""
A module for adding a new column named transaction_status based on the
starting character of InvoiceNo column.
transaction_status values are Cancelled or Completed.
"""

import pickle
import os
import numpy as np

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                 'processed', 'after_duplicates.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                  'processed', 'after_transaction_status.pkl')

def handle_transaction_status(input_pickle_path=INPUT_PICKLE_PATH,
                              output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Add a 'transaction_status' column derived from 'InvoiceNo'.

    Invoice numbers starting with 'C' mark cancelled transactions; all
    others are completed. The updated DataFrame is saved to the output
    pickle and its path is returned.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    :raises KeyError: If the 'InvoiceNo' column doesn't exist in the dataframe.
    """
    # Load DataFrame from input pickle
    if os.path.exists(input_pickle_path):
        with open(input_pickle_path, "rb") as file:
            df = pickle.load(file)
    else:
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")

    # Check if 'InvoiceNo' column exists
    if 'InvoiceNo' not in df.columns:
        raise KeyError("The input dataframe does not contain an 'InvoiceNo' column.")

    # 'C'-prefixed invoices are cancellations
    df['transaction_status'] = np.where(df['InvoiceNo'].astype(str).str.startswith('C'),
                                        'Cancelled', 'Completed')

    # Save the data to output pickle
    with open(output_pickle_path, "wb") as file:
        pickle.dump(df, file)

    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
25 changes: 14 additions & 11 deletions test/test_anomaly_code_handler.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
import pytest
"""
A module for testing anomaly_code_handler module.
"""

import os
import pickle
from src.anomaly_code_handler import handle_anomalous_codes

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_anomaly_code.pkl')
INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
'processed','after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
'processed', 'after_anomaly_code.pkl')

def test_handle_anomalous_codes():
    """
    Test that handle_anomalous_codes correctly removes rows with stock codes
    that have 0 or 1 numeric characters.
    """
    result = handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                                    output_pickle_path=OUTPUT_PICKLE_PATH)
    assert result == OUTPUT_PICKLE_PATH, \
        f"Expected {OUTPUT_PICKLE_PATH}, but got {result}."

    # Load the output pickle file to check the 'StockCode' column
    with open(OUTPUT_PICKLE_PATH, "rb") as file:
        df = pickle.load(file)

    # Check for stock codes with 0 or 1 numeric characters
    unique_stock_codes = df['StockCode'].unique()
    anomalous_stock_codes = [code for code in unique_stock_codes if
                             sum(c.isdigit() for c in str(code)) in (0, 1)]
    # Assert that no such anomalous stock codes exist
    assert not anomalous_stock_codes, "Anomalous stock codes found in the dataframe."

16 changes: 7 additions & 9 deletions test/test_data_loader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import pytest
"""
Tests for data_loader module.
"""

import os
import pytest
from src.data_loader import load_data

# Determine the absolute path of the project directory
Expand All @@ -15,25 +19,22 @@
# """
# # Ensure the pickle file exists for this test
# assert os.path.exists(PICKLE_PATH), "Pickle file doesn't exist for testing."

# result = load_data(pickle_path=PICKLE_PATH, csv_path=CSV_PATH)
# assert result == PICKLE_PATH, f"Expected {PICKLE_PATH}, but got {result}."

def test_load_data_from_excel():
    """
    Test that load_data correctly loads data from Excel and saves as pickle
    when pickle doesn't exist.
    """
    # Temporarily rename the pickle to simulate its absence
    if os.path.exists(PICKLE_PATH):
        os.rename(PICKLE_PATH, PICKLE_PATH + ".bak")
    try:
        result = load_data(pickle_path=PICKLE_PATH, excel_path=EXCEL_PATH)
        assert result == PICKLE_PATH, f"Expected {PICKLE_PATH}, but got {result}."
    finally:
        # Always restore the pickle, even if load_data or the assert fails
        if os.path.exists(PICKLE_PATH + ".bak"):
            os.rename(PICKLE_PATH + ".bak", PICKLE_PATH)

def test_load_data_no_files():
    """
    Test that load_data raises an error when neither pickle nor Excel exists.
    """
    # Temporarily rename both files to simulate their absence
    if os.path.exists(PICKLE_PATH):
        os.rename(PICKLE_PATH, PICKLE_PATH + ".bak")
    if os.path.exists(EXCEL_PATH):
        os.rename(EXCEL_PATH, EXCEL_PATH + ".bak")
    try:
        with pytest.raises(FileNotFoundError):
            load_data(pickle_path=PICKLE_PATH, excel_path=EXCEL_PATH)
    finally:
        # Always restore the files, even if the assertion fails
        if os.path.exists(PICKLE_PATH + ".bak"):
            os.rename(PICKLE_PATH + ".bak", PICKLE_PATH)
        if os.path.exists(EXCEL_PATH + ".bak"):
            os.rename(EXCEL_PATH + ".bak", EXCEL_PATH)

Loading

0 comments on commit bb78074

Please sign in to comment.