Skip to content

Commit

Permalink
Merge pull request #1 from Thomas-George-T/feature_ashkan
Browse files Browse the repository at this point in the history
Feature ashkan
  • Loading branch information
AshyScripts authored Oct 25, 2023
2 parents 8e9628d + 6243313 commit 95c4d19
Show file tree
Hide file tree
Showing 19 changed files with 492 additions and 3 deletions.
68 changes: 66 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,69 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
lab01
.pytest_cache
__pycache__/
*.pyo
*.pyc
*.pyc

# C extensions
*.so

# Distribution / packaging
.Python
maildir/
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
maildir

# Notebooks
notebooks/

.DS_Store
Binary file added data/after_missing_values.pkl
Binary file not shown.
Binary file added data/processed/after_anomaly_code.pkl
Binary file not shown.
Binary file added data/processed/after_duplicates.pkl
Binary file not shown.
Binary file added data/processed/after_missing_values.pkl
Binary file not shown.
Binary file added data/processed/after_transaction_status.pkl
Binary file not shown.
Binary file added data/processed/raw_data.pkl
Binary file not shown.
Binary file added data/raw_data.pkl
Binary file not shown.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ ipykernel
mlflow
requests
pytest-mock
pytest-pylint
pytest-pylint
openpyxl
43 changes: 43 additions & 0 deletions src/anomaly_code_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
A module for removing anomalies from StockCode column if they have 0 or 1
digit characters since the normal values are 5 or 6 digits.
"""

import pickle
import os

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_anomaly_code.pkl')


def handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                           output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Drop rows whose 'StockCode' contains fewer than two numeric characters.

    The DataFrame is read from ``input_pickle_path``, filtered, and written
    to ``output_pickle_path``.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    """
    # Guard clause: fail fast when the upstream pipeline stage is missing.
    if not os.path.exists(input_pickle_path):
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    with open(input_pickle_path, "rb") as src:
        df = pickle.load(src)

    def _digit_count(code):
        # Count numeric characters in the (stringified) stock code.
        return sum(ch.isdigit() for ch in str(code))

    # Normal stock codes have 5-6 digits; 0 or 1 digits marks an anomaly.
    anomalous = {code for code in df['StockCode'].unique() if _digit_count(code) <= 1}
    df = df[~df['StockCode'].isin(anomalous)]

    with open(output_pickle_path, "wb") as sink:
        pickle.dump(df, sink)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
47 changes: 47 additions & 0 deletions src/data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Module to handle the loading of e-commerce dataset from either pickle or Excel file format.
"""

import pickle
import os
import pandas as pd

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Use the project directory to construct paths to other directories
DEFAULT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'raw_data.pkl')
DEFAULT_EXCEL_PATH = os.path.join(PROJECT_DIR, 'data', 'Online Retail.xlsx')


def load_data(pickle_path=DEFAULT_PICKLE_PATH, excel_path=DEFAULT_EXCEL_PATH):
    """
    Load the e-commerce dataset, preferring the pickle over the Excel source.

    The pickle file is tried first; if it is absent the Excel file is read
    instead. Whatever was loaded is (re-)written to ``pickle_path`` so later
    runs can skip the slow Excel parse.

    :param pickle_path: Path to the pickle file.
    :param excel_path: Path to the Excel file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If neither source file exists.
    """
    if os.path.exists(pickle_path):
        # Fast path: a cached pickle from an earlier run.
        with open(pickle_path, "rb") as src:
            df = pickle.load(src)
        print(f"Data loaded successfully from {pickle_path}.")
    elif os.path.exists(excel_path):
        # Slow path: parse the original Excel workbook.
        df = pd.read_excel(excel_path)
        print(f"Data loaded from {excel_path}.")
    else:
        error_message = f"No data found in the specified paths: {pickle_path} or {excel_path}"
        print(error_message)
        raise FileNotFoundError(error_message)

    # Always (re-)cache so future loads take the fast path.
    with open(pickle_path, "wb") as sink:
        pickle.dump(df, sink)
    print(f"Data saved to {pickle_path} for future use.")
    return pickle_path
44 changes: 44 additions & 0 deletions src/duplicates_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
A module for removing duplicates in dataset based on subset of
following columns:
- InvoiceNo
- StockCode
- Description
- CustomerID
- Quantity
"""

import pickle
import os

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_missing_values.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_duplicates.pkl')


def remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Drop duplicate rows from the pickled DataFrame.

    Rows are considered duplicates when they agree on InvoiceNo, StockCode,
    Description, CustomerID and Quantity; the first occurrence is kept.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    """
    # Guard clause: fail fast when the upstream pipeline stage is missing.
    if not os.path.exists(input_pickle_path):
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    with open(input_pickle_path, "rb") as src:
        df = pickle.load(src)

    # Duplicate detection is limited to these key columns.
    key_columns = ['InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Quantity']
    deduplicated = df.drop_duplicates(subset=key_columns)

    with open(output_pickle_path, "wb") as sink:
        pickle.dump(deduplicated, sink)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
45 changes: 45 additions & 0 deletions src/missing_values_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
A module for removing missing values from dataset based on CustomerID
and Description column.
"""

import os
import pickle

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','raw_data.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_missing_values.pkl')

def handle_missing(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Load the DataFrame from the input pickle,
    remove rows with missing values in 'CustomerID' and 'Description' columns.
    Then, check if there are any missing values left in the dataframe.
    If there are, raise a ValueError. Finally,
    save the DataFrame back to a pickle and return its path.
    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    :raises ValueError: If missing values remain after dropping rows.
    """
    # Load DataFrame from input pickle
    if os.path.exists(input_pickle_path):
        with open(input_pickle_path, "rb") as file:
            df = pickle.load(file)
    else:
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    # Remove rows with missing values in 'CustomerID' and 'Description'
    df = df.dropna(subset=['CustomerID', 'Description'])
    # Count remaining missing values once (the original scanned the frame
    # twice: once for the check and again for the message).
    missing_count = df.isna().sum().sum()
    if missing_count != 0:
        message = f"There are {missing_count} missing values left in the dataframe."
        print(message)
        raise ValueError(message)
    # Save the data to output pickle
    with open(output_pickle_path, "wb") as file:
        pickle.dump(df, file)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
47 changes: 47 additions & 0 deletions src/transaction_status_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
A module for adding a new column named transaction_status based on the
starting character of InvoiceNo column.
transaction_status values are Cancelled or Completed.
"""

import pickle
import os
import numpy as np

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_duplicates.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl')


def handle_transaction_status(input_pickle_path=INPUT_PICKLE_PATH,
                              output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Tag each row as a 'Cancelled' or 'Completed' transaction.

    A new 'transaction_status' column is derived from 'InvoiceNo': invoices
    whose string form starts with 'C' are 'Cancelled', all others are
    'Completed'. The result is pickled to ``output_pickle_path``.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    :raises KeyError: If the 'InvoiceNo' column doesn't exist in the dataframe.
    """
    # Guard clause: fail fast when the upstream pipeline stage is missing.
    if not os.path.exists(input_pickle_path):
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    with open(input_pickle_path, "rb") as src:
        df = pickle.load(src)

    if 'InvoiceNo' not in df.columns:
        raise KeyError("The input dataframe does not contain an 'InvoiceNo' column.")

    # Cancelled invoices are prefixed with 'C'; cast to str first because
    # completed invoice numbers may be stored as integers.
    cancelled_mask = df['InvoiceNo'].astype(str).str.startswith('C')
    df['transaction_status'] = np.where(cancelled_mask, 'Cancelled', 'Completed')

    with open(output_pickle_path, "wb") as sink:
        pickle.dump(df, sink)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
33 changes: 33 additions & 0 deletions test/test_anomaly_code_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
A module for testing anomaly_code_handler module.
"""

import os
import pickle
from src.anomaly_code_handler import handle_anomalous_codes

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_anomaly_code.pkl')


def test_handle_anomalous_codes():
    """
    Verify that handle_anomalous_codes returns the expected output path and
    that no stock code with 0 or 1 numeric characters survives the filter.
    """
    output_path = handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                                         output_pickle_path=OUTPUT_PICKLE_PATH)
    assert output_path == OUTPUT_PICKLE_PATH,\
        f"Expected {OUTPUT_PICKLE_PATH}, but got {output_path}."
    # Re-load the produced pickle and scan 'StockCode' for leftovers.
    with open(OUTPUT_PICKLE_PATH, "rb") as src:
        df = pickle.load(src)
    leftovers = [code for code in df['StockCode'].unique()
                 if sum(ch.isdigit() for ch in str(code)) <= 1]
    assert not leftovers, "Anomalous stock codes found in the dataframe."
Loading

0 comments on commit 95c4d19

Please sign in to comment.