From 85795c9cd5f3ba5cdd9e8836fb233ed4dc1443e3 Mon Sep 17 00:00:00 2001
From: Ashkan Ghanavati <ashkan.ghanavati92@gmail.com>
Date: Wed, 25 Oct 2023 09:21:11 -0400
Subject: [PATCH] delete src/data_preprocessing.py file

---
 src/data_preprocessing.py | 84 ---------------------------------------
 1 file changed, 84 deletions(-)
 delete mode 100644 src/data_preprocessing.py

diff --git a/src/data_preprocessing.py b/src/data_preprocessing.py
deleted file mode 100644
index 1f347d8..0000000
--- a/src/data_preprocessing.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import pandas as pd
-import numpy as np
-
-# ======================== Missing Values ========================
-def remove_and_check_missing(df):
-    """
-    Remove rows with missing values in 'CustomerID' and 'Description' columns.
-    Then, check if there are any missing values left in the dataframe.
-    If there are, raise a MissingValueError.
-    """
-    
-    # Remove rows with missing values in 'CustomerID' and 'Description'
-    df = df.dropna(subset=['CustomerID', 'Description'])
-    
-    # Check if there are any missing values left
-    if df.isna().sum().sum() != 0:
-        missing_count = df.isna().sum().sum()
-        message = f"There are {missing_count} missing values left in the dataframe."
-        print(message)
-        raise ValueError(message)
-    
-    return df
-
-# ======================== Drop Duplicates ========================
-def drop_duplicates(df):
-    """
-    Drop duplicates from the dataframe based on the columns: 
-    'InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Quantity'.
-    
-    Parameters:
-    - df: Input dataframe.
-    
-    Return:
-    - Dataframe with duplicates removed.
-    """
-    
-    columns_to_check = ['InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Quantity']
-    df = df.drop_duplicates(subset=columns_to_check)
-    
-    return df
-
-# ======================== Adding Transaction Status column ========================
-def add_transaction_status(df):
-    """
-    Add a new column 'transaction_status' to the dataframe. 
-    The column indicates whether the transaction was 'Cancelled' or 'Completed' 
-    based on the 'InvoiceNo' column.
-    
-    Parameters:
-    - Input dataframe.
-    
-    Return:
-    - Dataframe with the new 'Transaction_Status' column added.
-    
-    :raises KeyError: If the 'InvoiceNo' column doesn't exist in the dataframe.
-    """
-    
-    # Check if 'InvoiceNo' column exists
-    if 'InvoiceNo' not in df.columns:
-        raise KeyError("The input dataframe does not contain an 'InvoiceNo' column.")
-    
-    # Add the 'Transaction_Status' column
-    df['transaction_status'] = np.where(df['InvoiceNo'].astype(str).str.startswith('C'), 
-                                        'Cancelled', 'Completed')
-    
-    return df
-
-# ======================== Remove Anomaly codes ========================
-def remove_anomalous_stock_codes(df):
-    """
-    Removes rows with stock codes that have 0 or 1 numeric characters.
-
-    :param df: Input dataframe.
-    :return: Dataframe with anomalous stock codes removed.
-    """
-    
-    # Finding the stock codes with 0 and 1 numeric characters
-    unique_stock_codes = df['StockCode'].unique()
-    anomalous_stock_codes = [code for code in unique_stock_codes if sum(c.isdigit() for c in str(code)) in (0, 1)]
-    
-    # Removing rows with these anomalous stock codes
-    df = df[~df['StockCode'].isin(anomalous_stock_codes)]
-    
-    return df