Skip to content

Commit

Permalink
final cleaning to follow pylint rules for spacing and docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
AshyScripts committed Oct 25, 2023
1 parent ac561f8 commit bb78074
Show file tree
Hide file tree
Showing 10 changed files with 130 additions and 116 deletions.
27 changes: 15 additions & 12 deletions src/anomaly_code_handler.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,43 @@
import pandas as pd
"""
A module for removing anomalies from StockCode column if they have 0 or 1
digit characters since the normal values are 5 or 6 digits.
"""

import pickle
import os

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                 'processed', 'after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                  'processed', 'after_anomaly_code.pkl')

def handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                           output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Remove rows whose 'StockCode' has 0 or 1 digit characters.

    Loads the DataFrame from the input pickle, drops the anomalous rows
    (normal stock codes have 5 or 6 digits), saves the DataFrame to the
    output pickle, and returns its path.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    """
    # Load DataFrame from input pickle
    if os.path.exists(input_pickle_path):
        with open(input_pickle_path, "rb") as file:
            df = pickle.load(file)
    else:
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")

    # Stock codes with 0 or 1 numeric characters are anomalies.
    unique_stock_codes = df['StockCode'].unique()
    anomalous_stock_codes = [code for code in unique_stock_codes if
                             sum(c.isdigit() for c in str(code)) in (0, 1)]
    # Removing rows with these anomalous stock codes
    df = df[~df['StockCode'].isin(anomalous_stock_codes)]

    # Save the data to output pickle
    with open(output_pickle_path, "wb") as file:
        pickle.dump(df, file)

    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
32 changes: 12 additions & 20 deletions src/data_loader.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,47 @@
"""
Module to handle the loading of e-commerce dataset from either pickle or Excel file format.
"""

import pickle
import os
import pandas as pd
import pickle

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Use the project directory to construct paths to other directories
DEFAULT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                   'processed', 'raw_data.pkl')
DEFAULT_EXCEL_PATH = os.path.join(PROJECT_DIR, 'data', 'Online Retail.xlsx')

def load_data(pickle_path=DEFAULT_PICKLE_PATH, excel_path=DEFAULT_EXCEL_PATH):
    """
    Load the e-commerce dataset.

    Tries the pickle file first; if it does not exist, falls back to the
    Excel file. Regardless of the source, the loaded data is (re)saved as
    a pickle for future use and the path to that pickle is returned.

    :param pickle_path: Path to the pickle file.
    :param excel_path: Path to the Excel file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If neither source file exists.
    """
    # Check if pickle file exists
    if os.path.exists(pickle_path):
        with open(pickle_path, "rb") as file:
            df = pickle.load(file)
        print(f"Data loaded successfully from {pickle_path}.")

    # If pickle doesn't exist, load from Excel
    elif os.path.exists(excel_path):
        df = pd.read_excel(excel_path)
        print(f"Data loaded from {excel_path}.")

    else:
        error_message = f"No data found in the specified paths: {pickle_path} or {excel_path}"
        print(error_message)
        raise FileNotFoundError(error_message)

    # Re-save so future runs hit the fast pickle path
    with open(pickle_path, "wb") as file:
        pickle.dump(df, file)

    print(f"Data saved to {pickle_path} for future use.")
    return pickle_path







23 changes: 14 additions & 9 deletions src/duplicates_handler.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
import pandas as pd
"""
A module for removing duplicates in dataset based on subset of
following columns:
- InvoiceNo
- StockCode
- Description
- CustomerID
- Quantity
"""

import pickle
import os

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_missing_values.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_duplicates.pkl')
INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
'processed','after_missing_values.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
'processed', 'after_duplicates.pkl')

def remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH):
"""
Expand All @@ -17,23 +27,18 @@ def remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OU
:param output_pickle_path: Path to the output pickle file.
:return: Path to the saved pickle file.
"""

# Load DataFrame from input pickle
if os.path.exists(input_pickle_path):
with open(input_pickle_path, "rb") as file:
df = pickle.load(file)
else:
raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")

# Columns to check for duplicates
columns_to_check = ['InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Quantity']

# Drop duplicates
df = df.drop_duplicates(subset=columns_to_check)

# Save the data to output pickle
with open(output_pickle_path, "wb") as file:
pickle.dump(df, file)

print(f"Data saved to {output_pickle_path}.")
return output_pickle_path
21 changes: 10 additions & 11 deletions src/missing_values_handler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import pickle
import pandas as pd
"""
A module for removing missing values from the dataset based on the CustomerID
and Description columns.
"""

import os
# ======================== Missing Values ========================
import pickle

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
Expand All @@ -11,36 +14,32 @@

def handle_missing(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Drop rows with missing 'CustomerID' or 'Description' values.

    Loads the DataFrame from the input pickle, removes rows that are
    missing either column, verifies that no missing values remain anywhere
    in the frame, then saves the DataFrame to the output pickle and
    returns its path.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    :raises ValueError: If missing values remain after the drop.
    """
    # Load DataFrame from input pickle
    if os.path.exists(input_pickle_path):
        with open(input_pickle_path, "rb") as file:
            df = pickle.load(file)
    else:
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")

    # Remove rows with missing values in 'CustomerID' and 'Description'
    df = df.dropna(subset=['CustomerID', 'Description'])

    # Guard: compute the remaining-missing count once and fail loudly if
    # any other column still contains NaNs.
    missing_count = df.isna().sum().sum()
    if missing_count != 0:
        message = f"There are {missing_count} missing values left in the dataframe."
        print(message)
        raise ValueError(message)

    # Save the data to output pickle
    with open(output_pickle_path, "wb") as file:
        pickle.dump(df, file)

    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path

25 changes: 14 additions & 11 deletions src/transaction_status_handler.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
import pandas as pd
"""
A module for adding a new column named transaction_status based on the
starting character of InvoiceNo column.
transaction_status values are Cancelled or Completed.
"""

import pickle
import os
import numpy as np

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                 'processed', 'after_duplicates.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                  'processed', 'after_transaction_status.pkl')

def handle_transaction_status(input_pickle_path=INPUT_PICKLE_PATH,
                              output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Add a 'transaction_status' column derived from 'InvoiceNo'.

    Invoice numbers starting with 'C' mark cancelled transactions; all
    others are completed. The updated DataFrame is saved to the output
    pickle and its path is returned.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    :raises KeyError: If the 'InvoiceNo' column doesn't exist in the dataframe.
    """
    # Load DataFrame from input pickle
    if os.path.exists(input_pickle_path):
        with open(input_pickle_path, "rb") as file:
            df = pickle.load(file)
    else:
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")

    # Check if 'InvoiceNo' column exists
    if 'InvoiceNo' not in df.columns:
        raise KeyError("The input dataframe does not contain an 'InvoiceNo' column.")

    # 'C'-prefixed invoices are cancellations
    df['transaction_status'] = np.where(df['InvoiceNo'].astype(str).str.startswith('C'),
                                        'Cancelled', 'Completed')

    # Save the data to output pickle
    with open(output_pickle_path, "wb") as file:
        pickle.dump(df, file)

    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
25 changes: 14 additions & 11 deletions test/test_anomaly_code_handler.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
import pytest
"""
A module for testing anomaly_code_handler module.
"""

import os
import pickle
from src.anomaly_code_handler import handle_anomalous_codes

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_anomaly_code.pkl')
INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
'processed','after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
'processed', 'after_anomaly_code.pkl')

def test_handle_anomalous_codes():
    """
    Test that handle_anomalous_codes correctly removes rows with stock codes
    that have 0 or 1 numeric characters.
    """
    result = handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                                    output_pickle_path=OUTPUT_PICKLE_PATH)
    assert result == OUTPUT_PICKLE_PATH, \
        f"Expected {OUTPUT_PICKLE_PATH}, but got {result}."

    # Load the output pickle file to check the 'StockCode' column
    with open(OUTPUT_PICKLE_PATH, "rb") as file:
        df = pickle.load(file)

    # Check for stock codes with 0 or 1 numeric characters
    unique_stock_codes = df['StockCode'].unique()
    anomalous_stock_codes = [code for code in unique_stock_codes if
                             sum(c.isdigit() for c in str(code)) in (0, 1)]
    # Assert that no such anomalous stock codes exist
    assert not anomalous_stock_codes, "Anomalous stock codes found in the dataframe."

16 changes: 7 additions & 9 deletions test/test_data_loader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import pytest
"""
Tests for data_loader module.
"""

import os
import pytest
from src.data_loader import load_data

# Determine the absolute path of the project directory
Expand All @@ -15,25 +19,22 @@
# """
# # Ensure the pickle file exists for this test
# assert os.path.exists(PICKLE_PATH), "Pickle file doesn't exist for testing."

# result = load_data(pickle_path=PICKLE_PATH, csv_path=CSV_PATH)
# assert result == PICKLE_PATH, f"Expected {PICKLE_PATH}, but got {result}."

def test_load_data_from_excel():
    """
    Test that load_data correctly loads data from Excel and saves as pickle
    when pickle doesn't exist.
    """
    # Temporarily rename the pickle to simulate its absence
    if os.path.exists(PICKLE_PATH):
        os.rename(PICKLE_PATH, PICKLE_PATH + ".bak")
    try:
        result = load_data(pickle_path=PICKLE_PATH, excel_path=EXCEL_PATH)
        assert result == PICKLE_PATH, f"Expected {PICKLE_PATH}, but got {result}."
    finally:
        # Always restore the pickle, even if load_data or the assert fails
        if os.path.exists(PICKLE_PATH + ".bak"):
            os.rename(PICKLE_PATH + ".bak", PICKLE_PATH)

def test_load_data_no_files():
    """
    Test that load_data raises an error when neither pickle nor Excel exists.
    """
    # Temporarily rename both files to simulate their absence
    if os.path.exists(PICKLE_PATH):
        os.rename(PICKLE_PATH, PICKLE_PATH + ".bak")
    if os.path.exists(EXCEL_PATH):
        os.rename(EXCEL_PATH, EXCEL_PATH + ".bak")
    try:
        with pytest.raises(FileNotFoundError):
            load_data(pickle_path=PICKLE_PATH, excel_path=EXCEL_PATH)
    finally:
        # Always restore the files, even if the assertion fails
        if os.path.exists(PICKLE_PATH + ".bak"):
            os.rename(PICKLE_PATH + ".bak", PICKLE_PATH)
        if os.path.exists(EXCEL_PATH + ".bak"):
            os.rename(EXCEL_PATH + ".bak", EXCEL_PATH)

Loading

0 comments on commit bb78074

Please sign in to comment.