Skip to content

Commit

Permalink
Merge pull request #1 from Thomas-George-T/feature_ashkan
Browse files Browse the repository at this point in the history
Feature ashkan
  • Loading branch information
AshyScripts authored Oct 25, 2023
2 parents 8e9628d + 6243313 commit 95c4d19
Show file tree
Hide file tree
Showing 19 changed files with 492 additions and 3 deletions.
68 changes: 66 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,69 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
lab01
.pytest_cache
__pycache__/
*.pyo
*.pyc
*.pyc

# C extensions
*.so

# Distribution / packaging
.Python
maildir/
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
maildir

# Notebooks
notebooks/

.DS_Store
Binary file added data/after_missing_values.pkl
Binary file not shown.
Binary file added data/processed/after_anomaly_code.pkl
Binary file not shown.
Binary file added data/processed/after_duplicates.pkl
Binary file not shown.
Binary file added data/processed/after_missing_values.pkl
Binary file not shown.
Binary file added data/processed/after_transaction_status.pkl
Binary file not shown.
Binary file added data/processed/raw_data.pkl
Binary file not shown.
Binary file added data/raw_data.pkl
Binary file not shown.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ ipykernel
mlflow
requests
pytest-mock
pytest-pylint
pytest-pylint
openpyxl
43 changes: 43 additions & 0 deletions src/anomaly_code_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
A module for removing anomalies from StockCode column if they have 0 or 1
digit characters since the normal values are 5 or 6 digits.
"""

import pickle
import os

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_anomaly_code.pkl')


def handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                           output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Drop rows whose 'StockCode' contains fewer than two numeric characters.

    The DataFrame is read from ``input_pickle_path``, filtered, and written
    to ``output_pickle_path``.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    """
    # Guard clause: fail fast when the upstream pipeline stage is missing.
    if not os.path.exists(input_pickle_path):
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    with open(input_pickle_path, "rb") as src:
        df = pickle.load(src)

    def _digit_count(code):
        # Count numeric characters in the (stringified) stock code.
        return sum(ch.isdigit() for ch in str(code))

    # Normal stock codes have 5-6 digits; 0 or 1 digits marks an anomaly.
    anomalous = {code for code in df['StockCode'].unique() if _digit_count(code) <= 1}
    df = df[~df['StockCode'].isin(anomalous)]

    with open(output_pickle_path, "wb") as sink:
        pickle.dump(df, sink)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
47 changes: 47 additions & 0 deletions src/data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Module to handle the loading of e-commerce dataset from either pickle or Excel file format.
"""

import pickle
import os
import pandas as pd

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Use the project directory to construct paths to other directories
DEFAULT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'raw_data.pkl')
DEFAULT_EXCEL_PATH = os.path.join(PROJECT_DIR, 'data', 'Online Retail.xlsx')


def load_data(pickle_path=DEFAULT_PICKLE_PATH, excel_path=DEFAULT_EXCEL_PATH):
    """
    Load the e-commerce dataset, preferring the pickle over the Excel source.

    The pickle file is tried first; if it is absent the Excel file is read
    instead. Whatever was loaded is (re-)written to ``pickle_path`` so later
    runs can skip the slow Excel parse.

    :param pickle_path: Path to the pickle file.
    :param excel_path: Path to the Excel file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If neither source file exists.
    """
    if os.path.exists(pickle_path):
        # Fast path: a cached pickle from an earlier run.
        with open(pickle_path, "rb") as src:
            df = pickle.load(src)
        print(f"Data loaded successfully from {pickle_path}.")
    elif os.path.exists(excel_path):
        # Slow path: parse the original Excel workbook.
        df = pd.read_excel(excel_path)
        print(f"Data loaded from {excel_path}.")
    else:
        error_message = f"No data found in the specified paths: {pickle_path} or {excel_path}"
        print(error_message)
        raise FileNotFoundError(error_message)

    # Always (re-)cache so future loads take the fast path.
    with open(pickle_path, "wb") as sink:
        pickle.dump(df, sink)
    print(f"Data saved to {pickle_path} for future use.")
    return pickle_path
44 changes: 44 additions & 0 deletions src/duplicates_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
A module for removing duplicates in dataset based on subset of
following columns:
- InvoiceNo
- StockCode
- Description
- CustomerID
- Quantity
"""

import pickle
import os

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_missing_values.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_duplicates.pkl')


def remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Drop duplicate rows from the pickled DataFrame.

    Rows are considered duplicates when they agree on InvoiceNo, StockCode,
    Description, CustomerID and Quantity; the first occurrence is kept.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    """
    # Guard clause: fail fast when the upstream pipeline stage is missing.
    if not os.path.exists(input_pickle_path):
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    with open(input_pickle_path, "rb") as src:
        df = pickle.load(src)

    # Duplicate detection is limited to these key columns.
    key_columns = ['InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Quantity']
    deduplicated = df.drop_duplicates(subset=key_columns)

    with open(output_pickle_path, "wb") as sink:
        pickle.dump(deduplicated, sink)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
45 changes: 45 additions & 0 deletions src/missing_values_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
A module for removing missing values from dataset based on CustomerID
and Description column.
"""

import os
import pickle

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','raw_data.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed','after_missing_values.pkl')

def handle_missing(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Load the DataFrame from the input pickle,
    remove rows with missing values in 'CustomerID' and 'Description' columns.
    Then, check if there are any missing values left in the dataframe.
    If there are, raise a ValueError. Finally,
    save the DataFrame back to a pickle and return its path.
    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    :raises ValueError: If missing values remain after dropping rows.
    """
    # Load DataFrame from input pickle
    if os.path.exists(input_pickle_path):
        with open(input_pickle_path, "rb") as file:
            df = pickle.load(file)
    else:
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    # Remove rows with missing values in 'CustomerID' and 'Description'
    df = df.dropna(subset=['CustomerID', 'Description'])
    # Count remaining missing values once (the original scanned the frame
    # twice: once for the check and again for the message).
    missing_count = df.isna().sum().sum()
    if missing_count != 0:
        message = f"There are {missing_count} missing values left in the dataframe."
        print(message)
        raise ValueError(message)
    # Save the data to output pickle
    with open(output_pickle_path, "wb") as file:
        pickle.dump(df, file)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
47 changes: 47 additions & 0 deletions src/transaction_status_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
A module for adding a new column named transaction_status based on the
starting character of InvoiceNo column.
transaction_status values are Cancelled or Completed.
"""

import pickle
import os
import numpy as np

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_duplicates.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl')


def handle_transaction_status(input_pickle_path=INPUT_PICKLE_PATH,
                              output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Tag each row as a 'Cancelled' or 'Completed' transaction.

    A new 'transaction_status' column is derived from 'InvoiceNo': invoices
    whose string form starts with 'C' are 'Cancelled', all others are
    'Completed'. The result is pickled to ``output_pickle_path``.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If the input pickle does not exist.
    :raises KeyError: If the 'InvoiceNo' column doesn't exist in the dataframe.
    """
    # Guard clause: fail fast when the upstream pipeline stage is missing.
    if not os.path.exists(input_pickle_path):
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    with open(input_pickle_path, "rb") as src:
        df = pickle.load(src)

    if 'InvoiceNo' not in df.columns:
        raise KeyError("The input dataframe does not contain an 'InvoiceNo' column.")

    # Cancelled invoices are prefixed with 'C'; cast to str first because
    # completed invoice numbers may be stored as integers.
    cancelled_mask = df['InvoiceNo'].astype(str).str.startswith('C')
    df['transaction_status'] = np.where(cancelled_mask, 'Cancelled', 'Completed')

    with open(output_pickle_path, "wb") as sink:
        pickle.dump(df, sink)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
33 changes: 33 additions & 0 deletions test/test_anomaly_code_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
A module for testing anomaly_code_handler module.
"""

import os
import pickle
from src.anomaly_code_handler import handle_anomalous_codes

# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_anomaly_code.pkl')


def test_handle_anomalous_codes():
    """
    Verify that handle_anomalous_codes returns the expected output path and
    that no stock code with 0 or 1 numeric characters survives the filter.
    """
    output_path = handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                                         output_pickle_path=OUTPUT_PICKLE_PATH)
    assert output_path == OUTPUT_PICKLE_PATH,\
        f"Expected {OUTPUT_PICKLE_PATH}, but got {output_path}."
    # Re-load the produced pickle and scan 'StockCode' for leftovers.
    with open(OUTPUT_PICKLE_PATH, "rb") as src:
        df = pickle.load(src)
    leftovers = [code for code in df['StockCode'].unique()
                 if sum(ch.isdigit() for ch in str(code)) <= 1]
    assert not leftovers, "Anomalous stock codes found in the dataframe."
Loading

0 comments on commit 95c4d19

Please sign in to comment.