-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from Thomas-George-T/feature_ashkan
Feature ashkan
- Loading branch information
Showing
19 changed files
with
492 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,69 @@ | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
lab01 | ||
.pytest_cache | ||
__pycache__/ | ||
*.pyo | ||
*.pyc | ||
*.pyc | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
maildir/ | ||
env/ | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*,cover | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
maildir | ||
|
||
# Notebooks | ||
notebooks/ | ||
|
||
*/.DS_Store/* |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,4 +6,5 @@ ipykernel | |
mlflow | ||
requests | ||
pytest-mock | ||
pytest-pylint | ||
pytest-pylint | ||
openpyxl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
""" | ||
A module for removing anomalies from StockCode column if they have 0 or 1 | ||
digit characters since the normal values are 5 or 6 digits. | ||
""" | ||
|
||
import pickle | ||
import os | ||
|
||
# Project root: the parent of the directory that contains this module
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_anomaly_code.pkl')


def handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                           output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Drop rows whose 'StockCode' contains at most one digit character.

    The DataFrame is unpickled from ``input_pickle_path``, filtered, and
    pickled again to ``output_pickle_path``.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If ``input_pickle_path`` does not exist.
    """
    # Guard clause: bail out early when there is nothing to read
    if not os.path.exists(input_pickle_path):
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    with open(input_pickle_path, "rb") as source:
        frame = pickle.load(source)

    def _digit_count(code):
        # Codes may be non-string objects, so compare on their text form
        return sum(1 for char in str(code) if char.isdigit())

    # Inspect each distinct code once, then filter all rows in one pass
    flagged_codes = [code for code in frame['StockCode'].unique()
                     if _digit_count(code) <= 1]
    keep_mask = ~frame['StockCode'].isin(flagged_codes)
    frame = frame[keep_mask]

    # Persist the filtered frame for the next pipeline step
    with open(output_pickle_path, "wb") as sink:
        pickle.dump(frame, sink)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
""" | ||
Module to handle the loading of e-commerce dataset from either pickle or Excel file format. | ||
""" | ||
|
||
import pickle | ||
import os | ||
import pandas as pd | ||
|
||
# Project root: the parent of the directory that contains this module
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Default data locations, relative to the project root
DEFAULT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                   'processed', 'raw_data.pkl')
DEFAULT_EXCEL_PATH = os.path.join(PROJECT_DIR, 'data', 'Online Retail.xlsx')


def load_data(pickle_path=DEFAULT_PICKLE_PATH, excel_path=DEFAULT_EXCEL_PATH):
    """
    Load the e-commerce dataset and cache it as a pickle.

    The pickle at ``pickle_path`` is preferred; if it does not exist the
    Excel file at ``excel_path`` is read instead. Regardless of the source,
    the loaded data is (re-)written to ``pickle_path`` and that path is
    returned.

    :param pickle_path: Path to the pickle file.
    :param excel_path: Path to the Excel file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If neither ``pickle_path`` nor ``excel_path``
        exists.
    """
    if os.path.exists(pickle_path):
        with open(pickle_path, "rb") as file:
            df = pickle.load(file)
        print(f"Data loaded successfully from {pickle_path}.")
    # If pickle doesn't exist, load from Excel
    elif os.path.exists(excel_path):
        df = pd.read_excel(excel_path)
        print(f"Data loaded from {excel_path}.")
    else:
        error_message = f"No data found in the specified paths: {pickle_path} or {excel_path}"
        print(error_message)
        raise FileNotFoundError(error_message)
    # When loading from Excel the pickle's directory may not exist yet;
    # create it so the write below does not fail. A bare filename has an
    # empty dirname, which os.makedirs rejects, hence the guard.
    target_dir = os.path.dirname(pickle_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    # Save the data to pickle for future use (or re-save it if loaded from existing pickle)
    with open(pickle_path, "wb") as file:
        pickle.dump(df, file)
    print(f"Data saved to {pickle_path} for future use.")
    return pickle_path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
""" | ||
A module for removing duplicates in dataset based on subset of | ||
following columns: | ||
- InvoiceNo | ||
- StockCode | ||
- Description | ||
- CustomerID | ||
- Quantity | ||
""" | ||
|
||
import pickle | ||
import os | ||
|
||
# Project root: the parent of the directory that contains this module
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_missing_values.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_duplicates.pkl')


def remove_duplicates(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Drop rows that repeat an earlier row on the identifying columns.

    The DataFrame is unpickled from ``input_pickle_path``, de-duplicated on
    InvoiceNo, StockCode, Description, CustomerID and Quantity, and pickled
    to ``output_pickle_path``.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If ``input_pickle_path`` does not exist.
    """
    # Guard clause: bail out early when there is nothing to read
    if not os.path.exists(input_pickle_path):
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    with open(input_pickle_path, "rb") as source:
        frame = pickle.load(source)

    # Two rows count as duplicates when all of these columns match
    key_columns = [
        'InvoiceNo',
        'StockCode',
        'Description',
        'CustomerID',
        'Quantity',
    ]
    deduplicated = frame.drop_duplicates(subset=key_columns)

    # Persist the result for the next pipeline step
    with open(output_pickle_path, "wb") as sink:
        pickle.dump(deduplicated, sink)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
""" | ||
A module for removing missing values from dataset based on CustomerID | ||
and Description columns. | ||
""" | ||
|
||
import os | ||
import pickle | ||
|
||
# Project root: the parent of the directory that contains this module
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'raw_data.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed', 'after_missing_values.pkl')


def handle_missing(input_pickle_path=INPUT_PICKLE_PATH, output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Remove rows with missing 'CustomerID' or 'Description' values.

    The DataFrame is unpickled from ``input_pickle_path`` and rows lacking
    either column are dropped. If any missing value remains anywhere in the
    frame afterwards, a ValueError is raised and nothing is written.
    Otherwise the frame is pickled to ``output_pickle_path``.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If ``input_pickle_path`` does not exist.
    :raises ValueError: If missing values remain after the rows are dropped.
    """
    # Load DataFrame from input pickle
    if os.path.exists(input_pickle_path):
        with open(input_pickle_path, "rb") as file:
            df = pickle.load(file)
    else:
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    # Remove rows with missing values in 'CustomerID' and 'Description'
    df = df.dropna(subset=['CustomerID', 'Description'])
    # Count the remaining missing values once; the same number drives both
    # the check and the error message, so a second full-frame scan is wasted.
    missing_count = df.isna().sum().sum()
    if missing_count != 0:
        message = f"There are {missing_count} missing values left in the dataframe."
        print(message)
        raise ValueError(message)
    # Save the data to output pickle
    with open(output_pickle_path, "wb") as file:
        pickle.dump(df, file)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
""" | ||
A module for adding a new column named transaction_status based on the | ||
starting character of InvoiceNo column. | ||
transaction_status values are Cancelled or Completed. | ||
""" | ||
|
||
import pickle | ||
import os | ||
import numpy as np | ||
|
||
# Project root: the parent of the directory that contains this module
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_duplicates.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl')


def handle_transaction_status(input_pickle_path=INPUT_PICKLE_PATH,
                              output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Label every row as a 'Cancelled' or 'Completed' transaction.

    The DataFrame is unpickled from ``input_pickle_path``. A
    'transaction_status' column is added: 'Cancelled' when the text form of
    'InvoiceNo' starts with 'C', otherwise 'Completed'. The result is
    pickled to ``output_pickle_path``.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If ``input_pickle_path`` does not exist.
    :raises KeyError: If the 'InvoiceNo' column doesn't exist in the dataframe.
    """
    # Guard clause: bail out early when there is nothing to read
    if not os.path.exists(input_pickle_path):
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")
    with open(input_pickle_path, "rb") as source:
        frame = pickle.load(source)

    # The status is derived from 'InvoiceNo', so it must be present
    if 'InvoiceNo' not in frame.columns:
        raise KeyError("The input dataframe does not contain an 'InvoiceNo' column.")

    # Invoice numbers may be stored as non-strings; compare on their text form
    invoice_text = frame['InvoiceNo'].astype(str)
    is_cancelled = invoice_text.str.startswith('C')
    frame['transaction_status'] = np.where(is_cancelled, 'Cancelled', 'Completed')

    # Persist the labelled frame for the next pipeline step
    with open(output_pickle_path, "wb") as sink:
        pickle.dump(frame, sink)
    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
""" | ||
A module for testing anomaly_code_handler module. | ||
""" | ||
|
||
import os | ||
import pickle | ||
from src.anomaly_code_handler import handle_anomalous_codes | ||
|
||
# Project root: the parent of the directory that contains this test module
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(
    PROJECT_DIR, 'data', 'processed', 'after_anomaly_code.pkl')


def test_handle_anomalous_codes():
    """
    Check that handle_anomalous_codes returns the output path and that the
    saved frame holds no stock code with 0 or 1 numeric characters.
    """
    returned_path = handle_anomalous_codes(
        input_pickle_path=INPUT_PICKLE_PATH,
        output_pickle_path=OUTPUT_PICKLE_PATH,
    )
    assert returned_path == OUTPUT_PICKLE_PATH,\
        f"Expected {OUTPUT_PICKLE_PATH}, but got {returned_path}."

    # Read back what the handler wrote
    with open(OUTPUT_PICKLE_PATH, "rb") as saved:
        frame = pickle.load(saved)

    # Repeat the handler's rule: a code is anomalous with at most one digit
    leftovers = []
    for code in frame['StockCode'].unique():
        digit_total = sum(1 for char in str(code) if char.isdigit())
        if digit_total <= 1:
            leftovers.append(code)

    # No anomalous code should have survived the handler
    assert not leftovers, "Anomalous stock codes found in the dataframe."
Oops, something went wrong.