-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
final cleaning to follow pylint rules for spacing and docstrings
- Loading branch information
1 parent
ac561f8
commit bb78074
Showing
10 changed files
with
130 additions
and
116 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,40 +1,43 @@ | ||
import pandas as pd | ||
""" | ||
A module for removing anomalies from StockCode column if they have 0 or 1 | ||
digit characters since the normal values are 5 or 6 digits. | ||
""" | ||
|
||
import pickle | ||
import os | ||
|
||
# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                 'processed', 'after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                  'processed', 'after_anomaly_code.pkl')


def handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                           output_pickle_path=OUTPUT_PICKLE_PATH):
    """
    Load the DataFrame from the input pickle, remove rows with stock codes that
    have 0 or 1 numeric characters,
    then save the DataFrame back to a pickle and return its path.

    :param input_pickle_path: Path to the input pickle file.
    :param output_pickle_path: Path to the output pickle file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If nothing exists at ``input_pickle_path``.
    """
    # Load DataFrame from input pickle.
    # NOTE: unpickling can execute arbitrary code, so only pickles written by
    # this pipeline should ever be passed in here.
    if os.path.exists(input_pickle_path):
        with open(input_pickle_path, "rb") as file:
            df = pickle.load(file)
    else:
        raise FileNotFoundError(f"No data found at the specified path: {input_pickle_path}")

    # Finding the stock codes with 0 and 1 numeric characters.
    # Codes are stringified first so that numeric codes are counted too.
    unique_stock_codes = df['StockCode'].unique()
    anomalous_stock_codes = [code for code in unique_stock_codes if
                             sum(c.isdigit() for c in str(code)) in (0, 1)]

    # Removing rows with these anomalous stock codes
    df = df[~df['StockCode'].isin(anomalous_stock_codes)]

    # Save the data to output pickle
    with open(output_pickle_path, "wb") as file:
        pickle.dump(df, file)

    print(f"Data saved to {output_pickle_path}.")
    return output_pickle_path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,55 +1,47 @@ | ||
""" | ||
Module to handle the loading of e-commerce dataset from either pickle or Excel file format. | ||
""" | ||
|
||
import pickle | ||
import os | ||
import pandas as pd | ||
import pickle | ||
|
||
# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Use the project directory to construct paths to other directories
DEFAULT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                   'processed', 'raw_data.pkl')
DEFAULT_EXCEL_PATH = os.path.join(PROJECT_DIR, 'data', 'Online Retail.xlsx')


def load_data(pickle_path=DEFAULT_PICKLE_PATH, excel_path=DEFAULT_EXCEL_PATH):
    """
    Load the e-commerce dataset.

    First, try to load from the pickle file. If it doesn't exist, load from the excel file.
    Regardless of the source, save the loaded data as a pickle for future use and
    return the path to that pickle.

    :param pickle_path: Path to the pickle file.
    :param excel_path: Path to the Excel file.
    :return: Path to the saved pickle file.
    :raises FileNotFoundError: If neither ``pickle_path`` nor ``excel_path`` exists.
    """
    # Placeholder for the DataFrame
    df = None

    # Check if pickle file exists.
    # NOTE: unpickling can execute arbitrary code, so only pickles written by
    # this pipeline should ever be passed in here.
    if os.path.exists(pickle_path):
        with open(pickle_path, "rb") as file:
            df = pickle.load(file)
        print(f"Data loaded successfully from {pickle_path}.")
    # If pickle doesn't exist, load from Excel
    elif os.path.exists(excel_path):
        df = pd.read_excel(excel_path)
        print(f"Data loaded from {excel_path}.")
    else:
        error_message = f"No data found in the specified paths: {pickle_path} or {excel_path}"
        print(error_message)
        raise FileNotFoundError(error_message)

    # When the data came from Excel the pickle's directory may not exist yet;
    # create it so the save below cannot fail on a fresh checkout.
    pickle_dir = os.path.dirname(pickle_path)
    if pickle_dir:
        os.makedirs(pickle_dir, exist_ok=True)

    # Save the data to pickle for future use (or re-save it if loaded from existing pickle)
    with open(pickle_path, "wb") as file:
        pickle.dump(df, file)

    print(f"Data saved to {pickle_path} for future use.")
    return pickle_path
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,33 @@ | ||
import pytest | ||
""" | ||
A module for testing anomaly_code_handler module. | ||
""" | ||
|
||
import os | ||
import pickle | ||
from src.anomaly_code_handler import handle_anomalous_codes | ||
|
||
# Determine the absolute path of the project directory
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

INPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                 'processed', 'after_transaction_status.pkl')
OUTPUT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data',
                                  'processed', 'after_anomaly_code.pkl')


def test_handle_anomalous_codes():
    """
    Test that handle_anomalous_codes correctly removes rows with stock codes
    that have 0 or 1 numeric characters.
    """
    result = handle_anomalous_codes(input_pickle_path=INPUT_PICKLE_PATH,
                                    output_pickle_path=OUTPUT_PICKLE_PATH)
    assert result == OUTPUT_PICKLE_PATH, \
        f"Expected {OUTPUT_PICKLE_PATH}, but got {result}."

    # Load the output pickle file to check the 'StockCode' column
    with open(OUTPUT_PICKLE_PATH, "rb") as file:
        df = pickle.load(file)

    # Check for stock codes with 0 or 1 numeric characters
    unique_stock_codes = df['StockCode'].unique()
    anomalous_stock_codes = [code for code in unique_stock_codes if
                             sum(c.isdigit() for c in str(code)) in (0, 1)]

    # Assert that no such anomalous stock codes exist
    assert len(anomalous_stock_codes) == 0, "Anomalous stock codes found in the dataframe."
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.