Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Module updates #3

Merged
merged 6 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
init-hook='import sys; sys.path.append("src")'

[MESSAGES CONTROL]

Expand Down
1 change: 1 addition & 0 deletions data/bad.zip
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is not a valid zip file
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ mlflow
requests
pytest-mock
pytest-pylint
openpyxl
openpyxl
requests-mock
43 changes: 6 additions & 37 deletions src/datapipeline.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,11 @@
"""
Functions to ingest and process data
Modularized Data pipeline to form DAGs in the future
"""
import zipfile
import requests

def ingest_data():
    """
    Download the Online Retail archive and store it as data/data.zip.

    The outcome is reported on stdout; nothing is returned.
    """
    source_url = "https://archive.ics.uci.edu/static/public/352/online+retail.zip"
    reply = requests.get(source_url, timeout=30)

    # Any status other than 200 is reported and nothing is written to disk.
    if reply.status_code != 200:
        print(f"Failed to download the file. Status code: {reply.status_code}")
        return

    # Persist the raw response body as the zip archive.
    with open("data/data.zip", "wb") as archive:
        archive.write(reply.content)
    print("File downloaded successfully.")


def unzip_file():
    """
    Extract data/data.zip into the data/ directory.

    Success or failure is reported on stdout; nothing is returned.
    """
    archive_path = 'data/data.zip'
    target_dir = 'data/'
    try:
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(target_dir)
    except zipfile.BadZipFile:
        # Only a corrupt archive is handled here; other errors propagate.
        print(f"Failed to unzip {archive_path}")
    else:
        print(f"File {archive_path} successfully unzipped to {target_dir}")
from download_data import ingest_data
from unzip_data import unzip_file


if __name__ == "__main__":
    # Run each pipeline stage exactly once: download the archive, then hand
    # the returned path to the unzip step. The earlier no-argument calls are
    # dropped because they repeated the download and the extraction.
    ZIPFILE_PATH = ingest_data(
        "https://archive.ics.uci.edu/static/public/352/online+retail.zip")
    UNZIPPED_FILE = unzip_file(ZIPFILE_PATH, 'data')
38 changes: 38 additions & 0 deletions src/download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""
Function to download and ingest the data file
"""
import os
import requests

# URL used by ingest_data() when the caller does not pass one
DEFAULT_FILE_URL = "https://archive.ics.uci.edu/static/public/352/online+retail.zip"

# Absolute path of the parent of the directory that contains this module
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

def ingest_data(file_url=DEFAULT_FILE_URL):
    """
    Function to download file from URL and store it as data/data.zip
    Args:
        file_url: URL of the file, A default is used if not specified
    Returns:
        zipfile_path: The zipped file path to the data. The path is returned
            even when the download fails, so the file may be missing or
            left over from an earlier run.
    """
    # Path to store the zipfile
    zipfile_path = os.path.join(ROOT_DIR, 'data', 'data.zip')

    # Send an HTTP GET request to the URL
    response = requests.get(file_url, timeout=30)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # The data directory may not exist yet; create it so that the write
        # below does not fail with FileNotFoundError.
        os.makedirs(os.path.dirname(zipfile_path), exist_ok=True)
        # Save file to data
        with open(zipfile_path, "wb") as file:
            file.write(response.content)
        print(f"File downloaded successfully. Zip file available under {zipfile_path}")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")

    return zipfile_path

if __name__ == "__main__":
    # Reuse the module default instead of repeating the URL literal, so the
    # script entry point cannot drift from DEFAULT_FILE_URL.
    ZIPFILE_PATH = ingest_data(DEFAULT_FILE_URL)

33 changes: 33 additions & 0 deletions src/unzip_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
Function to unzip data and make it available
"""
import zipfile
import os

# Absolute path of the parent of the directory that contains this module
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))

# Defaults used by unzip_file() when no arguments are given
ZIP_FILENAME = os.path.join(ROOT_DIR, 'data', 'data.zip')
EXTRACT_TO = os.path.join(ROOT_DIR, 'data')

def unzip_file(zip_filename=ZIP_FILENAME, extract_to=EXTRACT_TO):
    """
    Function to unzip the downloaded data
    Args:
        zip_filename: zipfile path, a default is used if not specified
        extract_to: Path where the unzipped and extracted data is available
    Returns:
        unzipped_file: path of 'Online Retail.xlsx' inside extract_to. The
            path is returned even when extraction fails, so callers should
            not assume the file exists.
    """
    try:
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
    except (zipfile.BadZipFile, FileNotFoundError):
        # A missing archive is reported the same way as a corrupt one
        # instead of escaping as an unhandled traceback.
        print(f"Failed to unzip {zip_filename}")
    else:
        print(f"File {zip_filename} successfully unzipped to {extract_to}")

    # Return unzipped file
    return os.path.join(extract_to, 'Online Retail.xlsx')

if __name__ == "__main__":
    # Script entry point: extract the default archive into the default folder
    UNZIPPED_FILE = unzip_file(zip_filename=ZIP_FILENAME, extract_to=EXTRACT_TO)
Binary file removed test/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file not shown.
42 changes: 0 additions & 42 deletions test/test_datapipeline.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,3 @@
"""
Tests for datapipeline functions
"""
from src import datapipeline

def test_ingest_data(mocker):
    """
    ingest_data() should emit exactly one print call
    """
    # arrange: replace print inside the module under test with a mock
    print_spy = mocker.MagicMock(name='print')
    mocker.patch('src.datapipeline.print', new=print_spy)

    # act
    datapipeline.ingest_data()

    # assert
    assert print_spy.call_count == 1


def test_unzip_file(mocker):
    """
    unzip_file() should run against a mocked ZipFile without calling Exception
    """
    # arrange: one mock per name replaced inside src.datapipeline
    doubles = {
        label: mocker.MagicMock(name=label)
        for label in ('ZipFile', 'print', 'Exception')
    }
    mocker.patch('src.datapipeline.zipfile.ZipFile', new=doubles['ZipFile'])
    mocker.patch('src.datapipeline.print', new=doubles['print'])
    mocker.patch('src.datapipeline.Exception', new=doubles['Exception'])

    # act
    datapipeline.unzip_file()

    # assert
    doubles['Exception'].assert_not_called()
47 changes: 47 additions & 0 deletions test/test_download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Tests for download_data.py
"""
import os
import requests
import requests_mock
from src import download_data

# URL passed to ingest_data() by the tests in this module
DEFAULT_FILE_URL = "https://archive.ics.uci.edu/static/public/352/online+retail.zip"

def test_ingest_data(mocker):
    """
    ingest_data() should report its outcome with a single print call

    Only print is mocked here; the HTTP request itself is not intercepted.
    """
    # arrange: intercept print inside the module under test
    print_spy = mocker.MagicMock(name='print')
    mocker.patch('src.download_data.print', new=print_spy)

    # act
    download_data.ingest_data(DEFAULT_FILE_URL)

    # assert
    assert print_spy.call_count == 1

def test_ingest_data_successful_download(tmp_path, monkeypatch):
    """
    Test for checking successful download of the file

    The HTTP call is answered by a requests_mock adapter and the archive is
    written below ``tmp_path``, so the test needs no network access and does
    not overwrite the real data/data.zip.
    """
    payload = b"fake zip payload"

    # Redirect the download target into a throw-away directory
    (tmp_path / 'data').mkdir()
    monkeypatch.setattr(download_data, 'ROOT_DIR', str(tmp_path))

    # Path where ingest_data() is expected to store the zipfile
    zipfile_path = os.path.join(str(tmp_path), 'data', 'data.zip')

    # Create a session and attach the requests_mock adapter to it
    with requests.Session() as session:
        adapter = requests_mock.Adapter()
        session.mount('https://', adapter)

        # Define the mock response
        adapter.register_uri('GET', DEFAULT_FILE_URL, content=payload)

        # ingest_data() calls requests.get directly, so route that call
        # through the mocked session; otherwise the adapter is never used.
        monkeypatch.setattr(requests, 'get', session.get)

        # Call the function that makes the HTTP request
        result = download_data.ingest_data(DEFAULT_FILE_URL)

    # Perform assertions
    assert result == zipfile_path
    assert adapter.call_count == 1
    with open(zipfile_path, 'rb') as file:
        assert file.read() == payload
46 changes: 46 additions & 0 deletions test/test_unzip_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
Function to test the unzip_data functions
"""
import os
from src import unzip_data

# Paths shared by the tests in this module
# Absolute path of the parent of the directory that contains this test file
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

EXTRACT_TO = os.path.join(ROOT_DIR, 'data')
ZIP_FILENAME = os.path.join(ROOT_DIR, 'data', 'data.zip')
BAD_ZIP_FILENAME = os.path.join(ROOT_DIR, 'data', 'bad.zip')

# Test for successful unzipping
def test_unzip_file_successful(tmp_path):
    """
    Test for successful unzipping

    The archive is built inside ``tmp_path`` so the test does not depend on
    data/data.zip having been downloaded beforehand.
    """
    import zipfile  # pylint: disable=import-outside-toplevel

    # Build a small archive holding the member unzip_file() reports back
    zip_filename = str(tmp_path / 'data.zip')
    with zipfile.ZipFile(zip_filename, 'w') as archive:
        archive.writestr('Online Retail.xlsx', b'placeholder workbook')
    extract_to = str(tmp_path / 'extracted')

    # Call the function to unzip a valid file
    unzipped_file = unzip_data.unzip_file(zip_filename, extract_to)

    # Check if the function returned the expected unzipped file path
    assert unzipped_file == os.path.join(extract_to, 'Online Retail.xlsx')

    # Check if the unzipped file exists
    assert os.path.isfile(unzipped_file)

# Test for handling a bad zip file
def test_unzip_file_bad_zip(tmp_path, capsys):
    """
    Test for handling a bad zip file
    """
    # Create the bad zip file in the temporary directory so that the
    # data/bad.zip file in the repository is not rewritten on every run
    bad_zip = tmp_path / "bad.zip"
    bad_zip.write_bytes(b"This is not a valid zip file")

    # Create a temporary directory for testing
    test_dir = tmp_path / "test_dir"
    test_dir.mkdir()

    # Call the function to unzip a bad zip file
    unzip_data.unzip_file(str(bad_zip), str(test_dir))

    # Check if the function printed the appropriate error message
    captured = capsys.readouterr()
    assert "Failed to unzip" in captured.out