Skip to content

Commit

Permalink
Merge pull request #27 from beeldengeluid/19-s3-test
Browse files Browse the repository at this point in the history
19 s3 test
  • Loading branch information
greenw0lf authored May 10, 2024
2 parents ea90ceb + 18f2727 commit 189ee1a
Show file tree
Hide file tree
Showing 8 changed files with 553 additions and 122 deletions.
10 changes: 8 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
.vscode
testsource__testcarrier
/data/*
!/data/README.md
__pycache__
.pytest_cache
.coverage
/config.yml
s3-creds.env
.vscode/*
8 changes: 5 additions & 3 deletions config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,21 @@ ELASTICSEARCH:
SCHEME: http
INDEX: dane-index-k8s
FILE_SYSTEM:
BASE_MOUNT: /data # data when running locally
BASE_MOUNT: data # data when running S3 integration test, /data when running via Docker
INPUT_DIR: input-files
OUTPUT_DIR: output-files
INPUT:
TEST_INPUT_PATH: testsource__testcarrier/inputfile.wav
S3_ENDPOINT_URL: https://s3-host
MODEL: s3://bucket/model
S3_BUCKET: input
S3_FOLDER_IN_BUCKET: assets # folder within the bucket
S3_BUCKET_MODEL: model
DELETE_ON_COMPLETION: False
OUTPUT:
DELETE_ON_COMPLETION: True
TRANSFER_ON_COMPLETION: True
S3_ENDPOINT_URL: https://s3-host
S3_BUCKET: bucket-name # bucket reserved for 1 type of output
S3_BUCKET: output # bucket reserved for 1 type of output
S3_FOLDER_IN_BUCKET: folder # folder within the bucket
WHISPER_ASR_SETTINGS:
WORD_TIMESTAMPS: True
Expand Down
7 changes: 5 additions & 2 deletions io_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,8 @@ def obtain_input_file(s3_uri: str) -> WhisperASRInput:
success = s3.download_file(bucket, object_name, output_folder)
if success:
# uncompress the <input_base>.tar.gz
untar_input_file(input_file_path)
if input_file_path.find(".tar.gz") != -1:
input_file_path = untar_input_file(input_file_path)

provenance = Provenance(
activity_name="download",
Expand Down Expand Up @@ -297,8 +298,10 @@ def fetch_input_s3_uri(handler, doc: Document) -> str:


def untar_input_file(tar_file_path: str) -> str:
    """Uncompress somefile.tar.gz into the same dir.

    Extracts all members of the archive into the parent directory of
    ``tar_file_path`` and returns the path of the first member (the
    extracted input file).
    """
    logger.info(f"Uncompressing {tar_file_path}")
    path = str(Path(tar_file_path).parent)
    with tarfile.open(tar_file_path) as tar:
        # filter="data" rejects absolute paths / path traversal on extraction
        tar.extractall(path=path, filter="data")  # type: ignore
        filename = tar.getmembers()[0].name
    # BUG FIX: the return previously interpolated nothing (a literal string),
    # so `filename` was computed but unused and callers got a bogus path.
    return path + f"/{filename}"
537 changes: 422 additions & 115 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ dane = "^0.4.2"
faster-whisper = "^1.0.1"

[tool.poetry.group.dev.dependencies]
moto = "^5.0.3"
boto3 = "^1.34.84"
pytest = "^7.2.0"
mockito = "^1.3.3"
flake8 = "^5.0.4"
Expand All @@ -27,6 +29,7 @@ build-backend = "poetry.core.masonry.api"
[tool.pytest.ini_options]
testpaths = [
"tests/unit",
"tests/integration",
]
addopts = [
"--cov",
Expand Down Expand Up @@ -66,5 +69,6 @@ module = [
'yaml',
'yacs.*',
'faster_whisper.*',
'boto3.*'
]
ignore_missing_imports = true
Empty file added tests/__init__.py
Empty file.
109 changes: 109 additions & 0 deletions tests/integration/S3_integration_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
from moto import mock_aws
import boto3
import pytest
import os
import shutil
import tarfile

from main_data_processor import run
from dane.config import cfg
from io_util import untar_input_file, S3_OUTPUT_TYPES


# Identifiers for the synthetic test resource and the derived S3 object keys.
carrier_id = "carrier"
resource_id = "resource__" + carrier_id
fn_tar_in = f"{resource_id}.tar.gz"  # local tarball uploaded to the input bucket
key_in = f"{cfg.INPUT.S3_FOLDER_IN_BUCKET}/{fn_tar_in}"  # object key in the input bucket
tar_out = f"{carrier_id}/out__{carrier_id}.tar.gz"  # expected output tarball (local path + key suffix)
key_out = f"{cfg.OUTPUT.S3_FOLDER_IN_BUCKET}/{tar_out}"  # object key expected in the output bucket


@pytest.fixture
def aws_credentials():
    """Create custom AWS setup: mocked AWS Credentials for moto."""
    mock_env = {
        "AWS_ACCESS_KEY_ID": "testing",
        "AWS_SECRET_ACCESS_KEY": "testing",
        "AWS_SECURITY_TOKEN": "testing",
        "AWS_SESSION_TOKEN": "testing",
        # Other regions make stuff complex
        "AWS_DEFAULT_REGION": "us-east-1",
        "MOTO_S3_CUSTOM_ENDPOINTS": cfg.INPUT.S3_ENDPOINT_URL,
    }
    os.environ.update(mock_env)


@pytest.fixture
def aws(aws_credentials):
    """Spin up local aws for testing"""
    # mock_aws intercepts all boto3 calls for the duration of the test,
    # so the yielded S3 client talks to moto's in-memory backend.
    with mock_aws():
        yield boto3.client("s3")


@pytest.fixture
def create_sample_input():
    """
    Add sample input for test to input bucket.
    """
    sample_path = "/".join(
        [
            cfg.FILE_SYSTEM.BASE_MOUNT,
            cfg.FILE_SYSTEM.INPUT_DIR,
            cfg.INPUT.TEST_INPUT_PATH,
        ]
    )
    # Pack the sample wav as <resource_id>.tar.gz, mirroring production input
    with tarfile.open(fn_tar_in, "w:gz") as archive:
        archive.add(sample_path, arcname="inputfile.wav")
    yield
    # after test: cleanup
    os.remove(fn_tar_in)


@pytest.fixture
def create_and_fill_buckets(aws, create_sample_input):
    """Make sure input and output buckets exist, and add sample input"""
    s3 = boto3.client("s3")
    required_buckets = (
        cfg.INPUT.S3_BUCKET,
        cfg.OUTPUT.S3_BUCKET,
        cfg.INPUT.S3_BUCKET_MODEL,
    )
    for bucket_name in required_buckets:
        s3.create_bucket(Bucket=bucket_name)
    # seed the input bucket with the sample tarball built by create_sample_input
    s3.upload_file(Filename=fn_tar_in, Bucket=cfg.INPUT.S3_BUCKET, Key=key_in)


@pytest.fixture
def setup_fs():
    """Create test output dir, abort if dir is not empty.

    Yields after creating the directory; removes it (and its contents)
    once the test finishes.
    """
    try:
        os.makedirs(carrier_id)
    except FileExistsError:
        # pytest.fail surfaces the reason in the report, unlike a bare
        # `assert False` (which is also stripped when run under -O).
        pytest.fail("Destination for output is not empty: abort.")
    yield
    # after test: cleanup
    shutil.rmtree(carrier_id)


def test_main_data_processor(aws, aws_credentials, create_and_fill_buckets, setup_fs):
    """Test the main_data_processor.run function, running on URI in mocked S3.
    Relies on fixtures: aws, aws_credentials, create_and_fill_buckets, setup_fs"""
    # Guard clause instead of wrapping the whole test body in an if/else;
    # pytest.fail reports the reason rather than a bare AssertionError.
    if not cfg.OUTPUT.TRANSFER_ON_COMPLETION:
        pytest.fail("Not configured to transfer output!")

    # run the main data processor
    run(input_file_path=f"s3://{cfg.INPUT.S3_BUCKET}/{key_in}")

    # Check if the output is present in S3 (any() replaces the manual
    # found-flag loop)
    client = boto3.client("s3")
    listing = client.list_objects(Bucket=cfg.OUTPUT.S3_BUCKET)["Contents"]
    assert any(item["Key"] == key_out for item in listing)

    # Download and unpack the output; every expected output type must be present.
    client.download_file(Bucket=cfg.OUTPUT.S3_BUCKET, Key=key_out, Filename=tar_out)
    untar_input_file(tar_out)
    produced = os.listdir(carrier_id)
    # renamed loop var: `type` shadowed the builtin
    for output_type in S3_OUTPUT_TYPES:
        assert output_type.value in produced
Empty file added tests/integration/__init__.py
Empty file.

0 comments on commit 189ee1a

Please sign in to comment.