Skip to content

Commit

Permalink
Import Datasets: Validate that bucket is unique (data-dot-all#1498)
Browse files Browse the repository at this point in the history
### Feature or Bugfix
<!-- please choose -->
- Feature
 From now on, only one dataset can be imported from each bucket.

### Detail
Import dataset1 with s3 bucket: s3bucketName
Import dataset2 with s3 bucket: s3BucketName
=> results in an error: "Dataset with bucket s3BucketName already exists"
### Relates
- data-dot-all#1278 

### Security
Please answer the questions below briefly where applicable, or write
`N/A`. Based on
[OWASP 10](https://owasp.org/Top10/en/).

- Does this PR introduce or modify any input fields or queries - this
includes
fetching data from storage outside the application (e.g. a database, an
S3 bucket)?
  - Is the input sanitized?
- What precautions are you taking before deserializing the data you
consume?
  - Is injection prevented by parametrizing queries?
  - Have you ensured no `eval` or similar functions are used?
- Does this PR introduce any functionality or component that requires
authorization?
- How have you ensured it respects the existing AuthN/AuthZ mechanisms?
  - Are you logging failed auth attempts?
- Are you using or adding any cryptographic features?
  - Do you use standard, proven implementations?
  - Are the used keys controlled by the customer? Where are they stored?
- Are you introducing any new policies/roles/users?
  - Have you used the least-privilege principle? How?


By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.

---------

Co-authored-by: Sofia Sazonova <[email protected]>
  • Loading branch information
SofiaSazonova and Sofia Sazonova authored Aug 26, 2024
1 parent 24fca5f commit c677f8e
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,7 @@ def delete_dataset_buckets(session, dataset_uri) -> bool:
buckets = session.query(DatasetBucket).filter(DatasetBucket.datasetUri == dataset_uri).all()
for bucket in buckets:
session.delete(bucket)

@staticmethod
def get_dataset_bucket_by_name(session, bucket_name) -> DatasetBucket:
    """Look up a dataset bucket by its S3 bucket name.

    Used to enforce bucket uniqueness on dataset import: if any
    DatasetBucket row already has this S3BucketName, the bucket is
    considered taken.

    :param session: open SQLAlchemy session
    :param bucket_name: S3 bucket name to match against DatasetBucket.S3BucketName
    :return: the first matching DatasetBucket, or None if no row matches
    """
    # NOTE(review): equality match — presumably exact/case-sensitive per the
    # DB collation; S3 bucket names are lowercase anyway, so this should be safe.
    return session.query(DatasetBucket).filter(DatasetBucket.S3BucketName == bucket_name).first()
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ def check_dataset_account(session, environment):

@staticmethod
def check_imported_resources(dataset: S3Dataset):
with get_context().db_engine.scoped_session() as session:
if DatasetBucketRepository.get_dataset_bucket_by_name(session, dataset.S3BucketName):
raise exceptions.ResourceAlreadyExists(
action=IMPORT_DATASET,
message=f'Dataset with bucket {dataset.S3BucketName} already exists',
)
if dataset.importedGlueDatabase:
if len(dataset.GlueDatabaseName) > NamingConventionPattern.GLUE.value.get('max_length'):
raise exceptions.InvalidInput(
Expand Down
7 changes: 7 additions & 0 deletions tests/modules/s3_datasets/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pytest

from dataall.base.context import set_context, RequestContext, dispose_context
from dataall.core.environment.db.environment_models import Environment, EnvironmentGroup
from dataall.core.organizations.db.organization_models import Organization
from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService
Expand Down Expand Up @@ -437,3 +438,9 @@ def random_tag():

def random_tags():
    """Return a list of 1 to 4 randomly generated tags."""
    # Pick the exclusive upper bound first, then build that many tags.
    upper_bound = random.choice([2, 3, 4, 5])
    return [random_tag() for _ in range(1, upper_bound)]


@pytest.fixture(scope='function')
def api_context_1(db, user, group):
    """Install a request context for the test user/group; dispose it on teardown."""
    request_context = RequestContext(
        db_engine=db,
        username=user.username,
        groups=[group.name],
        user_id=user.username,
    )
    yield set_context(request_context)
    dispose_context()
12 changes: 6 additions & 6 deletions tests/modules/s3_datasets/test_import_dataset_check_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,31 @@
from dataall.modules.s3_datasets.db.dataset_models import S3Dataset


def test_s3_managed_bucket_import(mock_aws_client):
def test_s3_managed_bucket_import(mock_aws_client, api_context_1):
    """An SSE-S3 encrypted bucket with no KMS alias passes the import checks."""
    sse_s3_dataset = S3Dataset(KmsAlias=None)
    mock_encryption_bucket(mock_aws_client, 'AES256', None)

    assert DatasetService.check_imported_resources(sse_s3_dataset)


def test_s3_managed_bucket_but_bucket_encrypted_with_kms(mock_aws_client):
def test_s3_managed_bucket_but_bucket_encrypted_with_kms(mock_aws_client, api_context_1):
    """A KMS-encrypted bucket imported without a KMS alias is rejected."""
    kms_bucket_dataset = S3Dataset(KmsAlias=None)
    mock_encryption_bucket(mock_aws_client, 'aws:kms', 'any')

    with pytest.raises(RequiredParameter):
        DatasetService.check_imported_resources(kms_bucket_dataset)


def test_s3_managed_bucket_but_alias_provided(mock_aws_client):
def test_s3_managed_bucket_but_alias_provided(mock_aws_client, api_context_1):
    """Supplying a KMS alias for an SSE-S3 encrypted bucket is invalid input."""
    aliased_dataset = S3Dataset(KmsAlias='Key')
    mock_encryption_bucket(mock_aws_client, 'AES256', None)

    with pytest.raises(InvalidInput):
        DatasetService.check_imported_resources(aliased_dataset)


def test_kms_encrypted_bucket_but_key_not_exist(mock_aws_client):
def test_kms_encrypted_bucket_but_key_not_exist(mock_aws_client, api_context_1):
alias = 'alias'
dataset = S3Dataset(KmsAlias=alias)
mock_encryption_bucket(mock_aws_client, 'aws:kms', 'any')
Expand All @@ -42,7 +42,7 @@ def test_kms_encrypted_bucket_but_key_not_exist(mock_aws_client):
DatasetService.check_imported_resources(dataset)


def test_kms_encrypted_bucket_but_key_is_wrong(mock_aws_client):
def test_kms_encrypted_bucket_but_key_is_wrong(mock_aws_client, api_context_1):
alias = 'key_alias'
kms_id = 'kms_id'
dataset = S3Dataset(KmsAlias=alias)
Expand All @@ -54,7 +54,7 @@ def test_kms_encrypted_bucket_but_key_is_wrong(mock_aws_client):
DatasetService.check_imported_resources(dataset)


def test_kms_encrypted_bucket_imported(mock_aws_client):
def test_kms_encrypted_bucket_imported(mock_aws_client, api_context_1):
alias = 'key_alias'
kms_id = 'kms_id'
dataset = S3Dataset(KmsAlias=alias)
Expand Down

0 comments on commit c677f8e

Please sign in to comment.