Skip to content

Athena read_sql_query error when workgroup encryption is turned ON using CSE_KMS #2933

Closed as not planned
@leo4ever

Description

@leo4ever

Describe the bug

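Running any query on Athena fails with the error message "UnicodeDecodeError: 'utf-8' codec can't decode byte 0x86 in position 2: invalid start byte" when workgroup encryption is turned ON using CSE_KMS. (The byte value reported in the message is not always the same; the stack trace below shows 0xa9 at the same position.)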

wr.athena.read_sql_query('select count(1) as num_records from mytable', database, workgroup)

Error stack trace:

UnicodeDecodeError                        Traceback (most recent call last)
Cell In[77], line 2
      1 ## Example 1: Compute Statistical Summary (Count, Sum, Min, Max, etc.) 
----> 2 wr.athena.read_sql_query(sql='select count(1) as num_record from loans', database=database_name, workgroup=database_name, keep_files=False)

File [/opt/conda/lib/python3.10/site-packages/awswrangler/_config.py:715](https://szvpbcvb4hhqgvw.studio.us-west-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/awswrangler/_config.py#line=714), in apply_configs.<locals>.wrapper(*args_raw, **kwargs)
    713         del args[name]
    714         args = {**args, **keywords}
--> 715 return function(**args)

File [/opt/conda/lib/python3.10/site-packages/awswrangler/_utils.py:178](https://szvpbcvb4hhqgvw.studio.us-west-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/awswrangler/_utils.py#line=177), in validate_kwargs.<locals>.decorator.<locals>.inner(*args, **kwargs)
    175 if condition_fn() and len(passed_unsupported_kwargs) > 0:
    176     raise exceptions.InvalidArgument(f"{message} `{', '.join(passed_unsupported_kwargs)}`.")
--> 178 return func(*args, **kwargs)

File [/opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:1082](https://szvpbcvb4hhqgvw.studio.us-west-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py#line=1081), in read_sql_query(sql, database, ctas_approach, unload_approach, ctas_parameters, unload_parameters, categories, chunksize, s3_output, workgroup, encryption, kms_key, keep_files, use_threads, boto3_session, client_request_token, athena_cache_settings, data_source, athena_query_wait_polling_delay, params, paramstyle, dtype_backend, s3_additional_kwargs, pyarrow_additional_kwargs)
   1079 ctas_bucketing_info = ctas_parameters.get("bucketing_info")
   1080 ctas_write_compression = ctas_parameters.get("compression")
-> 1082 return _resolve_query_without_cache(
   1083     sql=sql,
   1084     database=database,
   1085     data_source=data_source,
   1086     ctas_approach=ctas_approach,
   1087     unload_approach=unload_approach,
   1088     unload_parameters=unload_parameters,
   1089     categories=categories,
   1090     chunksize=chunksize,
   1091     s3_output=s3_output,
   1092     workgroup=workgroup,
   1093     encryption=encryption,
   1094     kms_key=kms_key,
   1095     keep_files=keep_files,
   1096     ctas_database=ctas_database,
   1097     ctas_temp_table_name=ctas_temp_table_name,
   1098     ctas_bucketing_info=ctas_bucketing_info,
   1099     ctas_write_compression=ctas_write_compression,
   1100     athena_query_wait_polling_delay=athena_query_wait_polling_delay,
   1101     use_threads=use_threads,
   1102     s3_additional_kwargs=s3_additional_kwargs,
   1103     boto3_session=boto3_session,
   1104     pyarrow_additional_kwargs=pyarrow_additional_kwargs,
   1105     execution_params=execution_params,
   1106     dtype_backend=dtype_backend,
   1107     client_request_token=client_request_token,
   1108 )

File [/opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:508](https://szvpbcvb4hhqgvw.studio.us-west-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py#line=507), in _resolve_query_without_cache(sql, database, data_source, ctas_approach, unload_approach, unload_parameters, categories, chunksize, s3_output, workgroup, encryption, kms_key, keep_files, ctas_database, ctas_temp_table_name, ctas_bucketing_info, ctas_write_compression, athena_query_wait_polling_delay, use_threads, s3_additional_kwargs, boto3_session, pyarrow_additional_kwargs, execution_params, dtype_backend, client_request_token)
    506     name = f"temp_table_{uuid.uuid4().hex}"
    507 try:
--> 508     return _resolve_query_without_cache_ctas(
    509         sql=sql,
    510         database=database,
    511         data_source=data_source,
    512         s3_output=s3_output,
    513         keep_files=keep_files,
    514         chunksize=chunksize,
    515         categories=categories,
    516         encryption=encryption,
    517         workgroup=workgroup,
    518         kms_key=kms_key,
    519         alt_database=ctas_database,
    520         name=name,
    521         ctas_bucketing_info=ctas_bucketing_info,
    522         ctas_write_compression=ctas_write_compression,
    523         athena_query_wait_polling_delay=athena_query_wait_polling_delay,
    524         use_threads=use_threads,
    525         s3_additional_kwargs=s3_additional_kwargs,
    526         boto3_session=boto3_session,
    527         pyarrow_additional_kwargs=pyarrow_additional_kwargs,
    528         execution_params=execution_params,
    529         dtype_backend=dtype_backend,
    530     )
    531 finally:
    532     catalog.delete_table_if_exists(database=ctas_database or database, table=name, boto3_session=boto3_session)

File [/opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:346](https://szvpbcvb4hhqgvw.studio.us-west-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py#line=345), in _resolve_query_without_cache_ctas(sql, database, data_source, s3_output, keep_files, chunksize, categories, encryption, workgroup, kms_key, alt_database, name, ctas_bucketing_info, ctas_write_compression, athena_query_wait_polling_delay, use_threads, s3_additional_kwargs, boto3_session, pyarrow_additional_kwargs, execution_params, dtype_backend)
    344 ctas_query_metadata = cast(_QueryMetadata, ctas_query_info["ctas_query_metadata"])
    345 _logger.debug("CTAS query metadata: %s", ctas_query_metadata)
--> 346 return _fetch_parquet_result(
    347     query_metadata=ctas_query_metadata,
    348     keep_files=keep_files,
    349     categories=categories,
    350     chunksize=chunksize,
    351     use_threads=use_threads,
    352     s3_additional_kwargs=s3_additional_kwargs,
    353     boto3_session=boto3_session,
    354     temp_table_fqn=fully_qualified_name,
    355     pyarrow_additional_kwargs=pyarrow_additional_kwargs,
    356     dtype_backend=dtype_backend,
    357 )

File [/opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:137](https://szvpbcvb4hhqgvw.studio.us-west-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py#line=136), in _fetch_parquet_result(query_metadata, keep_files, categories, chunksize, use_threads, boto3_session, s3_additional_kwargs, temp_table_fqn, pyarrow_additional_kwargs, dtype_backend)
    135 _logger.debug("Manifest path: %s", manifest_path)
    136 _logger.debug("Metadata path: %s", metadata_path)
--> 137 paths: list[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=boto3_session)
    138 if not paths:
    139     if not temp_table_fqn:

File [/opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:60](https://szvpbcvb4hhqgvw.studio.us-west-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py#line=59), in _extract_ctas_manifest_paths(path, boto3_session)
     58 client_s3 = _utils.client(service_name="s3", session=boto3_session)
     59 body: bytes = client_s3.get_object(Bucket=bucket_name, Key=key_path)["Body"].read()
---> 60 paths = [x for x in body.decode("utf-8").split("\n") if x]
     61 _logger.debug("Read %d paths from manifest file in: %s", len(paths), path)
     62 return paths

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa9 in position 2: invalid start byte

How to Reproduce

import awswrangler as wr
wr.athena.read_sql_query(sql='select count(1) as num_record from loans', database=database_name, workgroup=database_name)

Expected behavior

No response

Your project

No response

Screenshots

No response

OS

Linux

Python version

3.10.14

AWS SDK for pandas version

3.9.0

Additional context

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions