diff --git a/target_s3_parquet/data_type_generator.py b/target_s3_parquet/data_type_generator.py index 87b5a20..f7e035b 100644 --- a/target_s3_parquet/data_type_generator.py +++ b/target_s3_parquet/data_type_generator.py @@ -9,19 +9,25 @@ def build_struct_type(attributes, level): return f"struct<{stringfy_data_types}>" -def coerce_types(name, type): +def coerce_types(name, type, format=None, description=None): if name == "_sdc_sequence": return "string" if name == "_sdc_table_version": return "string" - if type == "number": + if type == "number" or format == "singer.decimal": return "double" if type == "integer": return "int" + if format == "date-time": + return "timestamp" + + if description in ["raw", "blob"]: + return "binary" + return type @@ -63,7 +69,9 @@ def generate_tap_schema(schema, level=0, only_string=False): field_definitions[name] = f"array<{array_type}>" else: - type = coerce_types(name, cleaned_type) + format = attributes.get("format") + description = attributes.get("description") + type = coerce_types(name, cleaned_type, format, description) field_definitions[name] = type diff --git a/target_s3_parquet/tests/test_data_type_generator.py b/target_s3_parquet/tests/test_data_type_generator.py index 704d92d..0afecf3 100644 --- a/target_s3_parquet/tests/test_data_type_generator.py +++ b/target_s3_parquet/tests/test_data_type_generator.py @@ -20,7 +20,7 @@ def test_schema_with_all_of(): {"type": "string", "format": "date-time"}, {"type": ["string", "null"]}, ] - }, + } } assert generate_tap_schema(schema) == {"lastModifiedDate": "string"} @@ -84,10 +84,10 @@ def test_complex_schema(): } expected_result = { - "identity_profiles": "array>>>", + "identity_profiles": "array>>>" } assert generate_tap_schema(schema) == expected_result @@ -103,11 +103,7 @@ def test_number_type(): "type": ["null", "array"], "items": { "type": ["null", "object"], - "properties": { - "some_value": { - "type": ["null", "number"], - }, - }, + "properties": {"some_value": {"type": ["null", "number"]}}, }, }, } @@ -128,11 +124,7 @@ def test_integer_type(): "type": ["null", "array"], "items": { "type": ["null", "object"], - "properties": { - "some_value": { - "type": ["null", "integer"], - }, - }, + "properties": {"some_value": {"type": ["null", "integer"]}}, }, }, } @@ -154,10 +146,10 @@ def test_sdc_type_translation(): } assert generate_tap_schema(schema) == { - "_sdc_batched_at": "string", - "_sdc_received_at": "string", - "_sdc_extracted_at": "string", - "_sdc_deleted_at": "string", + "_sdc_batched_at": "timestamp", + "_sdc_received_at": "timestamp", + "_sdc_extracted_at": "timestamp", + "_sdc_deleted_at": "timestamp", "_sdc_sequence": "string", "_sdc_table_version": "string", } @@ -173,11 +165,7 @@ def test_only_string_definition(): "type": ["null", "array"], "items": { "type": ["null", "object"], - "properties": { - "some_value": { - "type": ["null", "integer"], - }, - }, + "properties": {"some_value": {"type": ["null", "integer"]}}, }, }, } @@ -188,6 +176,21 @@ def test_only_string_definition(): } +def test_binary_type(): + schema = { + "image": {"type": ["null", "string"], "description": "blob"}, + "free_text": {"type": ["null", "string"], "description": "raw"}, + } + + assert generate_tap_schema(schema) == {"image": "binary", "free_text": "binary"} + + +def test_singer_decimal_type(): + schema = {"measurement": {"type": ["null", "string"], "format": "singer.decimal"}} + + assert generate_tap_schema(schema) == {"measurement": "double"} + + def test_get_current_schema(): schema = { "Column Name": ["identity_profiles", "identities"],