From cef7f1975fcc49b3687e2eeec662f3e14de9ecd2 Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Mon, 9 Oct 2023 11:34:47 +0300 Subject: [PATCH] [DOP-9645] - update tests, documentation --- onetl/file/format/xml.py | 8 +++++++- .../file_df_connection/generate_files.py | 19 ++++++++++++++---- .../with_attributes/file_with_attributes.xml | 2 +- .../xml/with_compression/file.xml.gz | Bin 294 -> 294 bytes .../xml/without_compression/file.xml | 2 +- .../test_xml_integration.py | 10 ++++----- 6 files changed, 29 insertions(+), 12 deletions(-) diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 0f9a770d4..df94b24f1 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -108,7 +108,11 @@ class XML(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. - **Important**: When **reading** files with timestamps, it may be necessary to specify the ``timestampFormat`` option to ensure dates are parsed correctly. Without it, date parsing may return ``null`` values. Example: ``timestampFormat="yyyy-MM-dd HH:mm:ssXXX"``. + .. warning:: + + When interacting with files with timestamps, it may be necessary to specify the ``timestampFormat`` option to + ensure that dates are parsed correctly. Without it, date parsing may return ``null`` values. + Example: ``timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSXXX"``. Examples -------- @@ -134,6 +138,8 @@ class XML(ReadWriteFileFormat): name: ClassVar[str] = "xml" row_tag: str = Field(alias="rowTag") + # Unable to use default timestamp_format due to a source code bug causing an UnsupportedTemporalTypeException (Unsupported field: ).
+ # timestamp_format: str = Field(default="yyyy-MM-ddTHH:mm:ss.SSSXXX", alias="timestampFormat") class Config: known_options = READ_OPTIONS | WRITE_OPTIONS diff --git a/tests/resources/file_df_connection/generate_files.py b/tests/resources/file_df_connection/generate_files.py index 21f3cf7fc..2417cb15d 100755 --- a/tests/resources/file_df_connection/generate_files.py +++ b/tests/resources/file_df_connection/generate_files.py @@ -481,7 +481,10 @@ def save_as_xml_plain(data: list[dict], path: Path) -> None: item = ElementTree.SubElement(root, "item") for key, value in record.items(): child = ElementTree.SubElement(item, key) - child.text = str(value) + if isinstance(value, datetime): + child.text = value.isoformat() + else: + child.text = str(value) tree = ElementTree.ElementTree(root) tree.write(path / "file.xml") @@ -492,11 +495,16 @@ def save_as_xml_with_attributes(data: list[dict], path: Path) -> None: root = ElementTree.Element("root") for record in data: - str_attributes = {key: str(value) for key, value in record.items()} + str_attributes = { + key: value.isoformat() if isinstance(value, datetime) else str(value) for key, value in record.items() + } item = ElementTree.SubElement(root, "item", attrib=str_attributes) for key, value in record.items(): child = ElementTree.SubElement(item, key) - child.text = str(value) + if isinstance(value, datetime): + child.text = value.isoformat() + else: + child.text = str(value) tree = ElementTree.ElementTree(root) tree.write(str(path / "file_with_attributes.xml")) @@ -510,7 +518,10 @@ def save_as_xml_gz(data: list[dict], path: Path) -> None: item = ElementTree.SubElement(root, "item") for key, value in record.items(): child = ElementTree.SubElement(item, key) - child.text = str(value) + if isinstance(value, datetime): + child.text = value.isoformat() + else: + child.text = str(value) ElementTree.ElementTree(root) xml_string = ElementTree.tostring(root, encoding="utf-8") diff --git 
a/tests/resources/file_df_connection/xml/with_attributes/file_with_attributes.xml b/tests/resources/file_df_connection/xml/with_attributes/file_with_attributes.xml index 9c170e560..f6fcbc7df 100644 --- a/tests/resources/file_df_connection/xml/with_attributes/file_with_attributes.xml +++ b/tests/resources/file_df_connection/xml/with_attributes/file_with_attributes.xml @@ -1 +1 @@ -1val11232021-01-012021-01-01 01:01:01+00:001.232val12342022-02-022022-02-02 02:02:02+00:002.343val23452023-03-032023-03-03 03:03:03+00:003.454val24562024-04-042024-04-04 04:04:04+00:004.565val35672025-05-052025-05-05 05:05:05+00:005.676val36782026-06-062026-06-06 06:06:06+00:006.787val37892027-07-072027-07-07 07:07:07+00:007.89 \ No newline at end of file +1val11232021-01-012021-01-01T01:01:01+00:001.232val12342022-02-022022-02-02T02:02:02+00:002.343val23452023-03-032023-03-03T03:03:03+00:003.454val24562024-04-042024-04-04T04:04:04+00:004.565val35672025-05-052025-05-05T05:05:05+00:005.676val36782026-06-062026-06-06T06:06:06+00:006.787val37892027-07-072027-07-07T07:07:07+00:007.89 \ No newline at end of file diff --git a/tests/resources/file_df_connection/xml/with_compression/file.xml.gz b/tests/resources/file_df_connection/xml/with_compression/file.xml.gz index 9d255211fe04d2d530721df454470dd1e577d301..aefbf24afd3c28d1bc2bc81979e4a425683aca82 100644 GIT binary patch literal 294 zcmV+>0onc^iwFpGxg%u)|7K}yWiEJaYyg$i?`py@6b0}H8cA>SS3~Gif15C|PY5}CnX>mx7%cMi?a42mEhhl)K zIZ7>naycyy#c`Q*=naQ_vCEgmgDl4qsxE4zyBQ8Cxrz701cLmt^fc4 literal 294 zcmV+>0onc^iwFpq)gEO6|7K}yWiEJaYyg$i?`py@6b0~C$#+S5o4+oEzRDE13Z4UYKHS=8K1)fm)AGxJ05~Rhuq_ad1E%IEH3rJ(v^bQ&WzwN{IJB{ZLvz5? 
z9HquU`1val11232021-01-012021-01-01 01:01:01+00:001.232val12342022-02-022022-02-02 02:02:02+00:002.343val23452023-03-032023-03-03 03:03:03+00:003.454val24562024-04-042024-04-04 04:04:04+00:004.565val35672025-05-052025-05-05 05:05:05+00:005.676val36782026-06-062026-06-06 06:06:06+00:006.787val37892027-07-072027-07-07 07:07:07+00:007.89 \ No newline at end of file +1val11232021-01-012021-01-01T01:01:01+00:001.232val12342022-02-022022-02-02T02:02:02+00:002.343val23452023-03-032023-03-03T03:03:03+00:003.454val24562024-04-042024-04-04T04:04:04+00:004.565val35672025-05-052025-05-05T05:05:05+00:005.676val36782026-06-062026-06-06T06:06:06+00:006.787val37892027-07-072027-07-07T07:07:07+00:007.89 \ No newline at end of file diff --git a/tests/tests_integration/test_file_format_integration/test_xml_integration.py b/tests/tests_integration/test_file_format_integration/test_xml_integration.py index 39f3245f9..d03a6f61d 100644 --- a/tests/tests_integration/test_file_format_integration/test_xml_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_xml_integration.py @@ -29,9 +29,9 @@ def expected_xml_attributes_df(file_df_dataframe): @pytest.mark.parametrize( "path, options", [ - ("without_compression", {"rowTag": "item", "timestampFormat": "yyyy-MM-dd HH:mm:ssXXX"}), - ("with_compression", {"rowTag": "item", "timestampFormat": "yyyy-MM-dd HH:mm:ssXXX", "compression": "gzip"}), - ("with_attributes", {"rowTag": "item", "timestampFormat": "yyyy-MM-dd HH:mm:ssXXX", "attributePrefix": "_"}), + ("without_compression", {"rowTag": "item"}), + ("with_compression", {"rowTag": "item", "compression": "gzip"}), + ("with_attributes", {"rowTag": "item", "attributePrefix": "_"}), ], ids=["without_compression", "with_compression", "with_attributes"], ) @@ -80,7 +80,7 @@ def test_xml_reader_with_infer_schema( reader = FileDFReader( connection=file_df_connection, - format=XML(rowTag="item", inferSchema=True, timestampFormat="yyyy-MM-dd HH:mm:ssXXX"), + 
format=XML(rowTag="item", inferSchema=True), source_path=xml_root, ) read_df = reader.run() @@ -139,7 +139,7 @@ def test_xml_writer( @pytest.mark.parametrize( "options", [ - {"rowTag": "item", "timestampFormat": "yyyy-MM-dd HH:mm:ssXXX", "attributePrefix": "_"}, + {"rowTag": "item", "attributePrefix": "_"}, ], ids=["read_attributes"], )