Skip to content

Commit

Permalink
[DOP-9645] - update tests, documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
maxim-lixakov committed Oct 9, 2023
1 parent 618786d commit cef7f19
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 12 deletions.
8 changes: 7 additions & 1 deletion onetl/file/format/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,11 @@ class XML(ReadWriteFileFormat):
The set of supported options depends on Spark version. See link above.
**Important**: When **reading** files with timestamps, it may be necessary to specify the ``timestampFormat`` option to ensure dates are parsed correctly. Without it, date parsing may return ``null`` values. Example: ``timestampFormat="yyyy-MM-dd HH:mm:ssXXX"``.
.. warning::
When reading or writing files that contain timestamps, it may be necessary to specify the ``timestampFormat`` option to
ensure that dates are parsed correctly. Without it, date parsing may return ``null`` values.
Example: ``timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSXXX"`` (the literal ``T`` must be wrapped in single quotes).
Examples
--------
Expand All @@ -134,6 +138,8 @@ class XML(ReadWriteFileFormat):
name: ClassVar[str] = "xml"

row_tag: str = Field(alias="rowTag")
# Unable to use default timestamp_format due to a source code bug causing an UnsupportedTemporalTypeException (Unsupported field: <field_name>).
# timestamp_format: str = Field(default="yyyy-MM-ddTHH:mm:ss.SSSXXX", alias="timestampFormat")

class Config:
known_options = READ_OPTIONS | WRITE_OPTIONS
Expand Down
19 changes: 15 additions & 4 deletions tests/resources/file_df_connection/generate_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,10 @@ def save_as_xml_plain(data: list[dict], path: Path) -> None:
item = ElementTree.SubElement(root, "item")
for key, value in record.items():
child = ElementTree.SubElement(item, key)
child.text = str(value)
if isinstance(value, datetime):
child.text = value.isoformat()
else:
child.text = str(value)

tree = ElementTree.ElementTree(root)
tree.write(path / "file.xml")
Expand All @@ -492,11 +495,16 @@ def save_as_xml_with_attributes(data: list[dict], path: Path) -> None:
root = ElementTree.Element("root")

for record in data:
str_attributes = {key: str(value) for key, value in record.items()}
str_attributes = {
key: value.isoformat() if isinstance(value, datetime) else str(value) for key, value in record.items()
}
item = ElementTree.SubElement(root, "item", attrib=str_attributes)
for key, value in record.items():
child = ElementTree.SubElement(item, key)
child.text = str(value)
if isinstance(value, datetime):
child.text = value.isoformat()
else:
child.text = str(value)

tree = ElementTree.ElementTree(root)
tree.write(str(path / "file_with_attributes.xml"))
Expand All @@ -510,7 +518,10 @@ def save_as_xml_gz(data: list[dict], path: Path) -> None:
item = ElementTree.SubElement(root, "item")
for key, value in record.items():
child = ElementTree.SubElement(item, key)
child.text = str(value)
if isinstance(value, datetime):
child.text = value.isoformat()
else:
child.text = str(value)

ElementTree.ElementTree(root)
xml_string = ElementTree.tostring(root, encoding="utf-8")
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<root><item id="1" str_value="val1" int_value="123" date_value="2021-01-01" datetime_value="2021-01-01 01:01:01+00:00" float_value="1.23"><id>1</id><str_value>val1</str_value><int_value>123</int_value><date_value>2021-01-01</date_value><datetime_value>2021-01-01 01:01:01+00:00</datetime_value><float_value>1.23</float_value></item><item id="2" str_value="val1" int_value="234" date_value="2022-02-02" datetime_value="2022-02-02 02:02:02+00:00" float_value="2.34"><id>2</id><str_value>val1</str_value><int_value>234</int_value><date_value>2022-02-02</date_value><datetime_value>2022-02-02 02:02:02+00:00</datetime_value><float_value>2.34</float_value></item><item id="3" str_value="val2" int_value="345" date_value="2023-03-03" datetime_value="2023-03-03 03:03:03+00:00" float_value="3.45"><id>3</id><str_value>val2</str_value><int_value>345</int_value><date_value>2023-03-03</date_value><datetime_value>2023-03-03 03:03:03+00:00</datetime_value><float_value>3.45</float_value></item><item id="4" str_value="val2" int_value="456" date_value="2024-04-04" datetime_value="2024-04-04 04:04:04+00:00" float_value="4.56"><id>4</id><str_value>val2</str_value><int_value>456</int_value><date_value>2024-04-04</date_value><datetime_value>2024-04-04 04:04:04+00:00</datetime_value><float_value>4.56</float_value></item><item id="5" str_value="val3" int_value="567" date_value="2025-05-05" datetime_value="2025-05-05 05:05:05+00:00" float_value="5.67"><id>5</id><str_value>val3</str_value><int_value>567</int_value><date_value>2025-05-05</date_value><datetime_value>2025-05-05 05:05:05+00:00</datetime_value><float_value>5.67</float_value></item><item id="6" str_value="val3" int_value="678" date_value="2026-06-06" datetime_value="2026-06-06 06:06:06+00:00" float_value="6.78"><id>6</id><str_value>val3</str_value><int_value>678</int_value><date_value>2026-06-06</date_value><datetime_value>2026-06-06 06:06:06+00:00</datetime_value><float_value>6.78</float_value></item><item id="7" str_value="val3" 
int_value="789" date_value="2027-07-07" datetime_value="2027-07-07 07:07:07+00:00" float_value="7.89"><id>7</id><str_value>val3</str_value><int_value>789</int_value><date_value>2027-07-07</date_value><datetime_value>2027-07-07 07:07:07+00:00</datetime_value><float_value>7.89</float_value></item></root>
<root><item id="1" str_value="val1" int_value="123" date_value="2021-01-01" datetime_value="2021-01-01T01:01:01+00:00" float_value="1.23"><id>1</id><str_value>val1</str_value><int_value>123</int_value><date_value>2021-01-01</date_value><datetime_value>2021-01-01T01:01:01+00:00</datetime_value><float_value>1.23</float_value></item><item id="2" str_value="val1" int_value="234" date_value="2022-02-02" datetime_value="2022-02-02T02:02:02+00:00" float_value="2.34"><id>2</id><str_value>val1</str_value><int_value>234</int_value><date_value>2022-02-02</date_value><datetime_value>2022-02-02T02:02:02+00:00</datetime_value><float_value>2.34</float_value></item><item id="3" str_value="val2" int_value="345" date_value="2023-03-03" datetime_value="2023-03-03T03:03:03+00:00" float_value="3.45"><id>3</id><str_value>val2</str_value><int_value>345</int_value><date_value>2023-03-03</date_value><datetime_value>2023-03-03T03:03:03+00:00</datetime_value><float_value>3.45</float_value></item><item id="4" str_value="val2" int_value="456" date_value="2024-04-04" datetime_value="2024-04-04T04:04:04+00:00" float_value="4.56"><id>4</id><str_value>val2</str_value><int_value>456</int_value><date_value>2024-04-04</date_value><datetime_value>2024-04-04T04:04:04+00:00</datetime_value><float_value>4.56</float_value></item><item id="5" str_value="val3" int_value="567" date_value="2025-05-05" datetime_value="2025-05-05T05:05:05+00:00" float_value="5.67"><id>5</id><str_value>val3</str_value><int_value>567</int_value><date_value>2025-05-05</date_value><datetime_value>2025-05-05T05:05:05+00:00</datetime_value><float_value>5.67</float_value></item><item id="6" str_value="val3" int_value="678" date_value="2026-06-06" datetime_value="2026-06-06T06:06:06+00:00" float_value="6.78"><id>6</id><str_value>val3</str_value><int_value>678</int_value><date_value>2026-06-06</date_value><datetime_value>2026-06-06T06:06:06+00:00</datetime_value><float_value>6.78</float_value></item><item id="7" str_value="val3" 
int_value="789" date_value="2027-07-07" datetime_value="2027-07-07T07:07:07+00:00" float_value="7.89"><id>7</id><str_value>val3</str_value><int_value>789</int_value><date_value>2027-07-07</date_value><datetime_value>2027-07-07T07:07:07+00:00</datetime_value><float_value>7.89</float_value></item></root>
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<root><item><id>1</id><str_value>val1</str_value><int_value>123</int_value><date_value>2021-01-01</date_value><datetime_value>2021-01-01 01:01:01+00:00</datetime_value><float_value>1.23</float_value></item><item><id>2</id><str_value>val1</str_value><int_value>234</int_value><date_value>2022-02-02</date_value><datetime_value>2022-02-02 02:02:02+00:00</datetime_value><float_value>2.34</float_value></item><item><id>3</id><str_value>val2</str_value><int_value>345</int_value><date_value>2023-03-03</date_value><datetime_value>2023-03-03 03:03:03+00:00</datetime_value><float_value>3.45</float_value></item><item><id>4</id><str_value>val2</str_value><int_value>456</int_value><date_value>2024-04-04</date_value><datetime_value>2024-04-04 04:04:04+00:00</datetime_value><float_value>4.56</float_value></item><item><id>5</id><str_value>val3</str_value><int_value>567</int_value><date_value>2025-05-05</date_value><datetime_value>2025-05-05 05:05:05+00:00</datetime_value><float_value>5.67</float_value></item><item><id>6</id><str_value>val3</str_value><int_value>678</int_value><date_value>2026-06-06</date_value><datetime_value>2026-06-06 06:06:06+00:00</datetime_value><float_value>6.78</float_value></item><item><id>7</id><str_value>val3</str_value><int_value>789</int_value><date_value>2027-07-07</date_value><datetime_value>2027-07-07 07:07:07+00:00</datetime_value><float_value>7.89</float_value></item></root>
<root><item><id>1</id><str_value>val1</str_value><int_value>123</int_value><date_value>2021-01-01</date_value><datetime_value>2021-01-01T01:01:01+00:00</datetime_value><float_value>1.23</float_value></item><item><id>2</id><str_value>val1</str_value><int_value>234</int_value><date_value>2022-02-02</date_value><datetime_value>2022-02-02T02:02:02+00:00</datetime_value><float_value>2.34</float_value></item><item><id>3</id><str_value>val2</str_value><int_value>345</int_value><date_value>2023-03-03</date_value><datetime_value>2023-03-03T03:03:03+00:00</datetime_value><float_value>3.45</float_value></item><item><id>4</id><str_value>val2</str_value><int_value>456</int_value><date_value>2024-04-04</date_value><datetime_value>2024-04-04T04:04:04+00:00</datetime_value><float_value>4.56</float_value></item><item><id>5</id><str_value>val3</str_value><int_value>567</int_value><date_value>2025-05-05</date_value><datetime_value>2025-05-05T05:05:05+00:00</datetime_value><float_value>5.67</float_value></item><item><id>6</id><str_value>val3</str_value><int_value>678</int_value><date_value>2026-06-06</date_value><datetime_value>2026-06-06T06:06:06+00:00</datetime_value><float_value>6.78</float_value></item><item><id>7</id><str_value>val3</str_value><int_value>789</int_value><date_value>2027-07-07</date_value><datetime_value>2027-07-07T07:07:07+00:00</datetime_value><float_value>7.89</float_value></item></root>
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ def expected_xml_attributes_df(file_df_dataframe):
@pytest.mark.parametrize(
"path, options",
[
("without_compression", {"rowTag": "item", "timestampFormat": "yyyy-MM-dd HH:mm:ssXXX"}),
("with_compression", {"rowTag": "item", "timestampFormat": "yyyy-MM-dd HH:mm:ssXXX", "compression": "gzip"}),
("with_attributes", {"rowTag": "item", "timestampFormat": "yyyy-MM-dd HH:mm:ssXXX", "attributePrefix": "_"}),
("without_compression", {"rowTag": "item"}),
("with_compression", {"rowTag": "item", "compression": "gzip"}),
("with_attributes", {"rowTag": "item", "attributePrefix": "_"}),
],
ids=["without_compression", "with_compression", "with_attributes"],
)
Expand Down Expand Up @@ -80,7 +80,7 @@ def test_xml_reader_with_infer_schema(

reader = FileDFReader(
connection=file_df_connection,
format=XML(rowTag="item", inferSchema=True, timestampFormat="yyyy-MM-dd HH:mm:ssXXX"),
format=XML(rowTag="item", inferSchema=True),
source_path=xml_root,
)
read_df = reader.run()
Expand Down Expand Up @@ -139,7 +139,7 @@ def test_xml_writer(
@pytest.mark.parametrize(
"options",
[
{"rowTag": "item", "timestampFormat": "yyyy-MM-dd HH:mm:ssXXX", "attributePrefix": "_"},
{"rowTag": "item", "attributePrefix": "_"},
],
ids=["read_attributes"],
)
Expand Down

0 comments on commit cef7f19

Please sign in to comment.