Commit

[DOP-11365] Add explicit order_by to dataframe assertion
dolfinus committed Dec 12, 2023
1 parent cec6c49 commit 8ec95e0
Showing 21 changed files with 43 additions and 63 deletions.
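The change replaces order-sensitive dataframe comparisons with an explicit order_by column, so the assertions no longer depend on whatever row order Spark happens to return. The helper itself is not part of this diff; below is a minimal pandas-based sketch of what an order_by-aware assertion could look like, assuming both sides are already pandas dataframes (the sort-then-compare approach and the signature are assumptions, not the project's actual implementation):

import pandas as pd

def assert_equal_df(left, right, order_by=None):
    # Sketch only: sort both frames by the same column so that row order does not
    # affect the comparison, then delegate to pandas' own equality check.
    if order_by is not None:
        left = left.sort_values(order_by).reset_index(drop=True)
        right = right.sort_values(order_by).reset_index(drop=True)
    pd.testing.assert_frame_equal(left, right, check_dtype=False)

With order_by="id", the dataframe written by a test and the dataframe read back are sorted by the same key before being compared.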

@@ -13,8 +13,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]
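The guard above no longer passes silently when an optional dependency is missing: previously the ImportError was swallowed, assert_equal_df stayed undefined, and affected tests could only fail later with a NameError. Skipping the whole module up front makes the missing dependency explicit. The pattern in isolation (the pandas import shown here is illustrative):

import pytest

try:
    import pandas  # optional dependency; may be absent when only file-connection tests are run
except ImportError:
    # Skip every test in this module instead of failing later with a NameError.
    pytest.skip("Missing pandas", allow_module_level=True)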

@@ -72,7 +71,7 @@ def test_avro_reader(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -116,4 +115,4 @@ def test_avro_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -16,8 +16,7 @@
from tests.util.assert_df import assert_equal_df
from tests.util.spark_df import reset_column_names
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas or pyspark", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -56,7 +55,7 @@ def test_csv_reader_with_infer_schema(

assert read_df.schema != df.schema
assert read_df.schema == expected_df.schema
-assert_equal_df(read_df, expected_df)
+assert_equal_df(read_df, expected_df, order_by="id")


@pytest.mark.parametrize(
@@ -89,7 +88,7 @@ def test_csv_reader_with_options(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -131,4 +130,4 @@ def test_csv_writer_with_options(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -16,8 +16,7 @@
from tests.util.assert_df import assert_equal_df
from tests.util.spark_df import reset_column_names
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas or pyspark", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -56,7 +55,7 @@ def test_excel_reader_with_infer_schema(

assert read_df.schema != df.schema
assert read_df.schema == expected_df.schema
-assert_equal_df(read_df, expected_df)
+assert_equal_df(read_df, expected_df, order_by="id")


@pytest.mark.parametrize("format", ["xlsx", "xls"])
@@ -96,7 +95,7 @@ def test_excel_reader_with_options(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -139,4 +138,4 @@ def test_excel_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -12,8 +12,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -47,7 +46,7 @@ def test_json_reader(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


def test_json_writer_is_not_supported(

@@ -12,8 +12,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -47,7 +46,7 @@ def test_jsonline_reader(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -85,4 +84,4 @@ def test_jsonline_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -12,8 +12,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -47,7 +46,7 @@ def test_orc_reader(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -85,4 +84,4 @@ def test_orc_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -12,8 +12,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -47,7 +46,7 @@ def test_parquet_reader(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -85,4 +84,4 @@ def test_parquet_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -13,8 +13,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -60,7 +59,7 @@ def test_xml_reader(
read_df = reader.run()
assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


def test_xml_reader_with_infer_schema(
@@ -90,7 +89,7 @@ def test_xml_reader_with_infer_schema(
assert set(read_df.columns) == set(
expected_xml_attributes_df.columns,
) # "DataFrames have different column types: StructField('id', IntegerType(), True), StructField('id', LongType(), True), etc."
-assert_equal_df(read_df, expected_xml_attributes_df)
+assert_equal_df(read_df, expected_xml_attributes_df, order_by="id")


@pytest.mark.parametrize(
@@ -133,7 +132,7 @@ def test_xml_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -166,4 +165,4 @@ def test_xml_reader_with_attributes(
read_df = reader.run()
assert read_df.count()
assert read_df.schema == expected_xml_attributes_df.schema
-assert_equal_df(read_df, expected_xml_attributes_df)
+assert_equal_df(read_df, expected_xml_attributes_df, order_by="id")

@@ -28,8 +28,7 @@

from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas or pyspark", allow_module_level=True)


def test_file_df_reader_run(

@@ -15,8 +15,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)


@pytest.mark.parametrize(

@@ -6,8 +6,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Clickhouse


@@ -5,8 +5,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Greenplum


@@ -7,8 +7,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Hive


@@ -5,8 +5,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import MSSQL


@@ -5,8 +5,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import MySQL


@@ -6,8 +6,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Oracle


@@ -5,8 +5,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Postgres


@@ -12,8 +12,7 @@

from tests.util.to_pandas import to_pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from etl_entities.hwm_store import HWMStoreStackManager


@@ -12,8 +12,7 @@

from tests.util.to_pandas import to_pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Postgres
from onetl.db import DBReader

@@ -123,7 +123,7 @@ def test_mongodb_strategy_incremental_batch(
# same behavior as SnapshotBatchStrategy, no rows skipped
if "int" in hwm_column:
# only changed data has been read
-processing.assert_equal_df(df=first_df, other_frame=first_span, order_by="id_int")
+processing.assert_equal_df(df=first_df, other_frame=first_span, order_by="_id")
else:
# date and datetime values have a random part
# so instead of checking the whole dataframe a partial comparison should be performed
@@ -172,7 +172,7 @@ def test_mongodb_strategy_incremental_batch(

if "int" in hwm_column:
# only changed data has been read
-processing.assert_equal_df(df=second_df, other_frame=second_span, order_by="id_int")
+processing.assert_equal_df(df=second_df, other_frame=second_span, order_by="_id")
else:
# date and datetime values have a random part
# so instead of checking the whole dataframe a partial comparison should be performed
@@ -224,7 +224,7 @@ def test_mongodb_strategy_incremental_batch_where(spark, processing, prepare_sch
else:
first_df = first_df.union(next_df)

-processing.assert_equal_df(df=first_df, other_frame=first_span[:51], order_by="id_int")
+processing.assert_equal_df(df=first_df, other_frame=first_span[:51], order_by="_id")

# insert second span
processing.insert_data(
@@ -243,4 +243,4 @@
else:
second_df = second_df.union(next_df)

-processing.assert_equal_df(df=second_df, other_frame=second_span[:19], order_by="id_int")
+processing.assert_equal_df(df=second_df, other_frame=second_span[:19], order_by="_id")
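In the MongoDB tests the sort key changes from the test-specific "id_int" column to MongoDB's "_id" field, which is unique for every document and therefore usable as an ordering key regardless of which hwm_column a test is parametrized with; for date and datetime columns the partial comparison described in the comments above is kept, since those values carry a random component. As a general sketch of an order-insensitive comparison on the Spark side (an assumption for illustration; the project's processing.assert_equal_df helper is not shown in this diff):

from pyspark.sql import DataFrame

def collect_sorted(df: DataFrame, order_by: str):
    # Impose a deterministic order before collecting, so two frames holding the
    # same rows compare equal regardless of partitioning.
    return [row.asDict() for row in df.orderBy(order_by).collect()]

def assert_same_rows(left: DataFrame, right: DataFrame, order_by: str) -> None:
    assert collect_sorted(left, order_by) == collect_sorted(right, order_by)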

@@ -11,8 +11,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Postgres
from onetl.db import DBReader