Commit

[DOP-11365] Add explicit order_by to dataframe assertion
dolfinus committed Dec 12, 2023
1 parent cec6c49 commit 8ec95e0
Showing 21 changed files with 43 additions and 63 deletions.
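The change replaces order-sensitive dataframe comparisons with an explicit order_by column, so the assertions no longer depend on whatever row order Spark happens to return. The helper itself is not part of this diff; below is a minimal pandas-based sketch of what an order_by-aware assertion could look like, assuming both sides are already pandas dataframes (the sort-then-compare approach and the signature are assumptions, not the project's actual implementation):

import pandas as pd

def assert_equal_df(left, right, order_by=None):
    # Sketch only: sort both frames by the same column so that row order does not
    # affect the comparison, then delegate to pandas' own equality check.
    if order_by is not None:
        left = left.sort_values(order_by).reset_index(drop=True)
        right = right.sort_values(order_by).reset_index(drop=True)
    pd.testing.assert_frame_equal(left, right, check_dtype=False)

With order_by="id", the dataframe written by a test and the dataframe read back are sorted by the same key before being compared.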

@@ -13,8 +13,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]
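The guard above no longer passes silently when an optional dependency is missing: previously the ImportError was swallowed, assert_equal_df stayed undefined, and affected tests could only fail later with a NameError. Skipping the whole module up front makes the missing dependency explicit. The pattern in isolation (the pandas import shown here is illustrative):

import pytest

try:
    import pandas  # optional dependency; may be absent when only file-connection tests are run
except ImportError:
    # Skip every test in this module instead of failing later with a NameError.
    pytest.skip("Missing pandas", allow_module_level=True)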

@@ -72,7 +71,7 @@ def test_avro_reader(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -116,4 +115,4 @@ def test_avro_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -16,8 +16,7 @@
from tests.util.assert_df import assert_equal_df
from tests.util.spark_df import reset_column_names
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas or pyspark", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -56,7 +55,7 @@ def test_csv_reader_with_infer_schema(

assert read_df.schema != df.schema
assert read_df.schema == expected_df.schema
-assert_equal_df(read_df, expected_df)
+assert_equal_df(read_df, expected_df, order_by="id")


@pytest.mark.parametrize(
@@ -89,7 +88,7 @@ def test_csv_reader_with_options(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -131,4 +130,4 @@ def test_csv_writer_with_options(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -16,8 +16,7 @@
from tests.util.assert_df import assert_equal_df
from tests.util.spark_df import reset_column_names
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas or pyspark", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -56,7 +55,7 @@ def test_excel_reader_with_infer_schema(

assert read_df.schema != df.schema
assert read_df.schema == expected_df.schema
-assert_equal_df(read_df, expected_df)
+assert_equal_df(read_df, expected_df, order_by="id")


@pytest.mark.parametrize("format", ["xlsx", "xls"])
@@ -96,7 +95,7 @@ def test_excel_reader_with_options(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -139,4 +138,4 @@ def test_excel_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -12,8 +12,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -47,7 +46,7 @@ def test_json_reader(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


def test_json_writer_is_not_supported(

@@ -12,8 +12,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -47,7 +46,7 @@ def test_jsonline_reader(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -85,4 +84,4 @@ def test_jsonline_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -12,8 +12,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -47,7 +46,7 @@ def test_orc_reader(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -85,4 +84,4 @@ def test_orc_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -12,8 +12,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -47,7 +46,7 @@ def test_parquet_reader(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -85,4 +84,4 @@ def test_parquet_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")

@@ -13,8 +13,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]

@@ -60,7 +59,7 @@ def test_xml_reader(
read_df = reader.run()
assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


def test_xml_reader_with_infer_schema(
@@ -90,7 +89,7 @@ def test_xml_reader_with_infer_schema(
assert set(read_df.columns) == set(
expected_xml_attributes_df.columns,
) # "DataFrames have different column types: StructField('id', IntegerType(), True), StructField('id', LongType(), True), etc."
-assert_equal_df(read_df, expected_xml_attributes_df)
+assert_equal_df(read_df, expected_xml_attributes_df, order_by="id")


@pytest.mark.parametrize(
@@ -133,7 +132,7 @@ def test_xml_writer(

assert read_df.count()
assert read_df.schema == df.schema
-assert_equal_df(read_df, df)
+assert_equal_df(read_df, df, order_by="id")


@pytest.mark.parametrize(
@@ -166,4 +165,4 @@ def test_xml_reader_with_attributes(
read_df = reader.run()
assert read_df.count()
assert read_df.schema == expected_xml_attributes_df.schema
-assert_equal_df(read_df, expected_xml_attributes_df)
+assert_equal_df(read_df, expected_xml_attributes_df, order_by="id")

@@ -28,8 +28,7 @@

from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas or pyspark", allow_module_level=True)


def test_file_df_reader_run(

@@ -15,8 +15,7 @@
try:
from tests.util.assert_df import assert_equal_df
except ImportError:
-# pandas and spark can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)


@pytest.mark.parametrize(

@@ -6,8 +6,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Clickhouse


@@ -5,8 +5,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Greenplum


@@ -7,8 +7,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Hive


@@ -5,8 +5,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import MSSQL


@@ -5,8 +5,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import MySQL


@@ -6,8 +6,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Oracle


@@ -5,8 +5,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Postgres


@@ -12,8 +12,7 @@

from tests.util.to_pandas import to_pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from etl_entities.hwm_store import HWMStoreStackManager


@@ -12,8 +12,7 @@

from tests.util.to_pandas import to_pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Postgres
from onetl.db import DBReader

@@ -123,7 +123,7 @@ def test_mongodb_strategy_incremental_batch(
# same behavior as SnapshotBatchStrategy, no rows skipped
if "int" in hwm_column:
# only changed data has been read
-processing.assert_equal_df(df=first_df, other_frame=first_span, order_by="id_int")
+processing.assert_equal_df(df=first_df, other_frame=first_span, order_by="_id")
else:
# date and datetime values have a random part
# so instead of checking the whole dataframe a partial comparison should be performed
@@ -172,7 +172,7 @@ def test_mongodb_strategy_incremental_batch(

if "int" in hwm_column:
# only changed data has been read
-processing.assert_equal_df(df=second_df, other_frame=second_span, order_by="id_int")
+processing.assert_equal_df(df=second_df, other_frame=second_span, order_by="_id")
else:
# date and datetime values have a random part
# so instead of checking the whole dataframe a partial comparison should be performed
@@ -224,7 +224,7 @@ def test_mongodb_strategy_incremental_batch_where(spark, processing, prepare_sch
else:
first_df = first_df.union(next_df)

-processing.assert_equal_df(df=first_df, other_frame=first_span[:51], order_by="id_int")
+processing.assert_equal_df(df=first_df, other_frame=first_span[:51], order_by="_id")

# insert second span
processing.insert_data(
@@ -243,4 +243,4 @@
else:
second_df = second_df.union(next_df)

-processing.assert_equal_df(df=second_df, other_frame=second_span[:19], order_by="id_int")
+processing.assert_equal_df(df=second_df, other_frame=second_span[:19], order_by="_id")
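In the MongoDB tests the sort key changes from the test-specific "id_int" column to MongoDB's "_id" field, which is unique for every document and therefore usable as an ordering key regardless of which hwm_column a test is parametrized with; for date and datetime columns the partial comparison described in the comments above is kept, since those values carry a random component. As a general sketch of an order-insensitive comparison on the Spark side (an assumption for illustration; the project's processing.assert_equal_df helper is not shown in this diff):

from pyspark.sql import DataFrame

def collect_sorted(df: DataFrame, order_by: str):
    # Impose a deterministic order before collecting, so two frames holding the
    # same rows compare equal regardless of partitioning.
    return [row.asDict() for row in df.orderBy(order_by).collect()]

def assert_same_rows(left: DataFrame, right: DataFrame, order_by: str) -> None:
    assert collect_sorted(left, order_by) == collect_sorted(right, order_by)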

@@ -11,8 +11,7 @@
try:
import pandas
except ImportError:
-# pandas can be missing if someone runs tests for file connections only
-pass
+pytest.skip("Missing pandas", allow_module_level=True)

from onetl.connection import Postgres
from onetl.db import DBReader