Commit 633419a

[SPARK-51989][PYTHON] Add missing Filter subclasses to __all__ list in datasource
### What changes were proposed in this pull request?

This PR adds the missing Filter subclasses to the __all__ list in pyspark.sql.datasource.

### Why are the changes needed?

To improve Python data source filter pushdown.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

### Was this patch authored or co-authored using generative AI tooling?

Closes #50782 from allisonwang-db/spark-51989-missing-filter.

Authored-by: Allison Wang <[email protected]>
Signed-off-by: Allison Wang <[email protected]>
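As background, __all__ controls which names a wildcard import exposes, which is why the missing entries mattered. The sketch below (standard library only, using a hypothetical stand-in module rather than the real pyspark.sql.datasource) shows that a class omitted from __all__ is invisible to `import *`:

```python
# Minimal sketch of why __all__ matters for these filter classes:
# names omitted from __all__ are skipped by wildcard imports, so
# "from pyspark.sql.datasource import *" would previously miss
# subclasses such as GreaterThan. "fake_datasource" is a stand-in.
import sys
import types

mod = types.ModuleType("fake_datasource")
exec(
    "class EqualTo: pass\n"
    "class GreaterThan: pass\n"
    "__all__ = ['EqualTo']  # GreaterThan accidentally omitted\n",
    mod.__dict__,
)
sys.modules["fake_datasource"] = mod  # register so import machinery finds it

ns: dict = {}
exec("from fake_datasource import *", ns)
print("EqualTo" in ns)      # True: listed in __all__
print("GreaterThan" in ns)  # False: omitted, hidden from wildcard import
```

Explicit imports (`from pyspark.sql.datasource import GreaterThan`) worked even before this fix; only star imports and tools that consult __all__ were affected.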
1 parent 30cf5b4 commit 633419a

File tree

1 file changed: +13 −1 lines


python/pyspark/sql/datasource.py

Lines changed: 13 additions & 1 deletion
@@ -53,6 +53,18 @@
     "WriterCommitMessage",
     "Filter",
     "EqualTo",
+    "EqualNullSafe",
+    "GreaterThan",
+    "GreaterThanOrEqual",
+    "LessThan",
+    "LessThanOrEqual",
+    "In",
+    "IsNull",
+    "IsNotNull",
+    "Not",
+    "StringStartsWith",
+    "StringEndsWith",
+    "StringContains",
 ]

@@ -966,7 +978,7 @@ def abort(self, messages: List[Optional["WriterCommitMessage"]]) -> None:

 class DataSourceArrowWriter(DataSourceWriter):
     """
-    A base class for data source writers that process data using PyArrows `RecordBatch`.
+    A base class for data source writers that process data using PyArrow's `RecordBatch`.

     Unlike :class:`DataSourceWriter`, which works with an iterator of Spark Rows, this class
     is optimized for using the Arrow format when writing data. It can offer better performance
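To illustrate the filter-pushdown handshake these exported classes participate in, here is a hedged, self-contained sketch. The dataclasses and the `TinyReader.push_filters` method below are simplified stand-ins, not the real pyspark definitions; the sketch assumes the common pushdown contract in which a reader returns the filters it cannot handle so the engine can re-apply them:

```python
# Sketch of a filter-pushdown negotiation, under the assumption that a
# reader accepts a list of filter objects and returns the unsupported
# ones. EqualTo/IsNotNull here are illustrative stand-ins for the
# classes exported from pyspark.sql.datasource.
from dataclasses import dataclass
from typing import Any, Iterable, List, Tuple


@dataclass(frozen=True)
class EqualTo:
    attribute: Tuple[str, ...]
    value: Any


@dataclass(frozen=True)
class IsNotNull:
    attribute: Tuple[str, ...]


class TinyReader:
    """Hypothetical reader that can only push down EqualTo on 'id'."""

    def __init__(self) -> None:
        self.pushed: List[Any] = []

    def push_filters(self, filters: Iterable[Any]) -> List[Any]:
        unsupported = []
        for f in filters:
            if isinstance(f, EqualTo) and f.attribute == ("id",):
                self.pushed.append(f)  # evaluated at the source
            else:
                unsupported.append(f)  # engine re-applies these
        return unsupported


reader = TinyReader()
leftover = reader.push_filters([EqualTo(("id",), 1), IsNotNull(("name",))])
print(len(reader.pushed))  # 1: the EqualTo on 'id' was pushed down
print(len(leftover))       # 1: IsNotNull is left for the engine
```

Exporting the concrete subclasses matters for this pattern because user code matches on them (e.g. `isinstance(f, EqualTo)`), so they need to be first-class, discoverable names in the module's public API.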
