
Commit 28309a4

feat: Add experimental support for Apache Spark 3.5.1 (apache#587)
* add profile
* fix for LegacyBehaviorPolicy
* fix 3.5 ShimCometScanExec
* builds with Spark 3.5
* fix builds
* use correct parquet version
* make docs more explicit
* bug fix
* remove use of reflection
* fix
* fix 4.0 build
* allow different stability plans for 3.5
* copy approved plans from 3.x to 3.5
* regenerate golden files for 3.5
* enable CI test
* fix merge conflict
* remove unused imports
* Refine shim
* remove some uses of reflection
* refine shim
* remove unused code
* remove unused imports
* add isTimestampNTZType to 3.5 shim
* address feedback
* remove unused imports
* address feedback
1 parent a4b968e commit 28309a4

287 files changed, +58341 -38 lines changed
This is a large commit, so some content (including some file paths) is hidden by default.

.github/workflows/pr_build.yml

Lines changed: 5 additions & 5 deletions
@@ -46,7 +46,7 @@ jobs:
       os: [ubuntu-latest]
       java_version: [8, 11, 17]
       test-target: [rust, java]
-      spark-version: ['3.4']
+      spark-version: ['3.5']
       scala-version: ['2.12', '2.13']
       is_push_event:
         - ${{ github.event_name == 'push' }}
@@ -109,7 +109,7 @@ jobs:
       os: [ubuntu-latest]
       java_version: [8, 11, 17]
       test-target: [java]
-      spark-version: ['3.3']
+      spark-version: ['3.3', '3.4']
       scala-version: ['2.12', '2.13']
     fail-fast: false
     name: ${{ matrix.os }}/java ${{ matrix.java_version }}-spark-${{matrix.spark-version}}-scala-${{matrix.scala-version}}/${{ matrix.test-target }}
@@ -134,7 +134,7 @@ jobs:
       os: [macos-13]
       java_version: [8, 11, 17]
       test-target: [rust, java]
-      spark-version: ['3.4']
+      spark-version: ['3.4', '3.5']
       scala-version: ['2.12', '2.13']
     fail-fast: false
     if: github.event_name == 'push'
@@ -161,7 +161,7 @@ jobs:
     matrix:
       java_version: [8, 11, 17]
       test-target: [rust, java]
-      spark-version: ['3.4']
+      spark-version: ['3.4', '3.5']
       scala-version: ['2.12', '2.13']
       is_push_event:
         - ${{ github.event_name == 'push' }}
@@ -247,7 +247,7 @@ jobs:
     matrix:
       java_version: [8, 17]
       test-target: [java]
-      spark-version: ['3.3']
+      spark-version: ['3.3', '3.4']
       scala-version: ['2.12', '2.13']
       exclude:
         - java_version: 8

common/pom.xml

Lines changed: 1 addition & 0 deletions
@@ -181,6 +181,7 @@ under the License.
   <sources>
     <source>src/main/${shims.majorVerSrc}</source>
     <source>src/main/${shims.minorVerSrc}</source>
+    <source>src/main/${shims.pre35Src}</source>
   </sources>
 </configuration>
</execution>

common/src/main/spark-3.x/org/apache/comet/shims/ShimBatchReader.scala renamed to common/src/main/spark-3.3/org/apache/comet/shims/ShimBatchReader.scala

Lines changed: 8 additions & 26 deletions
@@ -24,31 +24,13 @@ import org.apache.spark.sql.execution.datasources.PartitionedFile

 object ShimBatchReader {

-  // TODO: remove after dropping Spark 3.3 support and directly call PartitionedFile
   def newPartitionedFile(partitionValues: InternalRow, file: String): PartitionedFile =
-    classOf[PartitionedFile].getDeclaredConstructors
-      .map(c =>
-        c.getParameterCount match {
-          case 5 =>
-            c.newInstance(
-              partitionValues,
-              file,
-              Long.box(-1), // -1 means we read the entire file
-              Long.box(-1),
-              Array.empty[String])
-          case 7 =>
-            c.newInstance(
-              partitionValues,
-              c.getParameterTypes()(1)
-                .getConstructor(classOf[String])
-                .newInstance(file)
-                .asInstanceOf[AnyRef],
-              Long.box(-1), // -1 means we read the entire file
-              Long.box(-1),
-              Array.empty[String],
-              Long.box(0),
-              Long.box(0))
-        })
-      .head
-      .asInstanceOf[PartitionedFile]
+    PartitionedFile(
+      partitionValues,
+      file,
+      -1, // -1 means we read the entire file
+      -1,
+      Array.empty[String],
+      0,
+      0)
 }
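The reflective constructor lookup above, which probed for the 5-argument and 7-argument PartitionedFile constructors at runtime, is replaced with a direct call because each per-version source tree now targets exactly one PartitionedFile signature. A minimal sketch of a version-agnostic call site (hypothetical, not part of this commit; the file path is made up):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.comet.shims.ShimBatchReader

// Whichever shim source tree was compiled in, the call site stays the same.
val file = ShimBatchReader.newPartitionedFile(InternalRow.empty, "/tmp/example.parquet")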
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.shims
+
+import org.apache.spark.paths.SparkPath
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.datasources.PartitionedFile
+
+object ShimBatchReader {
+
+  def newPartitionedFile(partitionValues: InternalRow, file: String): PartitionedFile =
+    PartitionedFile(
+      partitionValues,
+      SparkPath.fromPathString(file),
+      -1, // -1 means we read the entire file
+      -1,
+      Array.empty[String],
+      0,
+      0)
+}
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.shims
+
+import org.apache.spark.paths.SparkPath
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.datasources.PartitionedFile
+
+object ShimBatchReader {
+
+  def newPartitionedFile(partitionValues: InternalRow, file: String): PartitionedFile =
+    PartitionedFile(
+      partitionValues,
+      SparkPath.fromPathString(file),
+      -1, // -1 means we read the entire file
+      -1,
+      Array.empty[String],
+      0,
+      0,
+      Map.empty)
+}
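The two new ShimBatchReader variants above differ only in the arguments they pass: both wrap the path with SparkPath.fromPathString, and the second adds a trailing Map.empty, presumably matching the extra metadata-columns parameter that PartitionedFile gained in Spark 3.5. A small sanity-check sketch under those assumptions (hypothetical test code, not from this commit):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.comet.shims.ShimBatchReader

// The shim always requests the whole file: start and length are both -1.
val pf = ShimBatchReader.newPartitionedFile(InternalRow.empty, "/tmp/example.parquet")
assert(pf.start == -1L && pf.length == -1L)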

docs/source/user-guide/overview.md

Lines changed: 13 additions & 1 deletion
@@ -40,7 +40,19 @@ The following diagram illustrates the architecture of Comet:

 ## Current Status

-The project is currently integrated into Apache Spark 3.3, and 3.4.
+Comet currently supports the following versions of Apache Spark:
+
+- 3.3.x
+- 3.4.x
+
+Experimental support is provided for the following versions of Apache Spark and is intended for development/testing
+use only and should not be used in production yet.
+
+- 3.5.x
+- 4.0.0-preview1
+
+Note that Comet may not fully work with proprietary forks of Apache Spark such as the Spark versions offered by
+Cloud Service Providers.

 ## Feature Parity with Apache Spark

pom.xml

Lines changed: 20 additions & 0 deletions
@@ -91,8 +91,11 @@ under the License.
     <argLine>-ea -Xmx4g -Xss4m ${extraJavaTestArgs}</argLine>
     <additional.3_3.test.source>spark-3.3-plus</additional.3_3.test.source>
     <additional.3_4.test.source>spark-3.4-plus</additional.3_4.test.source>
+    <additional.3_5.test.source>not-needed</additional.3_5.test.source>
+    <additional.pre35.test.source>spark-pre-3.5</additional.pre35.test.source>
     <shims.majorVerSrc>spark-3.x</shims.majorVerSrc>
     <shims.minorVerSrc>spark-3.4</shims.minorVerSrc>
+    <shims.pre35Src>spark-pre-3.5</shims.pre35Src>
   </properties>

   <dependencyManagement>
@@ -547,6 +550,21 @@ under the License.
       </properties>
     </profile>

+    <profile>
+      <!-- FIXME: this is WIP. Tests may fail -->
+      <id>spark-3.5</id>
+      <properties>
+        <scala.version>2.12.18</scala.version>
+        <spark.version>3.5.1</spark.version>
+        <spark.version.short>3.5</spark.version.short>
+        <parquet.version>1.13.1</parquet.version>
+        <shims.minorVerSrc>spark-3.5</shims.minorVerSrc>
+        <shims.pre35Src>not-needed</shims.pre35Src>
+        <additional.pre35.test.source>not-needed</additional.pre35.test.source>
+        <additional.3_5.test.source>spark-3.5</additional.3_5.test.source>
+      </properties>
+    </profile>
+
     <profile>
       <!-- FIXME: this is WIP. Tests may fail https://github.com/apache/datafusion-comet/issues/551 -->
       <id>spark-4.0</id>
@@ -561,6 +579,8 @@ under the License.
        <slf4j.version>2.0.13</slf4j.version>
        <shims.majorVerSrc>spark-4.0</shims.majorVerSrc>
        <shims.minorVerSrc>not-needed-yet</shims.minorVerSrc>
+       <shims.pre35Src>not-needed</shims.pre35Src>
+       <additional.pre35.test.source>not-needed</additional.pre35.test.source>
        <!-- Use jdk17 by default -->
        <java.version>17</java.version>
        <maven.compiler.source>${java.version}</maven.compiler.source>

spark/pom.xml

Lines changed: 3 additions & 0 deletions
@@ -252,6 +252,8 @@ under the License.
   <sources>
     <source>src/test/${additional.3_3.test.source}</source>
     <source>src/test/${additional.3_4.test.source}</source>
+    <source>src/test/${additional.3_5.test.source}</source>
+    <source>src/test/${additional.pre35.test.source}</source>
     <source>src/test/${shims.majorVerSrc}</source>
     <source>src/test/${shims.minorVerSrc}</source>
   </sources>
@@ -267,6 +269,7 @@ under the License.
   <sources>
     <source>src/main/${shims.majorVerSrc}</source>
     <source>src/main/${shims.minorVerSrc}</source>
+    <source>src/main/${shims.pre35Src}</source>
   </sources>
 </configuration>
</execution>

spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala

Lines changed: 4 additions & 0 deletions
@@ -1025,6 +1025,10 @@ object CometSparkSessionExtensions extends Logging {
     org.apache.spark.SPARK_VERSION >= "3.4"
   }

+  def isSpark35Plus: Boolean = {
+    org.apache.spark.SPARK_VERSION >= "3.5"
+  }
+
   def isSpark40Plus: Boolean = {
     org.apache.spark.SPARK_VERSION >= "4.0"
   }
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.comet.shims
+
+import org.apache.comet.expressions.CometEvalMode
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types.{DataType, TimestampNTZType}
+
+/**
+ * `CometExprShim` acts as a shim for parsing expressions from different Spark versions.
+ */
+trait CometExprShim {
+  /**
+   * Returns a tuple of expressions for the `unhex` function.
+   */
+  protected def unhexSerde(unhex: Unhex): (Expression, Expression) = {
+    (unhex.child, Literal(unhex.failOnError))
+  }
+
+  protected def isTimestampNTZType(dt: DataType): Boolean = dt match {
+    case _: TimestampNTZType => true
+    case _ => false
+  }
+
+  protected def evalMode(c: Cast): CometEvalMode.Value =
+    CometEvalModeUtil.fromSparkEvalMode(c.evalMode)
+}
+
+object CometEvalModeUtil {
+  def fromSparkEvalMode(evalMode: EvalMode.Value): CometEvalMode.Value = evalMode match {
+    case EvalMode.LEGACY => CometEvalMode.LEGACY
+    case EvalMode.TRY => CometEvalMode.TRY
+    case EvalMode.ANSI => CometEvalMode.ANSI
+  }
+}
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.shims
+
+import org.apache.spark.sql.internal.LegacyBehaviorPolicy
+import org.apache.spark.sql.internal.SQLConf
+
+trait ShimSQLConf {
+
+  /**
+   * Spark 3.4 renamed parquetFilterPushDownStringStartWith to
+   * parquetFilterPushDownStringPredicate
+   */
+  protected def getPushDownStringPredicate(sqlConf: SQLConf): Boolean =
+    sqlConf.parquetFilterPushDownStringPredicate
+
+  protected val LEGACY = LegacyBehaviorPolicy.LEGACY
+  protected val CORRECTED = LegacyBehaviorPolicy.CORRECTED
+}
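The doc comment explains why this lives in a shim: Spark 3.4 renamed the SQLConf getter parquetFilterPushDownStringStartWith to parquetFilterPushDownStringPredicate. A sketch of what a pre-3.4 counterpart could look like, shown only as an illustration under that assumption (the trait name and the older getter are assumed, not code from this commit):

import org.apache.spark.sql.internal.SQLConf

// Hypothetical pre-3.4 variant: same shim method name, older SQLConf getter.
trait ShimSQLConfPre34 {
  protected def getPushDownStringPredicate(sqlConf: SQLConf): Boolean =
    sqlConf.parquetFilterPushDownStringStartWith
}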
