Skip to content

Commit f08e689

Browse files
committed
Bump to Scio v0.14.10 and Beam 2.61
1 parent efee98b commit f08e689

File tree

4 files changed

+180
-90
lines changed

4 files changed

+180
-90
lines changed

build.sbt

+67-19
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,63 @@
11
import sbt._
22
import Keys._
3-
val scioVersion = "0.10.4"
4-
val beamVersion = "2.30.0"
5-
lazy val commonSettings = Def.settings(
6-
organization := "dev.herraiz",
3+
import com.here.bom.Bom
4+
5+
val scioVersion = "0.14.10"
6+
val beamVersion = "2.61.0"
7+
8+
val guavaVersion = "33.1.0-jre"
9+
val jacksonVersion = "2.15.4"
10+
val magnolifyVersion = "0.7.4"
11+
val nettyVersion = "4.1.100.Final"
12+
val slf4jVersion = "1.7.30"
13+
val gcpLibrariesVersion = "26.45.0"
14+
15+
lazy val gcpBom = Bom(
16+
"com.google.cloud" % "libraries-bom" % gcpLibrariesVersion
17+
)
18+
lazy val beamBom = Bom("org.apache.beam" % "beam-sdks-java-bom" % beamVersion)
19+
lazy val guavaBom = Bom("com.google.guava" % "guava-bom" % guavaVersion)
20+
lazy val jacksonBom = Bom(
21+
"com.fasterxml.jackson" % "jackson-bom" % jacksonVersion
22+
)
23+
lazy val magnolifyBom = Bom("com.spotify" % "magnolify-bom" % magnolifyVersion)
24+
lazy val nettyBom = Bom("io.netty" % "netty-bom" % nettyVersion)
25+
lazy val scioBom = Bom("com.spotify" % "scio-bom" % scioVersion)
26+
27+
val bomSettings = Def.settings(
28+
gcpBom,
29+
beamBom,
30+
guavaBom,
31+
jacksonBom,
32+
magnolifyBom,
33+
nettyBom,
34+
dependencyOverrides ++=
35+
gcpBom.key.value.bomDependencies ++
36+
beamBom.key.value.bomDependencies ++
37+
guavaBom.key.value.bomDependencies ++
38+
jacksonBom.key.value.bomDependencies ++
39+
magnolifyBom.key.value.bomDependencies ++
40+
nettyBom.key.value.bomDependencies
41+
)
42+
43+
lazy val commonSettings = bomSettings ++ Def.settings(
44+
organization := "example",
745
// Semantic versioning http://semver.org/
846
version := "0.1.0-SNAPSHOT",
9-
scalaVersion := "2.13.16",
10-
scalacOptions ++= Seq("-target:jvm-1.8",
11-
"-deprecation",
12-
"-feature",
13-
"-unchecked",
14-
"-Ymacro-annotations"),
15-
javacOptions ++= Seq("-source", "1.8", "-target", "1.8")
47+
scalaVersion := "2.13.15",
48+
scalacOptions ++= Seq(
49+
"-release",
50+
"11",
51+
"-deprecation",
52+
"-feature",
53+
"-unchecked",
54+
"-Ymacro-annotations"
55+
),
56+
javacOptions ++= Seq("--release", "11"),
57+
// add the extra resolver and remove the exclude if you need Kafka
58+
// resolvers += "confluent" at "https://packages.confluent.io/maven/",
59+
excludeDependencies += "org.apache.beam" % "beam-sdks-java-io-kafka",
60+
excludeDependencies += "com.github.luben" % "zstd-jni"
1661
)
1762

1863
lazy val root: Project = project
@@ -22,19 +67,21 @@ lazy val root: Project = project
2267
name := "scio-scala-workshop-beam-summit",
2368
description := "scio-scala-workshop-beam-summit",
2469
publish / skip := true,
25-
run / classLoaderLayeringStrategy := ClassLoaderLayeringStrategy.Flat,
70+
fork := true,
71+
run / outputStrategy := Some(OutputStrategy.StdoutOutput),
2672
libraryDependencies ++= Seq(
2773
"com.spotify" %% "scio-core" % scioVersion,
2874
"com.spotify" %% "scio-google-cloud-platform" % scioVersion,
2975
"com.spotify" %% "scio-extra" % scioVersion,
3076
"com.spotify" %% "scio-test" % scioVersion % Test,
31-
"org.apache.beam" % "beam-runners-direct-java" % beamVersion,
32-
"org.apache.beam" % "beam-runners-google-cloud-dataflow-java" % beamVersion,
33-
"org.slf4j" % "slf4j-simple" % "2.0.16",
34-
"com.google.http-client" % "google-http-client-apache-v2" % "1.45.3"
77+
"org.slf4j" % "slf4j-api" % slf4jVersion,
78+
"org.apache.beam" % "beam-runners-google-cloud-dataflow-java" % beamVersion % Runtime,
79+
"org.apache.beam" % "beam-runners-direct-java" % beamVersion % Test,
80+
"com.spotify" %% "scio-test" % scioVersion % Test,
81+
"org.slf4j" % "slf4j-simple" % slf4jVersion % Test,
82+
"com.github.luben" % "zstd-jni" % "1.5.2-2"
3583
)
3684
)
37-
.enablePlugins(JavaAppPackaging)
3885

3986
lazy val repl: Project = project
4087
.in(file(".repl"))
@@ -46,8 +93,9 @@ lazy val repl: Project = project
4693
"com.spotify" %% "scio-repl" % scioVersion
4794
),
4895
Compile / mainClass := Some("com.spotify.scio.repl.ScioShell"),
49-
publish / skip := true
96+
publish / skip := true,
97+
fork := false
5098
)
5199
.dependsOn(root)
52100

53-
resolvers += "confluent" at "https://packages.confluent.io/maven/"
101+
ThisBuild / versionPolicyIntention := Compatibility.BinaryAndSourceCompatible

project/plugins.sbt

+3-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.8.1")
1+
addDependencyTreePlugin
2+
addSbtPlugin("com.here.platform" % "sbt-bom" % "1.0.17")
3+
addSbtPlugin("ch.epfl.scala" % "sbt-version-policy" % "3.2.1")

src/main/scala/dev/herraiz/TaxiSessionsPipeline.scala

+63-31
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,16 @@
1616

1717
package dev.herraiz
1818

19+
import io.circe.Error
20+
import com.spotify.scio._
1921
import com.spotify.scio.bigquery._
2022
import com.spotify.scio.pubsub._
21-
import com.spotify.scio.values.{SCollection, WindowOptions}
22-
import com.spotify.scio.{Args, ContextAndArgs, ScioContext, streaming}
23+
import com.spotify.scio.values._
2324
import dev.herraiz.data.DataTypes._
24-
import io.circe
25-
import org.apache.beam.sdk.transforms.windowing.{AfterProcessingTime, AfterWatermark}
25+
import org.apache.beam.sdk.transforms.windowing.{
26+
AfterProcessingTime,
27+
AfterWatermark
28+
}
2629
import org.joda.time.Duration
2730

2831
object TaxiSessionsPipeline {
@@ -40,67 +43,96 @@ object TaxiSessionsPipeline {
4043
val accumTable = opts("accum-table")
4144

4245
val messages: SCollection[String] = getMessagesFromPubSub(pubsubTopic)
43-
val (rides: SCollection[PointTaxiRide], writableErrors: SCollection[JsonError]) = parseJSONStrings(messages)
44-
45-
rides.saveAsBigQueryTable(Table.Spec(goodTable), WRITE_APPEND, CREATE_IF_NEEDED)
46-
writableErrors.saveAsBigQueryTable(Table.Spec(badTable), WRITE_APPEND, CREATE_IF_NEEDED)
46+
val (
47+
rides: SCollection[PointTaxiRide],
48+
writableErrors: SCollection[JsonError]
49+
) = parseJSONStrings(messages)
50+
51+
rides
52+
.saveAsTypedBigQueryTable(
53+
Table.Spec(goodTable)
54+
)
55+
56+
writableErrors.saveAsTypedBigQueryTable(
57+
Table.Spec(badTable)
58+
)
4759

4860
// Group by session with a max duration of 5 mins between events
4961
// Window options
5062
val wopts: WindowOptions = customWindowOptions
5163
val groupRides = groupRidesByKey(rides.map(_.toTaxiRide), wopts)
52-
groupRides.saveAsBigQueryTable(Table.Spec(accumTable), WRITE_APPEND, CREATE_IF_NEEDED)
64+
groupRides.saveAsTypedBigQueryTable(
65+
Table.Spec(accumTable)
66+
)
5367

5468
sc.run
5569
}
5670

5771
def customWindowOptions: WindowOptions =
5872
WindowOptions(
59-
trigger = AfterWatermark.pastEndOfWindow()
60-
.withEarlyFirings(AfterProcessingTime
61-
.pastFirstElementInPane
62-
.plusDelayOf(Duration.standardSeconds(EARLY_RESULT)))
63-
.withLateFirings(AfterProcessingTime
64-
.pastFirstElementInPane()
65-
.plusDelayOf(Duration.standardSeconds(LATENESS))),
73+
trigger = AfterWatermark
74+
.pastEndOfWindow()
75+
.withEarlyFirings(
76+
AfterProcessingTime.pastFirstElementInPane
77+
.plusDelayOf(Duration.standardSeconds(EARLY_RESULT))
78+
)
79+
.withLateFirings(
80+
AfterProcessingTime
81+
.pastFirstElementInPane()
82+
.plusDelayOf(Duration.standardSeconds(LATENESS))
83+
),
6684
accumulationMode = streaming.ACCUMULATING_FIRED_PANES,
6785
allowedLateness = Duration.standardSeconds(LATENESS)
6886
)
6987

70-
def getMessagesFromPubSub(pubsubTopic: String)(implicit sc: ScioContext): SCollection[String] = {
71-
val pubsubRead: PubsubIO[String] = PubsubIO.string(pubsubTopic, timestampAttribute = "ts")
88+
def getMessagesFromPubSub(
89+
pubsubTopic: String
90+
)(implicit sc: ScioContext): SCollection[String] = {
91+
val pubsubRead: PubsubIO[String] =
92+
PubsubIO.string(pubsubTopic, timestampAttribute = "ts")
7293
val pubsubParams: PubsubIO.ReadParam = PubsubIO.ReadParam(PubsubIO.Topic)
7394

7495
/*_*/
7596
sc.read(pubsubRead)(pubsubParams) /*_*/
7697
}
7798

78-
def parseJSONStrings(messages: SCollection[String]):
79-
(SCollection[PointTaxiRide], SCollection[JsonError]) = {
80-
val jsons: SCollection[Either[circe.Error, PointTaxiRide]] = messages.map { s: String => json2TaxiRide(s) }
81-
82-
val errorsEither :: pointsEither :: Nil = jsons.partition(2, { e =>
83-
e match {
84-
case Left(_) => 0
85-
case Right(_) => 1
99+
def parseJSONStrings(
100+
messages: SCollection[String]
101+
): (SCollection[PointTaxiRide], SCollection[JsonError]) = {
102+
val jsons: SCollection[Either[Error, PointTaxiRide]] = messages.map {
103+
s: String => json2TaxiRide(s)
104+
}
105+
106+
val errorsEither :: pointsEither :: Nil = jsons.partition(
107+
2,
108+
{ e =>
109+
e match {
110+
case Left(_) => 0
111+
case Right(_) => 1
112+
}
86113
}
87-
})
114+
)
88115

89-
val errors: SCollection[circe.Error] = errorsEither.map(_.left.get)
116+
val errors: SCollection[Error] = errorsEither.map(_.left.get)
90117
val points: SCollection[PointTaxiRide] = pointsEither.map(_.right.get)
91118

92119
val jsonErrors: SCollection[JsonError] = errors.map(circeErrorToCustomError)
93120

94121
(points, jsonErrors)
95122
}
96123

97-
def groupRidesByKey(rides: SCollection[TaxiRide], wopts: WindowOptions): SCollection[TaxiRide] = {
124+
def groupRidesByKey(
125+
rides: SCollection[TaxiRide],
126+
wopts: WindowOptions
127+
): SCollection[TaxiRide] = {
98128
val ridesWithKey: SCollection[(String, TaxiRide)] =
99129
rides.keyBy(_.ride_id)
100130

101131
val afterWindow: SCollection[(String, TaxiRide)] =
102-
ridesWithKey.withSessionWindows(Duration.standardSeconds(SESSION_GAP), options = wopts)
103-
132+
ridesWithKey.withSessionWindows(
133+
Duration.standardSeconds(SESSION_GAP),
134+
options = wopts
135+
)
104136

105137
val agg: SCollection[TaxiRide] = afterWindow.reduceByKey(_ + _).map(_._2)
106138

src/main/scala/dev/herraiz/data/TaxiDataTypes.scala

+47-39
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616

1717
package dev.herraiz.data
1818

19-
import com.spotify.scio.bigquery.types.BigQueryType
2019
import io.circe._
2120
import io.circe.generic.semiauto._
2221
import io.circe.parser.decode
22+
import com.spotify.scio.bigquery.types.BigQueryType
2323
import org.joda.time.format.{DateTimeFormat, DateTimeFormatter}
2424
import org.joda.time.{Instant, Interval}
2525

@@ -46,27 +46,29 @@ object DataTypes {
4646

4747
ti match {
4848
case Success(instant: Instant) => Right(instant)
49-
case Failure(e: Throwable) => Left(e.getMessage)
49+
case Failure(e: Throwable) => Left(e.getMessage)
5050
}
5151
}
5252

5353
// Decoder to produce PointTaxiRide objects from JSON
54-
protected lazy implicit val taxiRideDecoder: Decoder[PointTaxiRide] = deriveDecoder[PointTaxiRide]
54+
protected lazy implicit val taxiRideDecoder: Decoder[PointTaxiRide] =
55+
deriveDecoder[PointTaxiRide]
5556

5657
@BigQueryType.toTable
5758
case class PointTaxiRide(
58-
ride_id: String,
59-
point_idx: Int,
60-
latitude: Double,
61-
longitude: Double,
62-
timestamp: Instant,
63-
meter_reading: Double,
64-
meter_increment: Double,
65-
ride_status: String,
66-
passenger_count: Int
67-
) {
59+
ride_id: String,
60+
point_idx: Int,
61+
latitude: Double,
62+
longitude: Double,
63+
timestamp: Instant,
64+
meter_reading: Double,
65+
meter_increment: Double,
66+
ride_status: String,
67+
passenger_count: Int
68+
) {
6869
def toTaxiRide: TaxiRide =
69-
TaxiRide(this.ride_id,
70+
TaxiRide(
71+
this.ride_id,
7072
1,
7173
this.timestamp,
7274
None,
@@ -78,14 +80,14 @@ object DataTypes {
7880

7981
@BigQueryType.toTable
8082
case class TaxiRide(
81-
ride_id: String,
82-
n_points: Int,
83-
init: Instant,
84-
finish: Option[Instant],
85-
total_meter: Double,
86-
init_status: String,
87-
finish_status: Option[String]
88-
) {
83+
ride_id: String,
84+
n_points: Int,
85+
init: Instant,
86+
finish: Option[Instant],
87+
total_meter: Double,
88+
init_status: String,
89+
finish_status: Option[String]
90+
) {
8991
def +(taxiRide: TaxiRide): TaxiRide = {
9092

9193
val (first, second) =
@@ -95,22 +97,28 @@ object DataTypes {
9597
(this, taxiRide)
9698
}
9799

98-
99-
val (finishStatus: Option[String], finishInstant: Option[Instant]) = first.finish match {
100-
case None =>
101-
(Some(second.finish_status.getOrElse(second.init_status)),
102-
Some(second.finish.getOrElse(second.init)))
103-
case Some(i) =>
104-
val interval: Interval = new Interval(first.init, i)
105-
val testInstant: Instant = second.finish.getOrElse(second.init)
106-
if (interval.contains(testInstant)) {
107-
(Some(first.finish_status.getOrElse(first.init_status)),
108-
Some(first.finish.getOrElse(first.init)))
109-
} else {
110-
(Some(second.finish_status.getOrElse(second.init_status)),
111-
Some(second.finish.getOrElse(second.init)))
112-
}
113-
}
100+
val (finishStatus: Option[String], finishInstant: Option[Instant]) =
101+
first.finish match {
102+
case None =>
103+
(
104+
Some(second.finish_status.getOrElse(second.init_status)),
105+
Some(second.finish.getOrElse(second.init))
106+
)
107+
case Some(i) =>
108+
val interval: Interval = new Interval(first.init, i)
109+
val testInstant: Instant = second.finish.getOrElse(second.init)
110+
if (interval.contains(testInstant)) {
111+
(
112+
Some(first.finish_status.getOrElse(first.init_status)),
113+
Some(first.finish.getOrElse(first.init))
114+
)
115+
} else {
116+
(
117+
Some(second.finish_status.getOrElse(second.init_status)),
118+
Some(second.finish.getOrElse(second.init))
119+
)
120+
}
121+
}
114122

115123
TaxiRide(
116124
taxiRide.ride_id,
@@ -127,4 +135,4 @@ object DataTypes {
127135
@BigQueryType.toTable
128136
case class JsonError(msg: String)
129137

130-
}
138+
}

0 commit comments

Comments
 (0)