-
Notifications
You must be signed in to change notification settings - Fork 126
/
PartitionMapAnalyzer.scala
51 lines (44 loc) · 2.05 KB
/
PartitionMapAnalyzer.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
package util
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.rdd.RDD
//
// This is a general tool for analyzing an RDD of ConsumerRecord, such as is normally produced by a Kafka stream.
// The goal is to see how subscribed Kafka topics and their Kafka partitions map to partitions in the RDD that is
// emitted by the Spark stream. The code is a little convoluted because of its contradictory goals:
// 1) avoid collecting the RDD to the driver node (thus having to serialize ConsumerRecord)
// 2) print the partition information sequentially to keep the output from being jumbled
//
// It may be fun to rewrite it so that a data structure containing this information is produced in parallel
// and then collected and printed sequentially.
//
/**
  * Reports how Kafka topics and topic partitions are distributed across the
  * partitions of an RDD of ConsumerRecord.
  *
  * The records themselves are never returned to the driver: each RDD partition
  * is examined by a job restricted to that single partition, and the jobs are
  * submitted one after another so that the report for one partition is
  * complete before the next one starts.
  */
object PartitionMapAnalyzer {

  /**
    * Prints the number of RDD partitions and then, for every partition in
    * index order, a breakdown of its records by (topic, topic partition).
    *
    * @param r           the RDD to analyze
    * @param dumpRecords when true, every record of every partition is also printed
    * @tparam K key type of the consumer records
    * @tparam V value type of the consumer records
    */
  def analyze[K, V](r: RDD[ConsumerRecord[K, V]],
                    dumpRecords: Boolean = false): Unit = {
    if (r.isEmpty()) {
      println("*** RDD is empty")
    } else {
      val numPartitions = r.getNumPartitions
      println(s"*** $numPartitions partitions")
      // this loop is sequential; each iteration runs one job over one partition
      (0 until numPartitions).foreach(n => analyzeOnePartition(r, n, dumpRecords))
    }
  }

  /**
    * Runs a job over exactly one partition of `rdd` and prints that
    * partition's record count, its per-(topic, topic partition) counts and,
    * optionally, every record. The println calls execute inside the Spark
    * task that processes the partition.
    *
    * @param rdd         the RDD being analyzed
    * @param which       index of the RDD partition to report on
    * @param dumpRecords when true, each record in the partition is printed
    */
  private def analyzeOnePartition[K, V](rdd: RDD[ConsumerRecord[K, V]],
                                        which: Int,
                                        dumpRecords: Boolean): Unit = {
    val report: Iterator[ConsumerRecord[K, V]] => Unit = { records =>
      // materialize the partition once: it is traversed for the counts and again for the dump
      val data = records.toVector
      println(s"*** partition $which has ${data.length} records")
      data.groupBy(cr => (cr.topic(), cr.partition())).foreach {
        case ((topic, topicPartition), group) =>
          println(s"*** rdd partition = $which, topic = $topic, topic partition = $topicPartition, record count = ${group.length}.")
      }
      if (dumpRecords) data.foreach(cr => println(s"RDD partition $which record $cr"))
    }
    // Restricting the job to Seq(which) means the other partitions are not recomputed;
    // the result is an Array[Unit], so no ConsumerRecord is sent back to the driver.
    rdd.sparkContext.runJob[ConsumerRecord[K, V], Unit](rdd, report, Seq(which))
    ()
  }
}