From e68768ea756ebde01740a17bccbb853541e3dc0e Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Sun, 24 Jun 2018 12:03:12 +0100 Subject: [PATCH] Pimp RDD with fraction --- README.md | 7 ++++--- build.sbt | 2 +- .../SparkHelper$$RDDExtensions.html | 17 +++++++++++++++++ docs/com/spark_helper/SparkHelper$.html | 3 ++- docs/com/spark_helper/package.html | 3 ++- docs/index/index-f.html | 3 +++ .../scala/com/spark_helper/SparkHelper.scala | 14 ++++++++++++++ .../com/spark_helper/SparkHelperTest.scala | 10 ++++++++++ 8 files changed, 53 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 857f425..7510472 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,7 @@ rdd.reduceWithCount // RDD("a", "b", "c", "a", "d", "a", "c") => RDD(("a", 3), ( rdd.maxBy(_._2) // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")) => (2, "c") or (4, "c") rdd.minBy(_._2) // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")) => (1, "a") rdd.maxByKey; rdd.minByKey; rdd.maxByValue, ... // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")).maxByKey => (4, "c") +rdd.fraction(_ > 0) // RDD(1, 0, -2, -1, 7, -8, 8, 1, -2) => 0.4444 ``` @@ -314,7 +315,7 @@ With sbt: ```scala resolvers += "jitpack" at "https://jitpack.io" -libraryDependencies += "com.github.xavierguihot" % "spark_helper" % "2.0.2" +libraryDependencies += "com.github.xavierguihot" % "spark_helper" % "2.0.3" ``` With maven: @@ -330,7 +331,7 @@ With maven: com.github.xavierguihot spark_helper - 2.0.2 + 2.0.3 ``` @@ -344,7 +345,7 @@ allprojects { } dependencies { - compile 'com.github.xavierguihot:spark_helper:2.0.2' + compile 'com.github.xavierguihot:spark_helper:2.0.3' } ``` diff --git a/build.sbt b/build.sbt index 4a4a309..c1b73eb 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "spark_helper" -version := "2.0.2" +version := "2.0.3" scalaVersion := "2.11.12" diff --git a/docs/com/spark_helper/SparkHelper$$RDDExtensions.html b/docs/com/spark_helper/SparkHelper$$RDDExtensions.html index 5248400..b776ed0 100644 --- 
a/docs/com/spark_helper/SparkHelper$$RDDExtensions.html +++ b/docs/com/spark_helper/SparkHelper$$RDDExtensions.html @@ -316,6 +316,23 @@

) +
  • + + +

    + + + def + + + fraction(p: (T) ⇒ Boolean): Double + +

    + + Permalink + + +

    Determines what fraction of the RDD passes the predicate.

    Determines what fraction of the RDD passes the predicate.

    RDD(1, 0, -2, -1, 7, -8, 8, 1, -2).fraction(_ > 0) // 0.4444
    returns

    the fraction of elements in the RDD which pass the given predicate

  • diff --git a/docs/com/spark_helper/SparkHelper$.html b/docs/com/spark_helper/SparkHelper$.html index 338143b..904092d 100644 --- a/docs/com/spark_helper/SparkHelper$.html +++ b/docs/com/spark_helper/SparkHelper$.html @@ -93,7 +93,8 @@

    rdd.reduceWithCount // RDD("a", "b", "c", "a", "d", "a", "c") => RDD(("a", 3), ("b", 1), ("c", 2), ("d", 1)) rdd.maxBy(_._2) // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")) => (2, "c") or (4, "c") rdd.minBy(_._2) // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")) => (1, "a") -rdd.maxByKey; rdd.minByKey; rdd.maxByValue, ... // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")).maxByKey => (4, "c")

    Source // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")).maxByKey => (4, "c") +rdd.fraction(_ > 0) // RDD(1, 0, -2, -1, 7, -8, 8, 1, -2) => 0.4444

    Source SparkHelper

    Since

    2017-02

    To do

    sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T])

    Linear Supertypes diff --git a/docs/com/spark_helper/package.html b/docs/com/spark_helper/package.html index f08d516..eeaa46b 100644 --- a/docs/com/spark_helper/package.html +++ b/docs/com/spark_helper/package.html @@ -347,7 +347,8 @@

    rdd.reduceWithCount // RDD("a", "b", "c", "a", "d", "a", "c") => RDD(("a", 3), ("b", 1), ("c", 2), ("d", 1)) rdd.maxBy(_._2) // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")) => (2, "c") or (4, "c") rdd.minBy(_._2) // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")) => (1, "a") -rdd.maxByKey; rdd.minByKey; rdd.maxByValue, ... // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")).maxByKey => (4, "c")

    Source // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")).maxByKey => (4, "c") +rdd.fraction(_ > 0) // RDD(1, 0, -2, -1, 7, -8, 8, 1, -2) => 0.4444

    Source SparkHelper

    Since

    2017-02

    To do

    sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T])

  • diff --git a/docs/index/index-f.html b/docs/index/index-f.html index f65fda4..1c68318 100644 --- a/docs/index/index-f.html +++ b/docs/index/index-f.html @@ -40,5 +40,8 @@
    folderModificationDateTime
    +
    +
    fraction
    +
    diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index d0a9a0a..5bd9567 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -69,6 +69,7 @@ import scala.util.Random * rdd.maxBy(_._2) // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")) => (2, "c") or (4, "c") * rdd.minBy(_._2) // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")) => (1, "a") * rdd.maxByKey; rdd.minByKey; rdd.maxByValue, ... // RDD((1, "a"), (2, "c"), (3, "b"), (4, "c")).maxByKey => (4, "c") + * rdd.fraction(_ > 0) // RDD(1, 0, -2, -1, 7, -8, 8, 1, -2) => 0.4444 * }}} * * Source 0) === 4d / 9d) + // 2: + assert(sc.parallelize(Array(1, 7, 8, 1)).fraction(_ > 0) === 1d) + // 3: + assert(sc.parallelize(Array(-1, -7, 0, -1)).fraction(_ > 0) === 0d) + } } case class A(x: Int, y: String)