diff --git a/.gitignore b/.gitignore index d838934..b76ffde 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ project/target target *.crc + +.idea diff --git a/README.md b/README.md index e36dad4..90015ab 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,6 @@ ## Overview -Version: 1.1.1 - API Scaladoc: [SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$) This library contains a bunch of low-level basic methods for data processing @@ -18,14 +16,14 @@ names are self-explanatory and readable. This also provides a monitoring/logger tool. -This is a bunch of 4 modules: +This is a set of 4 modules: -* [HdfsHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.HdfsHelper$): Wrapper around [apache Hadoop FileSystem API](https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileSystem.html) for file manipulations on hdfs. -* [SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$): Hdfs file manipulations through the Spark API. +* [HdfsHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.HdfsHelper$): Wrapper around the [apache Hadoop FileSystem API](https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileSystem.html) for file manipulations on hdfs. +* [SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$): Hdfs file manipulations through the Spark API (pimped RDDs and SparkContext). * [DateHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.DateHelper$): Wrapper around [joda-time](http://www.joda.org/joda-time/apidocs/) for usual data mining dates manipulations. * [Monitor](http://xavierguihot.com/spark_helper/#com.spark_helper.Monitor$): Spark custom monitoring/logger and kpi validator. -Compatible with Spark 2. +Compatible with Spark 2.x ### HdfsHelper: @@ -36,21 +34,21 @@ The full list of methods is available at Contains basic file-related methods mostly based on hdfs apache Hadoop FileSystem API [org.apache.hadoop.fs.FileSystem](https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileSystem.html). -For instance, one don't want to remove a file from hdfs using 3 lines of code -and thus could instead just use `HdfsHelper.deleteFile("my/hdfs/file/path.csv")`. 
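For contrast, a sketch of the plain Hadoop `FileSystem` calls that such a one-liner replaces (standard Hadoop API, shown only to illustrate what the wrapper saves):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// The raw FileSystem way of deleting a single hdfs file, which HdfsHelper.deleteFile wraps:
val fs = FileSystem.get(new Configuration())
fs.delete(new Path("my/hdfs/file/path.csv"), false) // false = do not delete recursively
```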
- -A non-exhaustive list of exemples: +A non-exhaustive list of examples: ```scala import com.spark_helper.HdfsHelper // A bunch of methods wrapping the FileSystem API, such as: -HdfsHelper.fileExists("my/hdfs/file/path.txt") -assert(HdfsHelper.listFileNamesInFolder("my/folder/path") == List("file_name_1.txt", "file_name_2.csv")) -assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306") -assert(HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") == 3) -HdfsHelper.deleteFile("my/hdfs/file/path.csv") -HdfsHelper.moveFolder("my/hdfs/folder") +HdfsHelper.fileExists("my/hdfs/file/path.txt") // HdfsHelper.folderExists("my/hdfs/folder") +HdfsHelper.listFileNamesInFolder("my/folder/path") // List("file_name_1.txt", "file_name_2.csv") +HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") // "20170306" +HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") // 3 +HdfsHelper.deleteFile("my/hdfs/file/path.csv") // HdfsHelper.deleteFolder("my/hdfs/folder") +HdfsHelper.moveFolder("old/path", "new/path") // HdfsHelper.moveFile("old/path.txt", "new/path.txt") +HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token") // HdfsHelper.createFolder("my/hdfs/folder") + +// File content helpers: HdfsHelper.compressFile("hdfs/path/to/uncompressed_file.txt", classOf[GzipCodec]) HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1") @@ -58,46 +56,71 @@ HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1") HdfsHelper.isHdfsXmlCompliantWithXsd("my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd")) HdfsHelper.loadXmlFileFromHdfs("my/hdfs/file/path.xml") -// Very handy to load a config (typesafe format) stored on hdfs at the begining of a spark job: +// Very handy to load a config (typesafe format) stored on hdfs at the beginning of a spark job: HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config // In order to write small amount of data in a file on hdfs without the whole spark stack: HdfsHelper.writeToHdfsFile(Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt") +// or: +import com.spark_helper.HdfsHelper._ +Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt") +"hello world".writeToHdfs("/some/hdfs/file/path.txt") // Deletes all files/folders in "hdfs/path/to/folder" for which the timestamp is older than 10 days: HdfsHelper.purgeFolder("hdfs/path/to/folder", 10) ``` +In case a specific configuration is needed to access the file system, these +setters are available: + +```scala +// To use a specific conf FileSystem.get(whateverConf) instead of FileSystem.get(new Configuration()): +HdfsHelper.setConf(whateverConf) +// Or directly the FileSystem: +HdfsHelper.setFileSystem(whateverFileSystem) +``` + ### SparkHelper: The full list of methods is available at [SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$). -Contains basic file/RRD-related methods based on the Spark APIs. +Contains basic RRD-related methods. 
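To make the SparkHelper examples below self-contained, here is a minimal, hypothetical job skeleton showing the scope in which these pimped methods become available (the app name, master and paths are illustrative, not part of the library):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import com.spark_helper.SparkHelper._ // brings the pimped RDD/SparkContext methods in scope

object MyJob {
  def main(args: Array[String]): Unit = {
    // Illustrative local setup; in a real job the master usually comes from spark-submit:
    val sc = new SparkContext(new SparkConf().setAppName("my_job").setMaster("local[*]"))

    val rdd = sc.textFile("/my/input/folder/path")
    rdd.saveAsSingleTextFile("/my/output/file/path.txt") // one of the helpers listed below

    sc.stop()
  }
}
```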
-A non-exhaustive list of exemples: +A non-exhaustive list of examples: ```scala -import com.spark_helper.SparkHelper +import com.spark_helper.SparkHelper._ -// Same as SparkContext.saveAsTextFile, but the result is a single file: -SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt") +// Same as rdd.saveAsTextFile("path"), but the result is a single file (while +// keeping the processing distributed): +rdd.saveAsSingleTextFile("/my/output/file/path.txt") +rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec]) -// Same as SparkContext.textFile, but instead of reading one record per line, -// it reads records spread over several lines. This way, xml, json, yml or -// any multi-line record file format can be used with Spark: -SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n") +// Same as sc.textFile("path"), but instead of reading one record per line (by +// splitting the input with \n), it splits the file in records based on a custom +// delimiter. This way, xml, json, yml or any multi-line record file format can +// be used with Spark: +sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance -// Equivalent to sparkContext.textFile(), but for each line is tupled with its -// file path: -SparkHelper.textFileWithFileName("folder", sparkContext) +// Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options: +rdd.flatten + +// Equivalent to sc.textFile(), but for each line is tupled with its file path: +sc.textFileWithFileName("/my/input/folder/path") // which produces: -RDD( - ("file:/path/on/machine/folder/file_1.txt", "record1fromfile1"), - ("file:/path/on/machine/folder/file_1.txt", "record2fromfile1"), - ("file:/path/on/machine/folder/file_2.txt", "record1fromfile2"), - ... -) +// RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"), +// ("folder/file_2.txt", "record1fromfile2"), ...) + +// In the given folder, this generates one file per key in the given key/value +// RDD. Within each file (named from the key) are all values for this key: +rdd.saveAsTextFileByKey("/my/output/folder/path") + +// Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)): +rdd.partialMap { case a if a % 2 == 0 => 2 * a } + +// For when input files contain commas and textFile can't handle it: +sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) ``` ### DateHelper: @@ -106,21 +129,43 @@ The full list of methods is available at [DateHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.DateHelper$). Wrapper around [joda-time](http://www.joda.org/joda-time/apidocs/) for -data-mining classic dates manipulations. +data-mining classic dates manipulations and job scheduling. 
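Before the method list below, a minimal sketch of the job-scheduling use case this wrapper targets, assuming a hypothetical daily-partitioned input layout (the path is illustrative):

```scala
import com.spark_helper.DateHelper

// Derive yesterday's partition path for a daily job (the path layout is hypothetical):
val day = DateHelper.yesterday // e.g. "20170309" under the default "yyyyMMdd" format
val inputPath = s"hdfs://my/data/events/$day"

// Catch-up runs can iterate over a range of days:
val daysToProcess = DateHelper.daysBetween("20170305", day)
```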
-A non-exhaustive list of exemples: +A non-exhaustive list of examples: ```scala import com.spark_helper.DateHelper -assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101")) -assert(DateHelper.today() == "20170310") // If today's "20170310" -assert(DateHelper.yesterday() == "20170309") // If today's "20170310" -assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327") -assert(DateHelper.now("HH:mm") == "10:24") -assert(DateHelper.currentTimestamp() == "1493105229736") -assert(DateHelper.nDaysBefore(3) == "20170307") // If today's "20170310" -assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310") +DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101") +DateHelper.today // "20170310" +DateHelper.yesterday // "20170309" +DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327" +DateHelper.now("HH:mm") // "10:24" +DateHelper.currentTimestamp // "1493105229736" +DateHelper.nDaysBefore(3) // "20170307" +DateHelper.nDaysAfterDate(3, "20170307") // "20170310" +DateHelper.nextDay("20170310") // "20170311" +DateHelper.nbrOfDaysSince("20170302") // 8 +DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5 +DateHelper.dayOfWeek("20160614") // 2 + +import com.spark_helper.DateHelper._ + +2.daysAgo // "20170308" +"20161230" to "20170101" // List("20161230", "20161231", "20170101") +3.daysBefore("20170310") // "20170307" +5.daysAfter // "20170315" +4.daysAfter("20170310") // "20170314" +"20170302".isCompliantWith("yyyyMMdd") +"20170310".nextDay // "20170311" +"20170310".previousDay // "20170309" +``` + +The default format (when no format is specified) is "yyyyMMdd" (20170327). It +can be modified globally with: + +```scala +DateHelper.setFormat("ddMMMyy") ``` ### Monitor: @@ -128,15 +173,15 @@ assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310") The full list of methods is available at [Monitor](http://xavierguihot.com/spark_helper/#com.spark_helper.Monitor$) -It's a simple logger/report which contains a report that one can update from -the driver and a success state. The idea is to persist job executions logs and -errors (and forget about grepping unreadable yarn logs). +It's a simple logger/report which contains a report and a state that one can +update from the driver. The idea is to persist job executions logs and errors +(and forget about grepping unreadable yarn logs). -It's designed for perdiodic spark jobs (handles storage and purge of logs) and +It's designed for periodic spark jobs (handles storage and purge of logs) and provides a way to handle kpis validation. Logs are stored on the go which means one can have a direct real time access of -the job logs/status and it's current state (which can overwise be a pain if it +the job logs/status and it's current state (which can otherwise be a pain if it means going through yarn logs, or even for certain production environments going through additional layers of software logs to get to yarn logs). @@ -150,9 +195,9 @@ the logger for a clean logging. This is a "driver-only" logger and is not intended at logging concurrent actions from executors. -Produced reports can easily be inserted in a notification email whenerver the +Produced reports can easily be inserted in a notification email whenever the job fails, which saves a lot of time to maintainers operating on heavy -production environements. +production environments. The produced persisted report is also a way for downstream jobs to know the status of their input data. 
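Since the usage excerpt below starts mid-pipeline, here is a minimal sketch of the overall Monitor flow, restricted to the calls that appear in this README (the log folder path is illustrative):

```scala
import com.spark_helper.Monitor

// Folder in which Monitor.store() will persist the report (path is illustrative):
Monitor.setLogFolder("my/hdfs/log/folder")

try {
  // ... run a pipeline step ...
  Monitor.log("My pipeline description: success")
} catch {
  case e: Throwable => Monitor.error(e, "My pipeline description") // records the error in the report
}

// Persist the report on hdfs and surface failures to the scheduler:
Monitor.store()
if (!Monitor.isSuccess) throw new Exception()
```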
@@ -190,7 +235,7 @@ try { Monitor.error(e, "My pipeline descirption") // whatever unexpected error } -if (Monitor.isSuccess()) { +if (Monitor.isSuccess) { val doMore = "Let's do some more stuff!" Monitor.log("My second pipeline description: success") } @@ -199,9 +244,9 @@ if (Monitor.isSuccess()) { // HDFS (this saves the logs in the folder set with Monitor.setLogFolder): Monitor.store() -// At the end of the job, if the job isn't successfull, you might want to +// At the end of the job, if the job isn't successful, you might want to // crash it (for instance to get a notification from your scheduler): -if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ... +if (!Monitor.isSuccess) throw new Exception() // or send an email, or ... ``` At any time during the job, logs can be accessed from file @@ -214,7 +259,7 @@ Here are some possible reports generated by the previous pipeline: My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining +[10:23] Beginning [10:23-10:23] My pipeline descirption: failed Diagnostic: No input data! org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path @@ -231,7 +276,7 @@ or My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining +[10:23] Beginning [10:23-10:41] My pipeline descirption: success KPI: Nbr of output records Value: 14669071.0 @@ -248,15 +293,15 @@ Documentation: https://github.com/xavierguihot/spark_helper ## Including spark_helper to your dependencies: -With sbt, add these lines to your build.sbt: +With sbt: ```scala resolvers += "jitpack" at "https://jitpack.io" -libraryDependencies += "com.github.xavierguihot" % "spark_helper" % "v1.1.1" +libraryDependencies += "com.github.xavierguihot" % "spark_helper" % "2.0.0" ``` -With maven, add these lines to your pom.xml: +With maven: ```xml @@ -269,11 +314,11 @@ With maven, add these lines to your pom.xml: com.github.xavierguihot spark_helper - v1.1.1 + 2.0.0 ``` -With gradle, add these lines to your build.gradle: +With gradle: ```groovy allprojects { @@ -283,6 +328,9 @@ allprojects { } dependencies { - compile 'com.github.xavierguihot:spark_helper:v1.1.1' + compile 'com.github.xavierguihot:spark_helper:2.0.0' } ``` + +For versions anterior to `2.0.0`, use prefix `v` in the version tag; for +instance `v1.0.0` diff --git a/build.sbt b/build.sbt index bc2ebbc..15d0838 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "spark_helper" -version := "1.1.1" +version := "2.0.0" scalaVersion := "2.11.12" diff --git a/docs/com/spark_helper/DateHelper$$IntExtensions.html b/docs/com/spark_helper/DateHelper$$IntExtensions.html new file mode 100644 index 0000000..9b5203c --- /dev/null +++ b/docs/com/spark_helper/DateHelper$$IntExtensions.html @@ -0,0 +1,599 @@ + + + + IntExtensions - com.spark_helper.DateHelper.IntExtensions + + + + + + + + + + + + + + + +
+ Class +

com.spark_helper.DateHelper

+

IntExtensions

Related Doc: + package DateHelper +

+ + Permalink + + +
+ +

+ + implicit + class + + + IntExtensions extends AnyRef + +

+ +
+ Linear Supertypes +
AnyRef, Any
+
+ + +
+
+
+ Ordering +
    + +
  1. Alphabetic
  2. +
  3. By Inheritance
  4. +
+
+
+ Inherited
+
+
    +
  1. IntExtensions
  2. AnyRef
  3. Any
  4. +
+
+ +
    +
  1. Hide All
  2. +
  3. Show All
  4. +
+
+
+ Visibility +
  1. Public
  2. All
+
+
+ +
+
+
+

Instance Constructors

+
  1. + + +

    + + + new + + + IntExtensions(int: Int) + +

    + + Permalink + + + +
+
+ + + + + +
+

Value Members

+
  1. + + +

    + + final + def + + + !=(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  2. + + +

    + + final + def + + + ##(): Int + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  3. + + +

    + + final + def + + + ==(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  4. + + +

    + + final + def + + + asInstanceOf[T0]: T0 + +

    + + Permalink + + +
    Definition Classes
    Any
    +
  5. + + +

    + + + def + + + clone(): AnyRef + +

    + + Permalink + + +
    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  6. + + +

    + + + def + + + daysAfter(date: String): String + +

    + + Permalink + + +

    Returns which date it will be x days after the given date under the +default format.

    Returns which date it will be x days after the given date under the +default format.

    If the given date is "20170122" and we request the date it will be 3 +days after, we'll return "20170125".

    assert(5.daysAfter("20170305") == "20170310")
    date

    the date under the default format for which we want the date +for nbrOfDaysAfter days after.

    returns

    the date it was nbrOfDaysAfter after date under the default +format.

    +
  7. + + +

    + + + def + + + daysAfter: String + +

    + + Permalink + + +

    Returns which date it will be x days after today under the default format.

    Returns which date it will be x days after today under the default format.

    If we're "20170125" and we request for 3 days after, we'll return +"20170127".

    // If today's "20170310":
    +assert(3.daysAfter == "20170313")
    returns

    today's date plus the given nbr of days

    +
  8. + + +

    + + + def + + + daysAgo: String + +

    + + Permalink + + +

    Returns which date it was x days before today under the default format.

    Returns which date it was x days before today under the default format.

    If we're "20170125" and we request for 3 days before, we'll return +"20170122".

    // If today's "20170310":
    +assert(3.daysAgo == "20170307")
    returns

    today's date minus the given nbr of days

    +
  9. + + +

    + + + def + + + daysBefore(date: String): String + +

    + + Permalink + + +

    Returns which date it was x days before the given date.

    Returns which date it was x days before the given date.

    If the given date is "20170125" and we request the date it was 3 days +before, this will return "20170122".

    assert(3.daysBefore("20170310") == "20170307")
    date

    the date under the default format for which we want the date +for nbrOfDaysBefore days before.

    returns

    the date it was nbrOfDaysBefore before date under the default +format.

    +
  10. + + +

    + + final + def + + + eq(arg0: AnyRef): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  11. + + +

    + + + def + + + equals(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  12. + + +

    + + + def + + + finalize(): Unit + +

    + + Permalink + + +
    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + classOf[java.lang.Throwable] + ) + +
    +
  13. + + +

    + + final + def + + + getClass(): Class[_] + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  14. + + +

    + + + def + + + hashCode(): Int + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  15. + + +

    + + + val + + + int: Int + +

    + + Permalink + + + +
  16. + + +

    + + final + def + + + isInstanceOf[T0]: Boolean + +

    + + Permalink + + +
    Definition Classes
    Any
    +
  17. + + +

    + + final + def + + + ne(arg0: AnyRef): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  18. + + +

    + + final + def + + + notify(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  19. + + +

    + + final + def + + + notifyAll(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  20. + + +

    + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  21. + + +

    + + + def + + + toString(): String + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  22. + + +

    + + final + def + + + wait(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  23. + + +

    + + final + def + + + wait(arg0: Long, arg1: Int): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  24. + + +

    + + final + def + + + wait(arg0: Long): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
+
+ + + + +
+ +
+
+

Inherited from AnyRef

+
+

Inherited from Any

+
+ +
+ +
+
+

Ungrouped

+ +
+
+ +
+ +
+ + + + + + diff --git a/docs/com/spark_helper/DateHelper$$StringExtensions.html b/docs/com/spark_helper/DateHelper$$StringExtensions.html new file mode 100644 index 0000000..cf0c71c --- /dev/null +++ b/docs/com/spark_helper/DateHelper$$StringExtensions.html @@ -0,0 +1,595 @@ + + + + StringExtensions - com.spark_helper.DateHelper.StringExtensions + + + + + + + + + + + + + + + +
+ Class +

com.spark_helper.DateHelper

+

StringExtensions

Related Doc: + package DateHelper +

+ + Permalink + + +
+ +

+ + implicit + class + + + StringExtensions extends AnyRef + +

+ +
+ Linear Supertypes +
AnyRef, Any
+
+ + +
+
+
+ Ordering +
    + +
  1. Alphabetic
  2. +
  3. By Inheritance
  4. +
+
+
+ Inherited
+
+
    +
  1. StringExtensions
  2. AnyRef
  3. Any
  4. +
+
+ +
    +
  1. Hide All
  2. +
  3. Show All
  4. +
+
+
+ Visibility +
  1. Public
  2. All
+
+
+ +
+
+
+

Instance Constructors

+
  1. + + +

    + + + new + + + StringExtensions(string: String) + +

    + + Permalink + + + +
+
+ + + + + +
+

Value Members

+
  1. + + +

    + + final + def + + + !=(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  2. + + +

    + + final + def + + + ##(): Int + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  3. + + +

    + + final + def + + + ==(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  4. + + +

    + + final + def + + + asInstanceOf[T0]: T0 + +

    + + Permalink + + +
    Definition Classes
    Any
    +
  5. + + +

    + + + def + + + clone(): AnyRef + +

    + + Permalink + + +
    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  6. + + +

    + + final + def + + + eq(arg0: AnyRef): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  7. + + +

    + + + def + + + equals(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  8. + + +

    + + + def + + + finalize(): Unit + +

    + + Permalink + + +
    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + classOf[java.lang.Throwable] + ) + +
    +
  9. + + +

    + + final + def + + + getClass(): Class[_] + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  10. + + +

    + + + def + + + hashCode(): Int + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  11. + + +

    + + + def + + + isCompliantWith(format: String): Boolean + +

    + + Permalink + + +

    Validates the formatted date is compliant with the provided format.

    Validates the formatted date is compliant with the provided format.

    assert("20170302".isCompliantWith("yyyyMMdd"))
    +assert(!"20170333".isCompliantWith("yyyyMMdd"))
    +assert("20170228".isCompliantWith("yyyyMMdd"))
    +assert(!"20170229".isCompliantWith("yyyyMMdd"))
    +assert(!"170228".isCompliantWith("yyyyMMdd"))
    +assert(!"".isCompliantWith("yyyyMMdd"))
    +assert(!"a".isCompliantWith("yyyyMMdd"))
    +assert(!"24JAN17".isCompliantWith("yyyyMMdd"))
    returns

    if the provided date is under the provided format

    +
  12. + + +

    + + final + def + + + isInstanceOf[T0]: Boolean + +

    + + Permalink + + +
    Definition Classes
    Any
    +
  13. + + +

    + + final + def + + + ne(arg0: AnyRef): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  14. + + +

    + + + def + + + nextDay: String + +

    + + Permalink + + +

    Returns the date one day after the given date.

    Returns the date one day after the given date.

    assert("20170310".nextDay == "20170311")
    returns

    the date of the day after the given date

    +
  15. + + +

    + + final + def + + + notify(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  16. + + +

    + + final + def + + + notifyAll(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  17. + + +

    + + + def + + + previousDay: String + +

    + + Permalink + + +

    Returns the date one day before the given date.

    Returns the date one day before the given date.

    assert("20170310".previousDay == "20170309")
    returns

    the date of the day before the given date

    +
  18. + + +

    + + + val + + + string: String + +

    + + Permalink + + + +
  19. + + +

    + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  20. + + +

    + + + def + + + to(lastDate: String): List[String] + +

    + + Permalink + + +

    Creates the list of dates between the two given dates.

    Creates the list of dates between the two given dates.

    assert(("20161230" to "20170101") == List("20161230", "20161231", "20170101"))
    lastDate

    the last date

    returns

    the list of dates between this string and the lastDate in the +default format.

    +
  21. + + +

    + + + def + + + toString(): String + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  22. + + +

    + + final + def + + + wait(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  23. + + +

    + + final + def + + + wait(arg0: Long, arg1: Int): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  24. + + +

    + + final + def + + + wait(arg0: Long): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
+
+ + + + +
+ +
+
+

Inherited from AnyRef

+
+

Inherited from Any

+
+ +
+ +
+
+

Ungrouped

+ +
+
+ +
+ +
+ + + + + + diff --git a/docs/com/spark_helper/DateHelper$.html b/docs/com/spark_helper/DateHelper$.html index 77a52dd..749074c 100644 --- a/docs/com/spark_helper/DateHelper$.html +++ b/docs/com/spark_helper/DateHelper$.html @@ -52,14 +52,31 @@

A facility which deals with usual date needs (wrapper around joda-time).

The goal is to remove the maximum of highly used low-level code from your spark job and replace it with methods fully tested whose name is -self-explanatory/readable.

A few examples:

assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
-assert(DateHelper.today() == "20170310") // If today's "20170310"
-assert(DateHelper.yesterday() == "20170309") // If today's "20170310"
-assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327")
-assert(DateHelper.now("HH:mm") == "10:24")
-assert(DateHelper.currentTimestamp() == "1493105229736")
-assert(DateHelper.nDaysBefore(3) == "20170307") // If today's "20170310"
-assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310")

Source import com.spark_helper.DateHelper + +DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101") +DateHelper.today // "20170310" +DateHelper.yesterday // "20170309" +DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327" +DateHelper.now("HH:mm") // "10:24" +DateHelper.currentTimestamp // "1493105229736" +DateHelper.nDaysBefore(3) // "20170307" +DateHelper.nDaysAfterDate(3, "20170307") // "20170310" +DateHelper.nextDay("20170310") // "20170311" +DateHelper.nbrOfDaysSince("20170302") // 8 +DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5 +DateHelper.dayOfWeek("20160614") // 2 + +import com.spark_helper.DateHelper._ + +2.daysAgo // "20170308" +"20161230" to "20170101" // List("20161230", "20161231", "20170101") +3.daysBefore("20170310") // "20170307" +5.daysAfter // "20170315" +4.daysAfter("20170310") // "20170314" +"20170302".isCompliantWith("yyyyMMdd") +"20170310".nextDay // "20170311" +"20170310".previousDay // "20170309"

Source DateHelper

Since

2017-02

Linear Supertypes @@ -100,7 +117,44 @@

- +
+

Type Members

+
  1. + + +

    + + implicit + class + + + IntExtensions extends AnyRef + +

    + + Permalink + + + +
  2. + + +

    + + implicit + class + + + StringExtensions extends AnyRef + +

    + + Permalink + + + +
+
@@ -198,18 +252,18 @@

  • - - + +

    def - currentTimestamp(): String + currentTimestamp: String

    - + Permalink @@ -241,7 +295,7 @@

    def - dateFromTimestamp(timestamp: Long, format: String = "yyyyMMdd"): String + dateFromTimestamp(timestamp: Long, format: String = defaultFormat): String

    @@ -250,7 +304,7 @@

    Returns the date associated to the given UTC timestamp.

    Returns the date associated to the given UTC timestamp.

    assert(DateHelper.dateFromTimestamp(1496074819L) == "20170529")
     assert(DateHelper.dateFromTimestamp(1496074819L, "yyMMdd") == "170529")
    timestamp

    the UTC timestamps (nbr of millis since 1970-01-01) for -which to get the associated date.

    format

    (default = "yyyyMMdd") the format of the provided dates

    returns

    the associated date under the requested format

    +which to get the associated date.

    format

    the format of the provided dates

    returns

    the associated date under the requested format

  • @@ -260,15 +314,14 @@

    def - dayOfWeek(date: String, format: String = "yyyyMMdd"): Int + dayOfWeek(date: String, format: String = defaultFormat): Int

    Permalink -

    Returns the day of week for a date under the given format.

    Returns the day of week for a date under the given format.

    A Monday is 1 and a Sunday is 7.

    assert(DateHelper.dayOfWeek("20160614") == 2)
    date

    the date for which to get the day of week

    format

    (default = "yyyyMMdd") the format under which the date is -provided.

    returns

    the associated day of week, such as 2 for Tuesday

    +

    Returns the day of week for a date under the given format.

    Returns the day of week for a date under the given format.

    A Monday is 1 and a Sunday is 7.

    assert(DateHelper.dayOfWeek("20160614") == 2)
    date

    the date for which to get the day of week

    format

    the format under which the date is provided

    returns

    the associated day of week, such as 2 for Tuesday

  • @@ -278,15 +331,15 @@

    def - daysBetween(firstDate: String, lastDate: String, format: String = "yyyyMMdd"): List[String] + daysBetween(firstDate: String, lastDate: String, format: String = defaultFormat): List[String]

    Permalink -

    Finds the list of dates between the two given dates.

    Finds the list of dates between the two given dates.

    assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
    firstDate

    the first date (in the given format)

    lastDate

    the last date (in the given format)

    format

    (default = "yyyyMMdd") the format to use for firstDate and -lastDate and for the returned list of dates.

    returns

    the list of dates between firstDate and lastDate in the given +

    Finds the list of dates between the two given dates.

    Finds the list of dates between the two given dates.

    assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
    firstDate

    the first date (in the given format)

    lastDate

    the last date (in the given format)

    format

    the format to use for firstDate and lastDate and for the +returned list of dates.

    returns

    the list of dates between firstDate and lastDate in the given format.

  • @@ -402,7 +455,7 @@

    assert(!DateHelper.isDateCompliantWithFormat("170228", "yyyyMMdd")) assert(!DateHelper.isDateCompliantWithFormat("", "yyyyMMdd")) assert(!DateHelper.isDateCompliantWithFormat("a", "yyyyMMdd")) -assert(!DateHelper.isDateCompliantWithFormat("24JAN17", "yyyyMMdd"))

  • stringValue

    the stringified date

    returns

    if the provided date is under the provided format

    +assert(!DateHelper.isDateCompliantWithFormat("24JAN17", "yyyyMMdd"))
    stringValue

    the formatted date

    returns

    if the provided date is under the provided format

  • @@ -440,6 +493,64 @@

    returned list of dates and thus prefer getting a list of Joda DateTime objects instead of String dates.

    jodaFirstDate

    the joda DateTime first date

    jodaLastDate

    the joda DateTime last date

    returns

    the list of joda DateTime between jodaFirstDate and jodaLastDate

    +

  • + + +

    + + + def + + + nDaysAfter(nbrOfDaysAfter: Int): String + +

    + + Permalink + + +

    Returns which date it will be x days after today under the default format.

    Returns which date it will be x days after today under the default format.

    // If today's "20170310":
    +assert(DateHelper.nDaysAfter(5) == "20170315")
    nbrOfDaysAfter

    the nbr of days after today

    returns

    today's date plus the nbrOfDaysAfter under the default format

    +
  • + + +

    + + + def + + + nDaysAfter(nbrOfDaysAfter: Int, format: String): String + +

    + + Permalink + + +

    Returns which date it will be x days after today under the requested format.

    Returns which date it will be x days after today under the requested format.

    // If today's "20170310":
    +assert(DateHelper.nDaysAfter(5, "yyMMdd") == "170315")
    nbrOfDaysAfter

    the nbr of days after today

    format

    the format for the returned date

    returns

    today's date plus the nbrOfDaysAfter under the requested format

    +
  • + + +

    + + + def + + + nDaysAfterDate(nbrOfDaysAfter: Int, date: String): String + +

    + + Permalink + + +

    Returns which date it will be x days after the given date under the +default format.

    Returns which date it will be x days after the given date under the +default format.

    If the given date is "20170122" and we request the date it will be 3 days +after, we'll return "20170125".

    assert(DateHelper.nDaysAfterDate(5, "20170305") == "20170310")
    nbrOfDaysAfter

    the nbr of days after the given date

    date

    the date under the default format for which we want the date +for nbrOfDaysAfter days after.

    returns

    the date it was nbrOfDaysAfter after date under the default +format.

  • @@ -449,7 +560,7 @@

    def - nDaysAfterDate(nbrOfDaysAfter: Int, date: String, format: String = "yyyyMMdd"): String + nDaysAfterDate(nbrOfDaysAfter: Int, date: String, format: String): String

    @@ -457,11 +568,27 @@

    Returns which date it will be x days after the given date.

    Returns which date it will be x days after the given date.

    If the given date is "20170122" and we request the date it will be 3 days -after, we'll return "20170125".

    assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310")
    -assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") == "170310")
    nbrOfDaysAfter

    the nbr of days after the given date

    date

    the date under the provided format for which we want the date -for nbrOfDaysAfter days after.

    format

    (default = "yyyyMMdd") the format for the provided and -returned dates.

    returns

    the date it was nbrOfDaysAfter after date under the requested +after, we'll return "20170125".

    assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") == "170310")
    nbrOfDaysAfter

    the nbr of days after the given date

    date

    the date under the provided format for which we want the date +for nbrOfDaysAfter days after.

    format

    the format for the provided and returned dates.

    returns

    the date it was nbrOfDaysAfter after date under the requested format.

    +

  • + + +

    + + + def + + + nDaysBefore(nbrOfDaysBefore: Int): String + +

    + + Permalink + + +

    Returns which date it was x days before today.

    Returns which date it was x days before today.

    // If today's "20170310":
    +assert(DateHelper.nDaysBefore(5) == "20170305")
    nbrOfDaysBefore

    the nbr of days before today

    returns

    today's date minus the nbrOfDaysBefore under the default format

  • @@ -471,17 +598,35 @@

    def - nDaysBefore(nbrOfDaysBefore: Int, format: String = "yyyyMMdd"): String + nDaysBefore(nbrOfDaysBefore: Int, format: String): String

    Permalink -

    Returns which date it was x days before today under the requested format.

    Returns which date it was x days before today under the requested format.

    If we're "20170125" and we request for 3 days before, we'll return -"20170122".

    // If today's "20170310":
    -assert(DateHelper.nDaysBefore(3) == "20170307")
    -assert(DateHelper.nDaysBefore(5, "yyMMdd") == "170305")
    nbrOfDaysBefore

    the nbr of days before today

    format

    (default = "yyyyMMdd") the format for the returned date

    returns

    today's date minus the nbrOfDaysBefore under the requested format

    +

    Returns which date it was x days before today under the requested format.

    Returns which date it was x days before today under the requested format.

    // If today's "20170310":
    +assert(DateHelper.nDaysBefore(5, "yyMMdd") == "170305")
    nbrOfDaysBefore

    the nbr of days before today

    format

    the format for the returned date

    returns

    today's date minus the nbrOfDaysBefore under the requested format

    +
  • + + +

    + + + def + + + nDaysBeforeDate(nbrOfDaysBefore: Int, date: String): String + +

    + + Permalink + + +

    Returns which date it was x days before the given date.

    Returns which date it was x days before the given date.

    If the given date is "20170125" and we request the date it was 3 days +before, this will return "20170122".

    assert(DateHelper.nDaysBeforeDate(5, "20170310") == "20170305")
    nbrOfDaysBefore

    the nbr of days before the given date

    date

    the date under the default format for which we want the date +for nbrOfDaysBefore days before.

    returns

    the date it was nbrOfDaysBefore before date under the default +format.

  • @@ -491,7 +636,7 @@

    def - nDaysBeforeDate(nbrOfDaysBefore: Int, date: String, format: String = "yyyyMMdd"): String + nDaysBeforeDate(nbrOfDaysBefore: Int, date: String, format: String): String

    @@ -499,10 +644,8 @@

    Returns which date it was x days before the given date.

    Returns which date it was x days before the given date.

    If the given date is "20170125" and we request the date it was 3 days -before, we'll return "20170122".

    assert(DateHelper.nDaysBeforeDate(3, "20170310") == "20170307")
    -assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") == "170305")
    nbrOfDaysBefore

    the nbr of days before the given date

    date

    the date under the provided format for which we want the date -for nbrOfDaysBefore days before.

    format

    (default = "yyyyMMdd") the format for the provided and -returned dates.

    returns

    the date it was nbrOfDaysBefore before date under the requested +before, this will return "20170122".

    assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") == "170305")
    nbrOfDaysBefore

    the nbr of days before the given date

    date

    the date under the provided format for which we want the date +for nbrOfDaysBefore days before.

    format

    the format for the provided and returned dates.

    returns

    the date it was nbrOfDaysBefore before date under the requested format.

  • @@ -513,7 +656,7 @@

    def - nbrOfDaysBetween(firstDate: String, lastDate: String, format: String = "yyyyMMdd"): Int + nbrOfDaysBetween(firstDate: String, lastDate: String, format: String = defaultFormat): Int

    @@ -524,7 +667,7 @@

    assert(DateHelper.nbrOfDaysBetween("20170327", "20170401") == 5)

    This expects the first date to be before the last date.

    firstDate

    the first date of the range for which to get the nbr of days.

    lastDate

    the last date of the range for which to get the nbr of -days.

    format

    (default = "yyyyMMdd") the format of the provided dates

    returns

    the nbr of days between the two given dates

    +days.

    format

    the format of the provided dates

    returns

    the nbr of days between the two given dates

  • @@ -534,7 +677,7 @@

    def - nbrOfDaysSince(date: String, format: String = "yyyyMMdd"): Int + nbrOfDaysSince(date: String, format: String = defaultFormat): Int

    @@ -543,7 +686,7 @@

    Returns the nbr of days between today and the given date.

    Returns the nbr of days between today and the given date.

    // If today is "20170327":
     assert(DateHelper.nbrOfDaysSince("20170310") == 17)
    -assert(DateHelper.nbrOfDaysSince("170310", "yyMMdd") == 17)
    date

    the date for which to find the nbr of days of diff with today

    format

    (default = "yyyyMMdd") the format of the provided date

    returns

    the nbr of days between today and the given date

    +assert(DateHelper.nbrOfDaysSince("170310", "yyMMdd") == 17)
    date

    the date for which to find the nbr of days of diff with today

    format

    the format of the provided date

    returns

    the nbr of days between today and the given date

  • @@ -570,17 +713,15 @@

    def - nextDay(date: String, format: String = "yyyyMMdd"): String + nextDay(date: String, format: String = defaultFormat): String

    Permalink -

    Returns for a date the date one day latter.

    Returns for a date the date one day latter.

    // If the given date is "20170310":
    -assert(DateHelper.nextDay("20170310") == "20170311")
    -assert(DateHelper.nextDay("170310", "yyMMdd") == "170311")
    date

    the date for which to find the date of the day after

    format

    (default = "yyyyMMdd") the format of the provided and the -returned dates.

    returns

    the date of the day after the given date

    +

    Returns for a date the date one day later.

    Returns for a date the date one day later.

    assert(DateHelper.nextDay("20170310") == "20170311")
    +assert(DateHelper.nextDay("170310", "yyMMdd") == "170311")
    date

    the date for which to find the date of the day after

    format

    the format of the provided and the returned dates

    returns

    the date of the day after the given date

  • @@ -643,17 +784,15 @@

    def - previousDay(date: String, format: String = "yyyyMMdd"): String + previousDay(date: String, format: String = defaultFormat): String

    Permalink -

    Returns for a date the date one day before.

    Returns for a date the date one day before.

    // If the given date is "20170310":
    -assert(DateHelper.previousDay("20170310") == "20170309")
    -assert(DateHelper.previousDay("170310", "yyMMdd") == "170309")
    date

    the date for which to find the date of the day before

    format

    (default = "yyyyMMdd") the format of the provided and the -returned dates.

    returns

    the date of the day before the given date

    +

    Returns for a date the date one day before.

    Returns for a date the date one day before.

    assert(DateHelper.previousDay("20170310") == "20170309")
    +assert(DateHelper.previousDay("170310", "yyMMdd") == "170309")
    date

    the date for which to find the date of the day before

    format

    the format of the provided and the returned dates

    returns

    the date of the day before the given date

  • @@ -671,6 +810,29 @@

    Reformats a date from one format to another.

    Reformats a date from one format to another.

    assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327")
    date

    the date to reformat

    inputFormat

    the format in which the date to reformat is provided

    outputFormat

    the format in which to format the provided date

    returns

    the date under the new format

    +

  • + + +

    + + + def + + + setFormat(format: String): Unit + +

    + + Permalink + + +

    Sets the default date format used by these functions when no date format +is specified.

    Sets the default date format used by these functions when no date format +is specified.

    // By default, yyyyMMdd is used:
    +assert(3.daysBefore == "20170307")
    +// But this can be modified globally:
    +DateHelper.setFormat("ddMMMyy")
    +assert(3.daysBefore == "07Mar17")
    format

    the new default format

  • @@ -705,6 +867,24 @@

    Definition Classes
    AnyRef → Any
    +

  • + + +

    + + + def + + + today: String + +

    + + Permalink + + +

    Returns today's date/time under the default format.

    Returns today's date/time under the default format.

    // If today's "20170310":
    +assert(DateHelper.today() == "20170310")
    returns

    today's date under the default format

  • @@ -714,7 +894,7 @@

    def - today(format: String = "yyyyMMdd"): String + today(format: String): String

    @@ -722,8 +902,25 @@

    Returns today's date/time under the requested format.

    Returns today's date/time under the requested format.

    // If today's "20170310":
    -assert(DateHelper.today() == "20170310")
    -assert(DateHelper.today("yyMMdd") == "170310")
    format

    (default = "yyyyMMdd") the format for the current date

    returns

    today's date under the requested format

    +assert(DateHelper.today("yyMMdd") == "170310")
    format

    the format for the current date

    returns

    today's date under the requested format

    +

  • + + +

    + + + def + + + twoDaysAgo(): String + +

    + + Permalink + + +

    Returns which date it was 2 days before today under the default format.

    Returns which date it was 2 days before today under the default format.

    // If today's "20170310":
    +assert(DateHelper.twoDaysAgo() == "20170308")
    returns

    the date of two days ago under the default format

  • @@ -733,7 +930,7 @@

    def - twoDaysAgo(format: String = "yyyyMMdd"): String + twoDaysAgo(format: String): String

    @@ -741,9 +938,7 @@

    Returns which date it was 2 days before today under the requested format.

    Returns which date it was 2 days before today under the requested format.

    // If today's "20170310":
    -assert(DateHelper.twoDaysAgo() == "20170308")
    -assert(DateHelper.twoDaysAgo("yyMMdd") == "170308")
    format

    (default = "yyyyMMdd") the format in which to output the -date of two days ago.

    returns

    the date of two days ago under the requested format

    +assert(DateHelper.twoDaysAgo("yyMMdd") == "170308")
    format

    the format in which to output the date of two days ago

    returns

    the date of two days ago under the requested format

  • @@ -813,6 +1008,24 @@

    ) +

  • + + +

    + + + def + + + yesterday: String + +

    + + Permalink + + +

    Returns yesterday's date/time under the default format.

    Returns yesterday's date/time under the default format.

    // If today's "20170310":
    +assert(DateHelper.yesterday() == "20170309")
    returns

    yesterday's date under the default format

  • @@ -822,7 +1035,7 @@

    def - yesterday(format: String = "yyyyMMdd"): String + yesterday(format: String): String

    @@ -830,9 +1043,7 @@

    Returns yesterday's date/time under the requested format.

    Returns yesterday's date/time under the requested format.

    // If today's "20170310":
    -assert(DateHelper.yesterday() == "20170309")
    -assert(DateHelper.yesterday("yyMMdd") == "170309")
    format

    (default = "yyyyMMdd") the format in which to output the -date of yesterday.

    returns

    yesterday's date under the requested format

    +assert(DateHelper.yesterday("yyMMdd") == "170309")
    format

    the format in which to output the date of yesterday

    returns

    yesterday's date under the requested format

  • diff --git a/docs/com/spark_helper/HdfsHelper$$SeqExtensions.html b/docs/com/spark_helper/HdfsHelper$$SeqExtensions.html new file mode 100644 index 0000000..b234a92 --- /dev/null +++ b/docs/com/spark_helper/HdfsHelper$$SeqExtensions.html @@ -0,0 +1,539 @@ + + + + SeqExtensions - com.spark_helper.HdfsHelper.SeqExtensions + + + + + + + + + + + + + + + +
    + Class +

    com.spark_helper.HdfsHelper

    +

    SeqExtensions

    Related Doc: + package HdfsHelper +

    + + Permalink + + +
    + +

    + + implicit + class + + + SeqExtensions[T <: Seq[String]] extends AnyRef + +

    + +
    + Linear Supertypes +
    AnyRef, Any
    +
    + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. SeqExtensions
    2. AnyRef
    3. Any
    4. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    + +
    +
    +
    +

    Instance Constructors

    +
    1. + + +

      + + + new + + + SeqExtensions(seq: T)(implicit arg0: ClassTag[T]) + +

      + + Permalink + + + +
    +
    + + + + + +
    +

    Value Members

    +
    1. + + +

      + + final + def + + + !=(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + +

      + + final + def + + + ##(): Int + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + +

      + + final + def + + + ==(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + +

      + + final + def + + + asInstanceOf[T0]: T0 + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    5. + + +

      + + + def + + + clone(): AnyRef + +

      + + Permalink + + +
      Attributes
      protected[java.lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    6. + + +

      + + final + def + + + eq(arg0: AnyRef): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    7. + + +

      + + + def + + + equals(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    8. + + +

      + + + def + + + finalize(): Unit + +

      + + Permalink + + +
      Attributes
      protected[java.lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    9. + + +

      + + final + def + + + getClass(): Class[_] + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    10. + + +

      + + + def + + + hashCode(): Int + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    11. + + +

      + + final + def + + + isInstanceOf[T0]: Boolean + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    12. + + +

      + + final + def + + + ne(arg0: AnyRef): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    13. + + +

      + + final + def + + + notify(): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    14. + + +

      + + final + def + + + notifyAll(): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    15. + + +

      + + + val + + + seq: T + +

      + + Permalink + + + +
    16. + + +

      + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    17. + + +

      + + + def + + + toString(): String + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    18. + + +

      + + final + def + + + wait(): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    19. + + +

      + + final + def + + + wait(arg0: Long, arg1: Int): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    20. + + +

      + + final + def + + + wait(arg0: Long): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    21. + + +

      + + + def + + + writeToHdfs(filePath: String): Unit + +

      + + Permalink + + +

      Saves list elements in a file on hdfs.

      Saves list elements in a file on hdfs.

      Please only consider this way of storing data when the data set is small +enough.

      Overwrites the file if it already exists.

      Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
      +List("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
      filePath

      the path of the file in which to write the content of +the List.

      +
    +
    + + + + +
    + +
    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + + + + diff --git a/docs/com/spark_helper/HdfsHelper$$StringExtensions.html b/docs/com/spark_helper/HdfsHelper$$StringExtensions.html new file mode 100644 index 0000000..ca7faf5 --- /dev/null +++ b/docs/com/spark_helper/HdfsHelper$$StringExtensions.html @@ -0,0 +1,536 @@ + + + + StringExtensions - com.spark_helper.HdfsHelper.StringExtensions + + + + + + + + + + + + + + + +
Class com.spark_helper.HdfsHelper.StringExtensions

implicit class StringExtensions extends AnyRef

Linear Supertypes: AnyRef, Any

Instance Constructors

    new StringExtensions(string: String)

Value Members (members inherited from AnyRef and Any are omitted here)

    val string: String

    def writeToHdfs(filePath: String): Unit

        Saves the String in a file on hdfs.

        Overwrites the file if it already exists.

        "some\nrelatively small\ntext".writeToHdfs("/some/hdfs/file/path.txt")

        filePath: the path of the file in which to write the String
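As a side note for readers of this diff, here is a minimal, hedged sketch of how the StringExtensions pimp above is meant to be used (the implicit is assumed to be in scope via `import com.spark_helper.HdfsHelper._`; the paths are purely illustrative):

```scala
import com.spark_helper.HdfsHelper
import com.spark_helper.HdfsHelper._ // brings the StringExtensions implicit into scope

// Write a small token file directly from a String (overwrites the file if it already exists):
"job done\n20170306".writeToHdfs("/some/hdfs/jobs/my_job/last_run.token")

// The token can then be inspected with the regular HdfsHelper methods:
val tokenAgeInDays = HdfsHelper.nbrOfDaysSinceFileWasLastModified("/some/hdfs/jobs/my_job/last_run.token")
```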
    + + + + + + diff --git a/docs/com/spark_helper/HdfsHelper$.html b/docs/com/spark_helper/HdfsHelper$.html index ab99d93..ee3a0ca 100644 --- a/docs/com/spark_helper/HdfsHelper$.html +++ b/docs/com/spark_helper/HdfsHelper$.html @@ -54,38 +54,39 @@

    spark job and replace it with methods fully tested whose name is self-explanatory/readable.

    For instance, one don't want to remove a file from hdfs using 3 lines of code and thus could instead just use -HdfsHelper.deleteFile("my/hdfs/file/path.csv").

    A few exemples:

    import com.spark_helper.HdfsHelper
    +HdfsHelper.deleteFile("my/hdfs/file/path.csv").

    A few examples:

    import com.spark_helper.HdfsHelper
     
     // A bunch of methods wrapping the FileSystem API, such as:
    -HdfsHelper.fileExists("my/hdfs/file/path.txt")
    -assert(HdfsHelper.listFileNamesInFolder("my/folder/path") == List("file_name_1.txt", "file_name_2.csv"))
    -assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
    -assert(HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") == 3)
    -HdfsHelper.deleteFile("my/hdfs/file/path.csv")
    -HdfsHelper.moveFolder("my/hdfs/folder")
    +HdfsHelper.fileExists("my/hdfs/file/path.txt") // HdfsHelper.folderExists("my/hdfs/folder")
    +HdfsHelper.listFileNamesInFolder("my/folder/path") // List("file_name_1.txt", "file_name_2.csv")
    +HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") // "20170306"
    +HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") // 3
    +HdfsHelper.deleteFile("my/hdfs/file/path.csv") // HdfsHelper.deleteFolder("my/hdfs/folder")
    +HdfsHelper.moveFolder("old/path", "new/path") // HdfsHelper.moveFile("old/path.txt", "new/path.txt")
    +HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token") // HdfsHelper.createFolder("my/hdfs/folder")
    +
    +// File content helpers:
     HdfsHelper.compressFile("hdfs/path/to/uncompressed_file.txt", classOf[GzipCodec])
     HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1")
     
     // Some Xml/Typesafe helpers for hadoop as well:
    -HdfsHelper.isHdfsXmlCompliantWithXsd(
    -  "my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd"))
    +HdfsHelper.isHdfsXmlCompliantWithXsd("my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd"))
     HdfsHelper.loadXmlFileFromHdfs("my/hdfs/file/path.xml")
     
    -// Very handy to load a config (typesafe format) stored on hdfs at the
    -// begining of a spark job:
    +// Very handy to load a config (typesafe format) stored on hdfs at the beginning of a spark job:
     HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config
     
    -// In order to write small amount of data in a file on hdfs without the
    -// whole spark stack:
    -HdfsHelper.writeToHdfsFile(
    -  Array("some", "relatively small", "text"),
    -  "/some/hdfs/file/path.txt")
    +// In order to write small amount of data in a file on hdfs without the whole spark stack:
    +HdfsHelper.writeToHdfsFile(Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
    +// or:
    +import com.spark_helper.HdfsHelper._
    +Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
    +"hello world".writeToHdfs("/some/hdfs/file/path.txt")
     
    -// Deletes all files/folders in "hdfs/path/to/folder" for which the
    -// timestamp is older than 10 days:
    +// Deletes all files/folders in "hdfs/path/to/folder" for which the timestamp is older than 10 days:
     HdfsHelper.purgeFolder("hdfs/path/to/folder", 10)

    Source HdfsHelper -

    Since

    2017-02

    +

    Since

    2017-02

    To do

    Create a touch method

    Linear Supertypes
    Serializable, Serializable, AnyRef, Any
    @@ -124,7 +125,44 @@

    - +
    +

    Type Members

    +
    1. + + +

      + + implicit + class + + + SeqExtensions[T <: Seq[String]] extends AnyRef + +

      + + Permalink + + + +
    2. + + +

      + + implicit + class + + + StringExtensions extends AnyRef + +

      + + Permalink + + + +
    +
    @@ -200,7 +238,7 @@

    Appends a footer to a file.

    Appends a footer to a file.

    If the workingFolderPath parameter is provided, then the processing is done in a working/tmp folder and then only, the final file is moved to its final real location. This way, in case of cluster instability, i.e. in -case the Spark job is interupted, this avoids having a temporary or +case the Spark job is interrupted, this avoids having a temporary or corrupted file in output.

    filePath

    the path of the file for which to add the footer

    footer

    the footer to add

    workingFolderPath

    the path where file manipulations will happen

  • @@ -219,11 +257,11 @@

    Permalink -

    Appends a header to a file.

    Appends a header to a file.

    Usefull when creating a csv file with spark and you need to add a header +

    Appends a header to a file.

    Appends a header to a file.

    Useful when creating a csv file with spark and you need to add a header describing the different fields.

    If the workingFolderPath parameter is provided, then the processing is done in a working/tmp folder and then only, the final file is moved to its final real location. This way, in case of cluster instability, i.e. in -case the Spark job is interupted, this avoids having a temporary or +case the Spark job is interrupted, this avoids having a temporary or corrupted file in output.

    filePath

    the path of the file for which to add the header

    header

    the header to add

    workingFolderPath

    the path where file manipulations will happen

  • @@ -242,11 +280,11 @@

    Permalink -

    Appends a header and a footer to a file.

    Appends a header and a footer to a file.

    Usefull when creating an xml file with spark and you need to add top level +

    Appends a header and a footer to a file.

    Appends a header and a footer to a file.

    Useful when creating an xml file with spark and you need to add top level tags.

    If the workingFolderPath parameter is provided, then the processing is done in a working/tmp folder and then only, the final file is moved to its final real location. This way, in case of cluster instability, i.e. in -case the Spark job is interupted, this avoids having a temporary or +case the Spark job is interrupted, this avoids having a temporary or corrupted file in output.

    filePath

    the path of the file for which to add the header and the footer.

    header

    the header to add

    footer

    the footer to add

    workingFolderPath

    the path where file manipulations will happen

    @@ -329,8 +367,8 @@

    Permalink -

    Creates an empty file on hdfs.

    Creates an empty file on hdfs.

    Might be usefull for token files. For instance a file which is only used -as a timestamp token of the last update of a processus, or a file which +

    Creates an empty file on hdfs.

    Creates an empty file on hdfs.

    Might be useful for token files. For instance a file which is only used +as a timestamp token of the last update of a process, or a file which blocks the execution of an other instance of the same job, ...

    Overwrites the file if it already exists.

    HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token")

    In case this is used as a timestamp container, you can then use the following methods to retrieve its timestamp:

    val fileAge = HdfsHelper.nbrOfDaysSinceFileWasLastModified("/some/hdfs/file/path.token")
     val lastModificationDate = HdfsHelper.folderModificationDate("/some/hdfs/file/path.token")
    filePath

    the path of the empty file to create

    @@ -456,9 +494,9 @@

    Permalink -

    Returns the stringified date of the last modification of the given file.

    Returns the stringified date of the last modification of the given file.

    assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
    hdfsPath

    the path of the file for which to get the last +

    Returns the formatted date of the last modification of the given file.

    Returns the formatted date of the last modification of the given file.

    assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
    hdfsPath

    the path of the file for which to get the last modification date.

    format

    (default = "yyyyMMdd") the format under which to get the -modification date.

    returns

    the stringified date of the last modification of the given file, +modification date.

    returns

    the formatted date of the last modification of the given file, under the provided format.

  • @@ -536,9 +574,9 @@

    Permalink -

    Returns the stringified date of the last modification of the given folder.

    Returns the stringified date of the last modification of the given folder.

    assert(HdfsHelper.folderModificationDate("my/hdfs/folder") == "20170306")
    hdfsPath

    the path of the folder for which to get the last +

    Returns the formatted date of the last modification of the given folder.

    Returns the formatted date of the last modification of the given folder.

    assert(HdfsHelper.folderModificationDate("my/hdfs/folder") == "20170306")
    hdfsPath

    the path of the folder for which to get the last modification date.

    format

    (default = "yyyyMMdd") the format under which to get the -modification date.

    returns

    the stringified date of the last modification of the given folder, +modification date.

    returns

    the formatted date of the last modification of the given folder, under the provided format.

  • @@ -683,7 +721,7 @@

    Permalink -

    Loads a typesafe config from Hdfs.

    Loads a typesafe config from Hdfs.

    The best way to load the configuration of your job from hdfs.

    Typesafe is a config format which looks like this:

    config {
    +      

    Loads a Typesafe config from Hdfs.

    Loads a Typesafe config from Hdfs.

    The best way to load the configuration of your job from hdfs.

    Typesafe is a config format which looks like this:

    config {
       airlines = [
         {
           code = QF
    @@ -702,8 +740,8 @@ 

    } } ] -}

    hdfsConfigPath

    the absolute path of the typesafe config file on -hdfs we want to load as a typesafe Config object.

    returns

    the com.typesafe.config.Config object which contains usable data

    +}
    hdfsConfigPath

    the absolute path of the Typesafe config file on +hdfs we want to load as a Typesafe Config object.

    returns

    the com.typesafe.config.Config object which contains usable data
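For illustration only, a hedged sketch of how the returned com.typesafe.config.Config is then typically read, reusing the keys of the airlines snippet above (standard Typesafe Config getters; nothing here is specific to spark_helper):

```scala
import scala.collection.JavaConverters._
import com.typesafe.config.Config
import com.spark_helper.HdfsHelper

// Load the job configuration from hdfs (illustrative path):
val config: Config = HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf")

// Read values with the usual Typesafe Config API; the keys come from the snippet above:
val airlineCodes = config.getConfigList("config.airlines").asScala.map(_.getString("code"))
```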

  • @@ -854,6 +892,50 @@

    // timestamp is older than 10 days: HdfsHelper.purgeFolder("hdfs/path/to/folder", 10)

  • folderPath

    the path of the folder on hdfs to purge

    purgeAge

    the threshold (in nbr of days) above which a file is considered too old and thus deleted/purged.

    +
  • + + +

    + + + def + + + setConf(configuration: Configuration): Unit + +

    + + Permalink + + +

    Sets a specific Configuration +used by the underlying FileSystem +in case it requires some specificities.

    Sets a specific Configuration +used by the underlying FileSystem +in case it requires some specificities.

    If this setter is not used, the default Configuration is set with +new Configuration(). +

    configuration

    the specific Configuration to use

    +
  • + + +

    + + + def + + + setFileSystem(fileSystem: FileSystem): Unit + +

    + + Permalink + + +

    Sets a specific FileSystem +in case it requires some specificities.

    Sets a specific FileSystem +in case it requires some specificities.

    If this setter is not used, the default FileSystem is set with +FileSystem.get(new Configuration()). +

    fileSystem

    the specific FileSystem to use
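Since setConf and setFileSystem are the two new setters introduced by this diff, here is a short hedged sketch of how they would be called (the Configuration and FileSystem classes are the standard org.apache.hadoop ones; the property value is only an example):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import com.spark_helper.HdfsHelper

// Either hand HdfsHelper a specific Hadoop Configuration ...
val conf = new Configuration()
conf.set("fs.defaultFS", "hdfs://my-namenode:8020") // illustrative value
HdfsHelper.setConf(conf)

// ... or hand it an already built FileSystem directly:
HdfsHelper.setFileSystem(FileSystem.get(conf))
```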

  • @@ -999,7 +1081,7 @@

    enough.

    Overwrites the file if it already exists.

    HdfsHelper.writeToHdfsFile(
       Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
     HdfsHelper.writeToHdfsFile(
    -  List("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
    content

    the array of strings to write in the file as one line per + List("some", "relatively small", "text"), "/some/hdfs/file/path.txt")

    content

    the seq of strings to write in the file as one line per string (this takes care of joining strings with "\n"s).

    filePath

    the path of the file in which to write the content

  • diff --git a/docs/com/spark_helper/Monitor$.html b/docs/com/spark_helper/Monitor$.html index 955f476..15e0a20 100644 --- a/docs/com/spark_helper/Monitor$.html +++ b/docs/com/spark_helper/Monitor$.html @@ -49,11 +49,11 @@

    -

    A logger dedicated to Spak jobs.

    It's a simple logger/report which contains a report that one can update from +

    A logger dedicated to Spark jobs.

    It's a simple logger/report which contains a report that one can update from the driver and a success state. The idea is to persist job executions logs -and errors (and forget about grepping unreadable yarn logs).

    It's designed for perdiodic spark jobs (handles storage and purge of logs) +and errors (and forget about grepping unreadable yarn logs).

    It's designed for periodic spark jobs (handles storage and purge of logs) and provides a way to handle kpis validation.

    Logs are stored on the go which means one can have a direct real time access -of the job logs/status and it's current state (which can overwise be a pain +of the job logs/status and it's current state (which can otherwise be a pain if it means going through yarn logs, or even for certain production environments going through additional layers of software logs to get to yarn logs).

    One of the issues this logger aims at tackling is the handling of exceptions @@ -62,9 +62,9 @@

    want to perform a few actions before letting the job crash. The idea is thus to surround (driver side) a Spark pipeline within a try catch and redirect the exception to the logger for a clean logging.

    This is a "driver-only" logger and is not intended at logging concurrent -actions from executors.

    Produced reports can easily be inserted in a notification email whenerver +actions from executors.

    Produced reports can easily be inserted in a notification email whenever the job fails, which saves a lot of time to maintainers operating on heavy -production environements.

    The produced persisted report is also a way for downstream jobs to know the +production environments.

    The produced persisted report is also a way for downstream jobs to know the status of their input data.

    Let's go through a simple Spark job example monitored with this Monitor facility:

    Monitor.setTitle("My job title")
     Monitor.addDescription(
    @@ -83,7 +83,7 @@ 

    Test("Nbr of output records", processedData.count(), SUPERIOR_THAN, 10e6d, NBR), Test("Some pct of invalid output", your_complex_kpi, INFERIOR_THAN, 3, PCT) ), - "My pipeline descirption" + "My pipeline description" ) if (outputIsValid) @@ -91,9 +91,9 @@

    } catch { case iie: InvalidInputException => - Monitor.error(iie, "My pipeline descirption", diagnostic = "No input data!") + Monitor.error(iie, "My pipeline description", diagnostic = "No input data!") case e: Throwable => - Monitor.error(e, "My pipeline descirption") // whatever unexpected error + Monitor.error(e, "My pipeline description") // whatever unexpected error } if (Monitor.isSuccess()) { @@ -105,7 +105,7 @@

    // HDFS (this saves the logs in the folder set with Monitor.setLogFolder): Monitor.store() -// At the end of the job, if the job isn't successfull, you might want to +// At the end of the job, if the job isn't successful, you might want to // crash it (for instance to get a notification from your scheduler): if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ...

    At any time during the job, logs can be accessed from file path/to/log/folder/current.ongoing

    If we were to read the stored report after this simple pipeline, here are @@ -113,8 +113,8 @@

    My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining -[10:23-10:23] My pipeline descirption: failed +[10:23] Beginning +[10:23-10:23] My pipeline description: failed Diagnostic: No input data! org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285) @@ -124,18 +124,18 @@

    My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining -[10:23-10:36] My pipeline descirption: failed +[10:23] Beginning +[10:23-10:36] My pipeline description: failed java.lang.NumberFormatException: For input string: "a" java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) java.lang.Integer.parseInt(Integer.java:492) ... -[10:36] Duration: 00:13:47

    Another scenario, successfull spark pipeline and KPIs are valid; all good!:

              My job title
    +[10:36] Duration: 00:13:47

    Another scenario, successful spark pipeline and KPIs are valid; all good!:

              My job title
     
     My job description (whatever you want); for instance:
     Documentation: https://github.com/xavierguihot/spark_helper
    -[10:23] Begining
    -[10:23-10:41] My pipeline descirption: success
    +[10:23] Beginning
    +[10:23-10:41] My pipeline description: success
       KPI: Nbr of output records
         Value: 14669071.0
         Must be superior than 10000000.0
    @@ -147,7 +147,7 @@ 

    [10:41-10:42] My second pipeline description: success [10:42] Duration: 00:19:23

    Source Monitor -

    Since

    2017-02

    +

    Since

    2017-02

    To do

    would a State monad be appropriate?

    Linear Supertypes
    AnyRef, Any
    @@ -262,7 +262,7 @@

    Sets the report's contact list.

    Sets the report's contact list.

    This will appear within the first lines of the report:

    // Using:
     Monitor.setReportTitle("My Simple Job")
     Monitor.addContacts(List("x.guihot@gmail.com", "smbdy@gmail.com"))
    -// Produces this at the begining of the report:
    +// Produces this at the beginning of the report:
     "          My Simple Job"
     ""
     "Point of contact: x.guihot@gmail.com, smbdy@gmail.com"
    contacts

    the list of points of contact

    @@ -285,7 +285,7 @@

    Sets the report's description.

    Sets the report's description.

    This will appear within the first lines of the report:

    // Using:
     Monitor.setReportTitle("My Simple Job")
     Monitor.addDescription("Documentation: https://github.com/xavierguihot/spark_helper")
    -// Produces this at the begining of the report:
    +// Produces this at the beginning of the report:
     "          My Simple Job"
     ""
     "Documentation: https://github.com/xavierguihot/spark_helper"
    description

    the description of the Spark job (or whatever)

    @@ -372,7 +372,7 @@

    def - error(exception: Throwable, taskDescription: String, diagnostic: String = ""): Boolean + error(exception: Throwable, taskDescription: String, diagnostic: String = ""): Boolean

    @@ -385,8 +385,8 @@

    catch whatever exception from executors and thus log the exact error while still being able to keep on with the job or end it properly.

    Catching an error like this:

    monitor.error(
       invalidInputException,
    -  "My pipeline descirption",
    -  diagnostic = "No input data!")

    will result in this to be appended to the report:

    [10:23-10:24] My pipeline descirption: failed
    +  "My pipeline description",
    +  diagnostic = "No input data!")

    will result in this to be appended to the report:

    [10:23-10:24] My pipeline description: failed
       Diagnostic: No input data!
         org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path
         at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285)
    @@ -410,7 +410,7 @@ 

    Updates the report with some text and a failure.

    Updates the report with some text and a failure.

    This sets the status of the monitoring to false. After that the status -will never be success again, even if you update the report with success().

    Using this method like this:

    monitor.error("Some text")

    will result in this to be appended to the report:

    "[10:35-10:37] Some text: failure\n"

    Once the monitoring is a failure, then whatever following successfull +will never be success again, even if you update the report with success().

    Using this method like this:

    monitor.error("Some text")

    will result in this to be appended to the report:

    "[10:35-10:37] Some text: failure\n"

    Once the monitoring is a failure, then whatever following successful action won't change the failed status of the monitoring.

    taskDescription

    the text to append to the report

    returns

    false since it's a failure

  • @@ -488,18 +488,18 @@

    Definition Classes
    Any

  • - - + +

    def - isSuccess(): Boolean + isSuccess: Boolean

    - + Permalink @@ -696,7 +696,7 @@

    Sets the report's title.

    Sets the report's title.

    This will be the first line of the report:

    // Using:
     Monitor.setReportTitle("My Simple Job")
    -// Produces this at the begining of the report:
    +// Produces this at the beginning of the report:
     "          My Simple Job"
     ""
    title

    the title of the report

diff --git a/docs/com/spark_helper/SparkHelper$$OptionRDDExtensions.html b/docs/com/spark_helper/SparkHelper$$OptionRDDExtensions.html new file mode 100644 index 0000000..66a0d24 --- /dev/null +++ b/docs/com/spark_helper/SparkHelper$$OptionRDDExtensions.html @@ -0,0 +1,540 @@
OptionRDDExtensions - com.spark_helper.SparkHelper.OptionRDDExtensions
Class com.spark_helper.SparkHelper.OptionRDDExtensions

implicit class OptionRDDExtensions[T] extends AnyRef

Linear Supertypes: AnyRef, Any

Instance Constructors

    new OptionRDDExtensions(rdd: RDD[Option[T]])(implicit arg0: ClassTag[T])

Value Members (members inherited from AnyRef and Any are omitted here)

    val rdd: RDD[Option[T]]

    def flatten: RDD[T]

        Flattens an RDD of Option[T] to RDD[T].

        sc.parallelize(Array(Some(1), None, Some(2))).flatten == sc.parallelize(Array(1, 2))

        returns: the flat RDD, as RDD.flatMap(x => x) or List.flatten would have produced.
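A brief, hedged sketch of the kind of code this extension targets (the implicits are assumed to be brought in scope with `import com.spark_helper.SparkHelper._`; `parseRecord` is a hypothetical user function):

```scala
import scala.util.Try
import org.apache.spark.rdd.RDD
import com.spark_helper.SparkHelper._ // assumed import for the pimped RDDs

// Hypothetical parser: None for records that can't be parsed:
def parseRecord(line: String): Option[Int] = Try(line.toInt).toOption

// .flatten on an RDD[Option[Int]] instead of .flatMap(parseRecord):
def validInts(lines: RDD[String]): RDD[Int] = lines.map(parseRecord).flatten
```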
diff --git a/docs/com/spark_helper/SparkHelper$$PairRDDExtensions.html b/docs/com/spark_helper/SparkHelper$$PairRDDExtensions.html new file mode 100644 index 0000000..b136443 --- /dev/null +++ b/docs/com/spark_helper/SparkHelper$$PairRDDExtensions.html @@ -0,0 +1,376 @@
PairRDDExtensions - com.spark_helper.SparkHelper.PairRDDExtensions
Class com.spark_helper.SparkHelper.PairRDDExtensions

implicit final class PairRDDExtensions extends AnyVal

Linear Supertypes: AnyVal, Any

Instance Constructors

    new PairRDDExtensions(rdd: RDD[(String, String)])

Value Members (members inherited from AnyVal and Any are omitted here)

    val rdd: RDD[(String, String)]

    def saveAsTextFileByKey(path: String): Unit
    def saveAsTextFileByKey(path: String, keyNbr: Int): Unit
    def saveAsTextFileByKey(path: String, codec: Class[_ <: CompressionCodec]): Unit
    def saveAsTextFileByKey(path: String, keyNbr: Int, codec: Class[_ <: CompressionCodec]): Unit

        Saves and repartitions a key/value RDD on files whose name is the key.

        Within the provided path, there will be one file per key in the given
        keyValueRDD. And within a file for a given key are only stored values
        for this key.

        This is not scalable. This shouldn't be considered for any data flow
        with normal or big volumes.

        The overloads without the keyNbr parameter internally need to know the
        nbr of keys and thus have to compute it; if this nbr of keys is known
        beforehand, it spares resources to use an overload which takes it as a
        parameter.

        rdd.saveAsTextFileByKey("/my/output/folder/path")
        rdd.saveAsTextFileByKey("/my/output/folder/path", 12)
        rdd.saveAsTextFileByKey("/my/output/folder/path", classOf[BZip2Codec])
        rdd.saveAsTextFileByKey("/my/output/folder/path", 12, classOf[BZip2Codec])

        path: the folder where will be stored key files
        keyNbr: the nbr of expected keys (which is the nbr of output files)
        codec: the type of compression to use (for instance classOf[BZip2Codec] or classOf[GzipCodec])
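A hedged usage sketch for the saveAsTextFileByKey variants above (implicits assumed in scope via `import com.spark_helper.SparkHelper._`; the path and data are illustrative, and the key cardinality is assumed to be small, as the documentation warns):

```scala
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.rdd.RDD
import com.spark_helper.SparkHelper._ // assumed import for the pimped RDDs

// One output file per key, e.g. one file per country code:
def splitByCountry(records: RDD[(String, String)]): Unit =
  records.saveAsTextFileByKey("/my/output/folder/path", classOf[GzipCodec])
```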
diff --git a/docs/com/spark_helper/SparkHelper$$RDDExtensions.html b/docs/com/spark_helper/SparkHelper$$RDDExtensions.html new file mode 100644 index 0000000..cfdc746 --- /dev/null +++ b/docs/com/spark_helper/SparkHelper$$RDDExtensions.html @@ -0,0 +1,547 @@
RDDExtensions - com.spark_helper.SparkHelper.RDDExtensions
Class com.spark_helper.SparkHelper.RDDExtensions

implicit class RDDExtensions[T] extends AnyRef

Linear Supertypes: AnyRef, Any

Instance Constructors

    new RDDExtensions(rdd: RDD[T])(implicit arg0: ClassTag[T])

Value Members (members inherited from AnyRef and Any are omitted here)

    val rdd: RDD[T]

    def partialMap(pf: PartialFunction[T, T]): RDD[T]

        Map an RDD to the same type, by applying a partial function and the
        identity otherwise.

        Avoids having case x => x.

        Similar idea to .collect, but instead of skipping non-matching items,
        it keeps them as-is.

        sc.parallelize(Array(1, 3, 2, 7, 8)).partialMap { case a if a % 2 == 0 => 2 * a }
        // is equivalent to:
        sc.parallelize(Array(1, 3, 2, 7, 8)).map {
          case a if a % 2 == 0 => 2 * a
          case a               => a
        }
        // in order to map to:
        sc.parallelize(Array(1, 3, 4, 7, 16))

        pf: the partial function to apply
        returns: an rdd of the same type, for which each element is either the
        application of the partial function where defined or the identity.
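A short, hedged sketch of partialMap in context (implicits assumed in scope via `import com.spark_helper.SparkHelper._`; the airport aliases are made up):

```scala
import org.apache.spark.rdd.RDD
import com.spark_helper.SparkHelper._ // assumed import for the pimped RDDs

// Normalize a few known aliases and keep every other value unchanged,
// without having to write the identity branch `case x => x`:
def normalize(airports: RDD[String]): RDD[String] =
  airports.partialMap {
    case "NICE"  => "NCE"
    case "Paris" => "CDG"
  }
```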
diff --git a/docs/com/spark_helper/SparkHelper$$SeqRDDExtensions.html b/docs/com/spark_helper/SparkHelper$$SeqRDDExtensions.html new file mode 100644 index 0000000..190d93d --- /dev/null +++ b/docs/com/spark_helper/SparkHelper$$SeqRDDExtensions.html @@ -0,0 +1,540 @@
SeqRDDExtensions - com.spark_helper.SparkHelper.SeqRDDExtensions
Class com.spark_helper.SparkHelper.SeqRDDExtensions

implicit class SeqRDDExtensions[T] extends AnyRef

Linear Supertypes: AnyRef, Any

Instance Constructors

    new SeqRDDExtensions(rdd: RDD[Seq[T]])(implicit arg0: ClassTag[T])

Value Members (members inherited from AnyRef and Any are omitted here)

    val rdd: RDD[Seq[T]]

    def flatten: RDD[T]

        Flattens an RDD of Seq[T] to RDD[T].

        sc.parallelize(Array(Seq(1, 2, 3), Nil, Seq(4))).flatten == sc.parallelize(Array(1, 2, 3, 4))

        returns: the flat RDD, as RDD.flatMap(identity) or List.flatten would have produced.
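And the matching hedged sketch for the Seq variant (same assumed `import com.spark_helper.SparkHelper._`):

```scala
import org.apache.spark.rdd.RDD
import com.spark_helper.SparkHelper._ // assumed import for the pimped RDDs

// Split each csv line into its fields, then flatten instead of .flatMap(identity):
def allFields(csvLines: RDD[String]): RDD[String] =
  csvLines.map(_.split(",").toSeq).flatten
```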
diff --git a/docs/com/spark_helper/SparkHelper$$SparkContextExtensions.html b/docs/com/spark_helper/SparkHelper$$SparkContextExtensions.html new file mode 100644 index 0000000..cda2b45 --- /dev/null +++ b/docs/com/spark_helper/SparkHelper$$SparkContextExtensions.html @@ -0,0 +1,476 @@
SparkContextExtensions - com.spark_helper.SparkHelper.SparkContextExtensions
Class com.spark_helper.SparkHelper.SparkContextExtensions

implicit final class SparkContextExtensions extends AnyVal

Linear Supertypes: AnyVal, Any

Instance Constructors

    new SparkContextExtensions(sc: SparkContext)

Value Members (members inherited from AnyVal and Any are omitted here)

    val sc: SparkContext

    def decreaseCoalescence(highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, finalCoalesceLevel: Int): Unit
    def decreaseCoalescence(highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, finalCoalesceLevel: Int, codec: Class[_ <: CompressionCodec]): Unit

        Decreases the nbr of partitions of a folder.

        This comes in handy when the last step of your job needs to run on
        thousands of files, but you want to store your final output on let's
        say only 30 files.

        It's like a FileUtil.copyMerge(), but the merging produces more than
        one file.

        Be aware that this method deletes the provided input folder.

        sc.decreaseCoalescence(
          "/folder/path/with/2000/files",
          "/produced/folder/path/with/only/30/files",
          30,
          classOf[BZip2Codec]
        )

        highCoalescenceLevelFolder: the folder which contains 10000 files
        lowerCoalescenceLevelFolder: the folder which will contain the same
        data as highCoalescenceLevelFolder but spread on only 30 files (where
        30 is the finalCoalesceLevel parameter).
        finalCoalesceLevel: the nbr of files within the folder at the end of
        this method.
        codec: the type of compression to use (for instance classOf[BZip2Codec]
        or classOf[GzipCodec])

    def textFile(paths: Seq[String]): RDD[String]
    def textFile(paths: Seq[String], minPartitions: Int): RDD[String]

        A replacement for sc.textFile() when files contain commas in their name.

        As sc.textFile() allows to provide several files at once by giving them
        as a string which is a list of strings joined with ",", we can't give
        it files containing commas in their name. This method bypasses this
        limitation by passing paths as a sequence of strings.

        sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))

        paths: the paths of the file(s)/folder(s) to read
        minPartitions: the nbr of partitions in which to split the input

    def textFile(path: String, delimiter: String, maxRecordLength: String = "1000000"): RDD[String]

        Equivalent to sparkContext.textFile(), but for a specific record delimiter.

        By default, sparkContext.textFile() will provide one record per line
        (per '\n'). But what if the format to read considers that one record
        is stored in more than one line (yml, custom format, ...)?

        For instance in order to read a yml file, which is a format for which
        a record (a single entity) is spread over several lines, you can
        modify the record delimiter with "---\n" instead of "\n". Same goes
        when reading an xml file where a record might be spread over several
        lines or worse the whole xml file is one line.

        // Let's say data we want to use with Spark looks like this (one record
        // is a customer, but it's spread over several lines):
        <Customers>\n
        <Customer>\n
        <Address>34 thingy street, someplace, sometown</Address>\n
        </Customer>\n
        <Customer>\n
        <Address>12 thingy street, someplace, sometown</Address>\n
        </Customer>\n
        </Customers>
        // Then you can use it this way:
        val computedRecords = sc.textFile("my/path/to/customers.xml", "<Customer>\n")
        val expectedRecords = RDD(
          <Customers>\n,
          (
            <Address>34 thingy street, someplace, sometown</Address>\n +
            </Customer>\n
          ),
          (
            <Address>12 thingy street, someplace, sometown</Address>\n +
            </Customer>\n +
            </Customers>
          )
        )
        assert(computedRecords == expectedRecords)

        path: the path of the file to read (folder or file, '*' works as well).
        delimiter: the specific record delimiter which replaces "\n"
        maxRecordLength: the max length (not sure which unit) of a record
        before considering the record too long to fit into memory.
        returns: the RDD of records

    def textFileWithFileName(path: String): RDD[(String, String)]

        Equivalent to sparkContext.textFile(), but each record is associated
        with the file path it comes from.

        Produces an RDD[(file_name, line)] which provides a way to know from
        which file a given line comes from.

        // Considering this folder:
        // folder/file_1.txt whose content is data1\ndata2\ndata3
        // folder/file_2.txt whose content is data4\ndata5
        // folder/folder_1/file_3.txt whose content is data6\ndata7
        // then:
        sc.textFileWithFileName("folder")
        // will return:
        RDD(
          ("file:/path/on/machine/folder/file_1.txt", "data1"),
          ("file:/path/on/machine/folder/file_1.txt", "data2"),
          ("file:/path/on/machine/folder/file_1.txt", "data3"),
          ("file:/path/on/machine/folder/file_2.txt", "data4"),
          ("file:/path/on/machine/folder/file_2.txt", "data5"),
          ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"),
          ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7")
        )

        path: the path of the folder (or structure of folders) to read
        returns: the RDD of records where a record is a tuple containing the
        path of the file the record comes from and the record itself.
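A hedged sketch combining the SparkContext pimps documented above (implicits assumed in scope via `import com.spark_helper.SparkHelper._`; all paths are illustrative):

```scala
import org.apache.spark.SparkContext
import com.spark_helper.SparkHelper._ // assumed import for the pimped SparkContext

def readInputs(sc: SparkContext): Unit = {
  // Paths given as a Seq, so a comma in a file name is not an issue:
  val lines = sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))

  // One record per <Customer> block rather than one record per line:
  val customers = sc.textFile("my/path/to/customers.xml", "<Customer>\n")

  // Keep track of which file each record comes from:
  val linesWithOrigin = sc.textFileWithFileName("folder")
}
```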
diff --git a/docs/com/spark_helper/SparkHelper$$StringRDDExtensions.html b/docs/com/spark_helper/SparkHelper$$StringRDDExtensions.html new file mode 100644 index 0000000..bf483ea --- /dev/null +++ b/docs/com/spark_helper/SparkHelper$$StringRDDExtensions.html @@ -0,0 +1,422 @@
StringRDDExtensions - com.spark_helper.SparkHelper.StringRDDExtensions
    + Class +

    com.spark_helper.SparkHelper

    +

    StringRDDExtensions

    Related Doc: + package SparkHelper +

    + + Permalink + + +
    + +

    + + implicit final + class + + + StringRDDExtensions extends AnyVal + +

    + +
    + Linear Supertypes +
    AnyVal, Any
    +
    + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. StringRDDExtensions
    2. AnyVal
    3. Any
    4. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    + +
    +
    +
    +

    Instance Constructors

    +
    1. + + +

      + + + new + + + StringRDDExtensions(rdd: RDD[String]) + +

      + + Permalink + + + +
    +
    + + + + + +
    +

    Value Members

    +
    1. + + +

      + + final + def + + + !=(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    2. + + +

      + + final + def + + + ##(): Int + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    3. + + +

      + + final + def + + + ==(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    4. + + +

      + + final + def + + + asInstanceOf[T0]: T0 + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    5. + + +

      + + + def + + + getClass(): Class[_ <: AnyVal] + +

      + + Permalink + + +
      Definition Classes
      AnyVal → Any
      +
    6. + + +

      + + final + def + + + isInstanceOf[T0]: Boolean + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    7. + + +

      + + + val + + + rdd: RDD[String] + +

      + + Permalink + + + +
    8. + + +

      + + + def + + + saveAsSingleTextFile(path: String, workingFolder: String, codec: Class[_ <: CompressionCodec]): Unit + +

      + + Permalink + + +

      Saves an RDD in exactly one file.

      Saves an RDD in exactly one file.

      Allows one to save an RDD in one file, while keeping the processing +distributed.

This variant of saveAsSingleTextFile performs the storage in a temporary folder instead of directly in the final output folder. This way the risk of having corrupted files in the real output folder due to cluster interruptions is minimized (see the sketch after this member list).

      rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path", classOf[BZip2Codec])
      path

      the path of the produced file

      workingFolder

      the path where file manipulations will temporarily +happen.

      codec

      the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))

      +
    9. + + +

      + + + def + + + saveAsSingleTextFile(path: String, workingFolder: String): Unit + +

      + + Permalink + + +

      Saves an RDD in exactly one file.

      Saves an RDD in exactly one file.

      Allows one to save an RDD in one file, while keeping the processing +distributed.

This variant of saveAsSingleTextFile performs the storage in a temporary folder instead of directly in the final output folder. This way the risk of having corrupted files in the real output folder due to cluster interruptions is minimized.

      rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path")
      path

      the path of the produced file

      workingFolder

      the path where file manipulations will temporarily +happen.

      +
    10. + + +

      + + + def + + + saveAsSingleTextFile(path: String, codec: Class[_ <: CompressionCodec]): Unit + +

      + + Permalink + + +

      Saves an RDD in exactly one file.

      Saves an RDD in exactly one file.

      Allows one to save an RDD in one file, while keeping the processing +distributed.

      rdd.saveAsSingleTextFile("/my/file/path.txt", classOf[BZip2Codec])
      path

      the path of the produced file

      codec

      the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))

      +
    11. + + +

      + + + def + + + saveAsSingleTextFile(path: String): Unit + +

      + + Permalink + + +

      Saves an RDD in exactly one file.

      Saves an RDD in exactly one file.

      Allows one to save an RDD in one file, while keeping the processing +distributed.

      rdd.saveAsSingleTextFile("/my/file/path.txt")
      path

      the path of the produced file

      +
    12. + + +

      + + + def + + + saveAsTextFileAndCoalesce(path: String, finalCoalesceLevel: Int, codec: Class[_ <: CompressionCodec]): Unit + +

      + + Permalink + + +

      Saves as text file, and decreases the nbr of output partitions.

      Saves as text file, and decreases the nbr of output partitions.

      Same as rdd.saveAsTextFile() +, but decreases the nbr of partitions in the output folder before doing +so.

The result is equivalent to rdd.coalesce(x).saveAsTextFile(), but if x is very low, coalesce would make the processing time explode, whereas this method keeps the processing distributed, saves as a text file and only then merges the result into a lower nbr of partitions (see the sketch after this member list).

      rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30, classOf[BZip2Codec])
      path

the folder where the RDD will finally be stored, but spread over only 30 files (where 30 is the value of the finalCoalesceLevel parameter).

      finalCoalesceLevel

      the nbr of files within the folder at the end +of this method.

      codec

      the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))

      +
    13. + + +

      + + + def + + + saveAsTextFileAndCoalesce(path: String, finalCoalesceLevel: Int): Unit + +

      + + Permalink + + +

      Saves as text file, but by decreasing the nbr of partitions of the output.

      Saves as text file, but by decreasing the nbr of partitions of the output.

      Same as rdd.saveAsTextFile() +, but decreases the nbr of partitions in the output folder before doing +so.

The result is equivalent to rdd.coalesce(x).saveAsTextFile(), but if x is very low, coalesce would make the processing time explode, whereas this method keeps the processing distributed, saves as a text file and only then merges the result into a lower nbr of partitions.

      rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30)
      path

the folder where the RDD will finally be stored, but spread over only 30 files (where 30 is the value of the finalCoalesceLevel parameter).

      finalCoalesceLevel

      the nbr of files within the folder at the end +of this method.

      +
    14. + + +

      + + + def + + + toString(): String + +

      + + Permalink + + +
      Definition Classes
      Any
      +
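As announced in the saveAsSingleTextFile and saveAsTextFileAndCoalesce entries above, here is a minimal sketch of how these StringRDDExtensions could be used together; the RDD content, the paths and the GzipCodec choice are assumptions made up for the example, only the two extension methods come from this page.

```scala
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.rdd.RDD
import com.spark_helper.SparkHelper._

// Assumption: sc is the job's SparkContext and records is an RDD[String] built earlier
val records: RDD[String] = sc.parallelize(Seq("data1", "data2", "data3"))

// One single output file; the heavy lifting happens in a working folder so that
// a cluster interruption can't leave a corrupted file at the final location:
records.saveAsSingleTextFile(
  "/my/output/file/path.txt.gz",
  "/my/working/folder/path",
  classOf[GzipCodec])

// Same data, but stored as a folder of exactly 30 compressed part files,
// while keeping the save itself distributed:
records.saveAsTextFileAndCoalesce("/my/output/folder/path", 30, classOf[GzipCodec])
```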
    + + + + + + diff --git a/docs/com/spark_helper/SparkHelper$.html b/docs/com/spark_helper/SparkHelper$.html index f7024b1..25d8011 100644 --- a/docs/com/spark_helper/SparkHelper$.html +++ b/docs/com/spark_helper/SparkHelper$.html @@ -51,19 +51,39 @@

    A facility to deal with RDD/file manipulations based on the Spark API.

The goal is to remove as much highly used low-level code as possible from your spark job and replace it with fully tested methods whose names are self-explanatory/readable.

A few examples:

    // Same as SparkContext.saveAsTextFile, but the result is a single file:
    -SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt")
    -// Same as SparkContext.textFile, but instead of reading one record per
    -// line, it reads records spread over several lines.
    -// This way, xml, json, yml or any multi-line record file format can be used
    -// with Spark:
    -SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n")
    -// Same as SparkContext.textFile, but instead of returning an RDD of
    -// records, it returns an RDD of tuples containing both the record and the
    -// path of the file it comes from:
    -SparkHelper.textFileWithFileName("folder", sparkContext)

    Source import com.spark_helper.SparkHelper._ + +// Same as rdd.saveAsTextFile("path"), but the result is a single file (while +// keeping the processing distributed): +rdd.saveAsSingleTextFile("/my/output/file/path.txt") +rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec]) + +// Same as sc.textFile("path"), but instead of reading one record per line (by +// splitting the input with \n), it splits the file in records based on a custom +// delimiter. This way, xml, json, yml or any multi-line record file format can +// be used with Spark: +sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance + +// Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options: +rdd.flatten + +// Equivalent to sc.textFile(), but for each line is tupled with its file path: +sc.textFileWithFileName("/my/input/folder/path") +// which produces: +// RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"), +// ("folder/file_2.txt", "record1fromfile2"), ...) + +// In the given folder, this generates one file per key in the given key/value +// RDD. Within each file (named from the key) are all values for this key: +rdd.saveAsTextFileByKey("/my/output/folder/path") + +// Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)): +rdd.partialMap { case a if a % 2 == 0 => 2 * a } + +// For when input files contain commas and textFile can't handle it: +sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))
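As a small hedged illustration of the partialMap and flatten pimps listed above, the sketch below shows them on made-up data; only the two extension methods come from SparkHelper, the rest is assumed for the example (in particular that sc is the job's SparkContext).

```scala
import org.apache.spark.rdd.RDD
import com.spark_helper.SparkHelper._

val numbers: RDD[Int] = sc.parallelize(Seq(1, 3, 2, 7, 8))

// partialMap applies the partial function where it is defined and keeps the
// other elements untouched: RDD(1, 3, 2, 7, 8) becomes RDD(1, 3, 4, 7, 16)
val partiallyDoubled = numbers.partialMap { case a if a % 2 == 0 => 2 * a }

// flatten is the equivalent of rdd.flatMap(identity) for RDDs of Options (or Seqs):
val maybeValues: RDD[Option[Int]] = sc.parallelize(Seq(Some(1), None, Some(3)))
val values = maybeValues.flatten // RDD(1, 3)
```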

    Source SparkHelper -

    Since

    2017-02

    +

    Since

    2017-02

    To do

    sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T])

    Linear Supertypes
    Serializable, Serializable, AnyRef, Any
    @@ -102,7 +122,112 @@

    - +
    +

    Type Members

    +
1. implicit class OptionRDDExtensions[T] extends AnyRef
2. implicit final class PairRDDExtensions extends AnyVal
3. implicit class RDDExtensions[T] extends AnyRef
4. implicit class SeqRDDExtensions[T] extends AnyRef
5. implicit final class SparkContextExtensions extends AnyVal
6. implicit final class StringRDDExtensions extends AnyVal
    +
    @@ -199,62 +324,6 @@

    )

    -

  • - - -

    - - - def - - - decreaseCoalescence(highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, finalCoalescenceLevel: Int, sparkContext: SparkContext, compressionCodec: Class[_ <: CompressionCodec]): Unit - -

    - - Permalink - - -

    Decreases the nbr of partitions of a folder.

    Decreases the nbr of partitions of a folder.

    This is often handy when the last step of your job needs to run on -thousands of files, but you want to store your final output on let's say -only 300 files.

    It's like a FileUtil.copyMerge, but the merging produces more than one -file.

    Be aware that this methods deletes the provided input folder.

    SparkHelper.decreaseCoalescence(
    -  "/folder/path/with/2000/files",
    -  "/produced/folder/path/with/only/300/files",
    -  300,
    -  sparkContext,
    -  classOf[BZip2Codec])
    highCoalescenceLevelFolder

    the folder which contains 10000 files

    lowerCoalescenceLevelFolder

    the folder which will contain the same -data as highCoalescenceLevelFolder but spread on only 300 files (where 300 -is the finalCoalescenceLevel parameter).

    finalCoalescenceLevel

    the nbr of files within the folder at the end -of this method.

    sparkContext

    the SparkContext

    compressionCodec

    the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))

    -
  • - - -

    - - - def - - - decreaseCoalescence(highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, finalCoalescenceLevel: Int, sparkContext: SparkContext): Unit - -

    - - Permalink - - -

    Decreases the nbr of partitions of a folder.

    Decreases the nbr of partitions of a folder.

    This is often handy when the last step of your job needs to run on -thousands of files, but you want to store your final output on let's say -only 300 files.

    It's like a FileUtil.copyMerge, but the merging produces more than one -file.

    Be aware that this methods deletes the provided input folder.

    SparkHelper.decreaseCoalescence(
    -  "/folder/path/with/2000/files",
    -  "/produced/folder/path/with/only/300/files",
    -  300,
    -  sparkContext)
    highCoalescenceLevelFolder

    the folder which contains 10000 files

    lowerCoalescenceLevelFolder

    the folder which will contain the same -data as highCoalescenceLevelFolder but spread on only 300 files (where 300 -is the finalCoalescenceLevel parameter).

    finalCoalescenceLevel

    the nbr of files within the folder at the end -of this method.

    sparkContext

    the SparkContext

  • @@ -414,193 +483,6 @@

    Definition Classes
    AnyRef
    -

  • - - -

    - - - def - - - saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String, workingFolder: String, compressionCodec: Class[_ <: CompressionCodec]): Unit - -

    - - Permalink - - -

    Saves an RDD in exactly one file.

    Saves an RDD in exactly one file.

    Allows one to save an RDD in one file, while keeping the processing -parallelized.

    This variant of saveAsSingleTextFile performs the storage in a temporary -folder instead of directly in the final output folder. This way the risks -of having corrupted files in the real output folder due to cluster -interruptions is minimized.

    SparkHelper.saveAsSingleTextFile(
    -  myRddToStore,
    -  "/my/file/path.txt",
    -  "/my/working/folder/path",
    -  classOf[BZip2Codec])
    outputRDD

    the RDD of strings to store in one file

    outputFile

    the path of the produced file

    workingFolder

    the path where file manipulations will temporarily -happen.

    compressionCodec

    the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))

    -
  • - - -

    - - - def - - - saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String, workingFolder: String): Unit - -

    - - Permalink - - -

    Saves an RDD in exactly one file.

    Saves an RDD in exactly one file.

    Allows one to save an RDD in one file, while keeping the processing -parallelized.

    This variant of saveAsSingleTextFile performs the storage in a temporary -folder instead of directly in the final output folder. This way the -risks of having corrupted files in the real output folder due to cluster -interruptions is minimized.

    SparkHelper.saveAsSingleTextFile(
    -  myRddToStore, "/my/file/path.txt", "/my/working/folder/path")
    outputRDD

    the RDD of strings to store in one file

    outputFile

    the path of the produced file

    workingFolder

    the path where file manipulations will temporarily -happen.

    -
  • - - -

    - - - def - - - saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String, compressionCodec: Class[_ <: CompressionCodec]): Unit - -

    - - Permalink - - -

    Saves an RDD in exactly one file.

    Saves an RDD in exactly one file.

    Allows one to save an RDD in one file, while keeping the processing -parallelized.

    SparkHelper.saveAsSingleTextFile(
    -  myRddToStore, "/my/file/path.txt", classOf[BZip2Codec])
    outputRDD

    the RDD of strings to store in one file

    outputFile

    the path of the produced file

    compressionCodec

    the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))

    -
  • - - -

    - - - def - - - saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String): Unit - -

    - - Permalink - - -

    Saves an RDD in exactly one file.

    Saves an RDD in exactly one file.

    Allows one to save an RDD in one file, while keeping the processing -parallelized.

    SparkHelper.saveAsSingleTextFile(myRddToStore, "/my/file/path.txt")
    outputRDD

    the RDD of strings to store in one file

    outputFile

    the path of the produced file

    -
  • - - -

    - - - def - - - saveAsTextFileAndCoalesce(outputRDD: RDD[String], outputFolder: String, finalCoalescenceLevel: Int, compressionCodec: Class[_ <: CompressionCodec]): Unit - -

    - - Permalink - - -

    Saves as text file, but by decreasing the nbr of partitions of the output.

    Saves as text file, but by decreasing the nbr of partitions of the output.

    Same as decreaseCoalescence, but the storage of the RDD in an intermediate -folder is included.

    This still makes the processing parallelized, but the output is coalesced.

    SparkHelper.saveAsTextFileAndCoalesce(
    -  myRddToStore,
    -  "/produced/folder/path/with/only/300/files",
    -  300,
    -  classOf[BZip2Codec])
    outputRDD

    the RDD to store, processed for instance on 10000 tasks -(which would thus be stored as 10000 files).

    outputFolder

    the folder where will finally be stored the RDD but -spread on only 300 files (where 300 is the value of the -finalCoalescenceLevel parameter).

    finalCoalescenceLevel

    the nbr of files within the folder at the end -of this method.

    compressionCodec

    the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))

    -
  • - - -

    - - - def - - - saveAsTextFileAndCoalesce(outputRDD: RDD[String], outputFolder: String, finalCoalescenceLevel: Int): Unit - -

    - - Permalink - - -

    Saves as text file, but by decreasing the nbr of partitions of the output.

    Saves as text file, but by decreasing the nbr of partitions of the output.

    Same as decreaseCoalescence, but the storage of the RDD in an intermediate -folder is included.

    This still makes the processing parallelized, but the output is coalesced.

    SparkHelper.saveAsTextFileAndCoalesce(
    -  myRddToStore, "/produced/folder/path/with/only/300/files", 300)
    outputRDD

    the RDD to store, processed for instance on 10000 tasks -(which would thus be stored as 10000 files).

    outputFolder

    the folder where will finally be stored the RDD but -spread on only 300 files (where 300 is the value of the -finalCoalescenceLevel parameter).

    finalCoalescenceLevel

    the nbr of files within the folder at the end -of this method.

    -
  • - - -

    - - - def - - - saveAsTextFileByKey(keyValueRDD: RDD[(String, String)], outputFolder: String, keyNbr: Int, compressionCodec: Class[_ <: CompressionCodec]): Unit - -

    - - Permalink - - -

    Saves and repartitions a key/value RDD on files whose name is the key.

    Saves and repartitions a key/value RDD on files whose name is the key.

    Within the provided outputFolder, will be one file per key in your -keyValueRDD. And within a file for a given key are only values for this -key.

    You need to know the nbr of keys beforehand (in general you use this to -split your dataset in subsets, or to output one file per client, so you -know how many keys you have). So you need to put as keyNbr the exact nbr -of keys you'll have.

    This is not scalable. This shouldn't be considered for any data flow with -normal or big volumes.

    SparkHelper.saveAsTextFileByKey(
    -  myKeyValueRddToStore, "/my/output/folder/path", 12, classOf[BZip2Codec])
    keyValueRDD

    the key/value RDD

    outputFolder

    the foldder where will be storrred key files

    keyNbr

    the nbr of expected keys (which is the nbr of outputed files)

    compressionCodec

    the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))

    -
  • - - -

    - - - def - - - saveAsTextFileByKey(keyValueRDD: RDD[(String, String)], outputFolder: String, keyNbr: Int): Unit - -

    - - Permalink - - -

    Saves and repartitions a key/value RDD on files whose name is the key.

    Saves and repartitions a key/value RDD on files whose name is the key.

    Within the provided outputFolder, will be one file per key in your -keyValueRDD. And within a file for a given key are only values for this -key.

    You need to know the nbr of keys beforehand (in general you use this to -split your dataset in subsets, or to output one file per client, so you -know how many keys you have). So you need to put as keyNbr the exact nbr -of keys you'll have.

    This is not scalable. This shouldn't be considered for any data flow with -normal or big volumes.

    SparkHelper.saveAsTextFileByKey(
    -  myKeyValueRddToStore, "/my/output/folder/path", 12)
    keyValueRDD

    the key/value RDD

    outputFolder

    the foldder where will be storrred key files

    keyNbr

    the nbr of expected keys (which is the nbr of outputed files)

  • @@ -618,93 +500,6 @@

    Definition Classes
    AnyRef
    -

  • - - -

    - - - def - - - textFileWithDelimiter(hdfsPath: String, sparkContext: SparkContext, delimiter: String, maxRecordLength: String = "1000000"): RDD[String] - -

    - - Permalink - - -

    Equivalent to sparkContext.textFile(), but for a specific record delimiter.

    Equivalent to sparkContext.textFile(), but for a specific record delimiter.

    By default, sparkContext.textFile() will provide one record per line. But -what if the format you want to read considers that one record (one entity) -is stored in more than one line (yml, xml, ...)?

    For instance in order to read a yml file, which is a format for which a -record (a single entity) is spread other several lines, you can modify the -record delimiter with "---\n" instead of "\n". Same goes when reading an -xml file where a record might be spread over several lines or worse the -whole xml file is one line.

    // Let's say data we want to use with Spark looks like this (one record is
    -// a customer, but it's spread over several lines):
    -<Customers>\n
    -<Customer>\n
    -<Address>34 thingy street, someplace, sometown</Address>\n
    -</Customer>\n
    -<Customer>\n
    -<Address>12 thingy street, someplace, sometown</Address>\n
    -</Customer>\n
    -</Customers>
    -//Then you can use it this way:
    -val computedRecords = SparkHelper.textFileWithDelimiter(
    -  "my/path/to/customers.xml", sparkContext, <Customer>\n
    -).collect()
    -val expectedRecords = Array(
    -  <Customers>\n,
    -  (
    -    <Address>34 thingy street, someplace, sometown</Address>\n +
    -    </Customer>\n
    -  ),
    -  (
    -    <Address>12 thingy street, someplace, sometown</Address>\n +
    -    </Customer>\n +
    -    </Customers>
    -  )
    -)
    -assert(computedRecords == expectedRecords)
    hdfsPath

    the path of the file to read (folder or file, '*' works as -well).

    sparkContext

    the SparkContext

    delimiter

    the specific record delimiter which replaces "\n"

    maxRecordLength

    the max length (not sure which unit) of a record -before considering the record too long to fit into memory.

    returns

    the RDD of records

    -
  • - - -

    - - - def - - - textFileWithFileName(hdfsPath: String, sparkContext: SparkContext): RDD[(String, String)] - -

    - - Permalink - - -

    Equivalent to sparkContext.textFile(), but for each line is associated -with its file path.

    Equivalent to sparkContext.textFile(), but for each line is associated -with its file path.

    Produces a RDD[(file_name, line)] which provides a way to know from which -file a given line comes from.

    // Considering this folder:
    -// folder/file_1.txt whose content is data1\ndata2\ndata3
    -// folder/file_2.txt whose content is data4\ndata4
    -// folder/folder_1/file_3.txt whose content is data6\ndata7
    -// then:
    -SparkHelper.textFileWithFileName("folder", sparkContext)
    -// will return:
    -RDD(
    -  ("file:/path/on/machine/folder/file_1.txt", "data1"),
    -  ("file:/path/on/machine/folder/file_1.txt", "data2"),
    -  ("file:/path/on/machine/folder/file_1.txt", "data3"),
    -  ("file:/path/on/machine/folder/file_2.txt", "data4"),
    -  ("file:/path/on/machine/folder/file_2.txt", "data5"),
    -  ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"),
    -  ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7")
    -)
    hdfsPath

    the path of the folder (or structure of folders) to read

    sparkContext

    the SparkContext

    returns

    the RDD of records where a record is a tuple containing the path -of the file the record comes from and the record itself.

  • diff --git a/docs/com/spark_helper/monitoring/Test.html b/docs/com/spark_helper/monitoring/Test.html index f1cd8b7..4246185 100644 --- a/docs/com/spark_helper/monitoring/Test.html +++ b/docs/com/spark_helper/monitoring/Test.html @@ -50,7 +50,7 @@

    A class which represents a KPI to validate.

    This is intended to be used as parameter of Monitor.updateByKpiValidation -and Monitor.updateByKpisValidation methods.

    Some exemples of Test objects:

    Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
    +and Monitor.updateByKpisValidation methods.

    Some examples of Test objects:

    Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
     Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT)
     Test("someNbr", 1235d, EQUAL_TO, 1235d, NBR)
    description

    the name/description of the KPI which will appear on the validation report.

    kpiValue

    the value for this KPI

    thresholdType

    the type of threshold (SUPERIOR_THAN, INFERIOR_THAN or @@ -110,7 +110,7 @@

    Creates a Test object.

    Creates a Test object. -Some exemples of Test objects:

    Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
    +Some examples of Test objects:

    Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
     Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT)
     Test("someNbr", 1235d, EQUAL_TO, 1235d, NBR)
    description

    the name/description of the KPI which will appear on the validation report.

    kpiValue

    the value for this KPI

    thresholdType

    the type of threshold (SUPERIOR_THAN, INFERIOR_THAN or diff --git a/docs/com/spark_helper/monitoring/ThresholdType.html b/docs/com/spark_helper/monitoring/ThresholdType.html index b0c6eef..61447b9 100644 --- a/docs/com/spark_helper/monitoring/ThresholdType.html +++ b/docs/com/spark_helper/monitoring/ThresholdType.html @@ -49,7 +49,7 @@

    -

    An enumeration which represents the type of threshol to use (EQUAL_TO, +

    An enumeration which represents the type of threshold to use (EQUAL_TO, SUPERIOR_THAN or INFERIOR_THAN)

    Linear Supertypes
    AnyRef, Any
    diff --git a/docs/com/spark_helper/monitoring/package.html b/docs/com/spark_helper/monitoring/package.html index f89cf5e..c7c36f9 100644 --- a/docs/com/spark_helper/monitoring/package.html +++ b/docs/com/spark_helper/monitoring/package.html @@ -100,7 +100,7 @@

    A class which represents a KPI to validate.

    A class which represents a KPI to validate.

    This is intended to be used as parameter of Monitor.updateByKpiValidation -and Monitor.updateByKpisValidation methods.

    Some exemples of Test objects:

    Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
    +and Monitor.updateByKpisValidation methods.

    Some examples of Test objects:

    Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
     Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT)
     Test("someNbr", 1235d, EQUAL_TO, 1235d, NBR)
    description

    the name/description of the KPI which will appear on the validation report.

    kpiValue

    the value for this KPI

    thresholdType

    the type of threshold (SUPERIOR_THAN, INFERIOR_THAN or @@ -121,7 +121,7 @@

    Permalink -

    An enumeration which represents the type of threshol to use (EQUAL_TO, +

    An enumeration which represents the type of threshold to use (EQUAL_TO, SUPERIOR_THAN or INFERIOR_THAN)

  • diff --git a/docs/com/spark_helper/package.html b/docs/com/spark_helper/package.html index 0912242..60ce9ba 100644 --- a/docs/com/spark_helper/package.html +++ b/docs/com/spark_helper/package.html @@ -90,14 +90,31 @@

    joda-time).

    A facility which deals with usual date needs (wrapper around joda-time).

The goal is to remove as much highly used low-level code as possible from your spark job and replace it with fully tested methods whose names are self-explanatory/readable.

A few examples:

    assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
    -assert(DateHelper.today() == "20170310") // If today's "20170310"
    -assert(DateHelper.yesterday() == "20170309") // If today's "20170310"
    -assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327")
    -assert(DateHelper.now("HH:mm") == "10:24")
    -assert(DateHelper.currentTimestamp() == "1493105229736")
    -assert(DateHelper.nDaysBefore(3) == "20170307") // If today's "20170310"
    -assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310")

    Source import com.spark_helper.DateHelper + +DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101") +DateHelper.today // "20170310" +DateHelper.yesterday // "20170309" +DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327" +DateHelper.now("HH:mm") // "10:24" +DateHelper.currentTimestamp // "1493105229736" +DateHelper.nDaysBefore(3) // "20170307" +DateHelper.nDaysAfterDate(3, "20170307") // "20170310" +DateHelper.nextDay("20170310") // "20170311" +DateHelper.nbrOfDaysSince("20170302") // 8 +DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5 +DateHelper.dayOfWeek("20160614") // 2 + +import com.spark_helper.DateHelper._ + +2.daysAgo // "20170308" +"20161230" to "20170101" // List("20161230", "20161231", "20170101") +3.daysBefore("20170310") // "20170307" +5.daysAfter // "20170315" +4.daysAfter("20170310") // "20170314" +"20170302".isCompliantWith("yyyyMMdd") +"20170310".nextDay // "20170311" +"20170310".previousDay // "20170309"
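As a small hedged illustration of how these date helpers combine in a daily job, the sketch below builds one hdfs folder path per day of a date range; the path pattern is an assumption made up for the example, only DateHelper.daysBetween comes from the library.

```scala
import com.spark_helper.DateHelper

// One hdfs folder per day between the two dates (bounds included):
val dailyPaths = DateHelper
  .daysBetween("20161230", "20170101")
  .map(day => s"hdfs://data/events/$day")
// List("hdfs://data/events/20161230", "hdfs://data/events/20161231", "hdfs://data/events/20170101")
```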

    Source DateHelper

    Since

    2017-02

  • @@ -122,38 +139,39 @@

    spark job and replace it with methods fully tested whose name is self-explanatory/readable.

For instance, one doesn't want to remove a file from hdfs using 3 lines of code and thus could instead just use -HdfsHelper.deleteFile("my/hdfs/file/path.csv").

    A few exemples:

    import com.spark_helper.HdfsHelper
    +HdfsHelper.deleteFile("my/hdfs/file/path.csv").

    A few examples:

    import com.spark_helper.HdfsHelper
     
     // A bunch of methods wrapping the FileSystem API, such as:
    -HdfsHelper.fileExists("my/hdfs/file/path.txt")
    -assert(HdfsHelper.listFileNamesInFolder("my/folder/path") == List("file_name_1.txt", "file_name_2.csv"))
    -assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
    -assert(HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") == 3)
    -HdfsHelper.deleteFile("my/hdfs/file/path.csv")
    -HdfsHelper.moveFolder("my/hdfs/folder")
    +HdfsHelper.fileExists("my/hdfs/file/path.txt") // HdfsHelper.folderExists("my/hdfs/folder")
    +HdfsHelper.listFileNamesInFolder("my/folder/path") // List("file_name_1.txt", "file_name_2.csv")
    +HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") // "20170306"
    +HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") // 3
    +HdfsHelper.deleteFile("my/hdfs/file/path.csv") // HdfsHelper.deleteFolder("my/hdfs/folder")
    +HdfsHelper.moveFolder("old/path", "new/path") // HdfsHelper.moveFile("old/path.txt", "new/path.txt")
    +HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token") // HdfsHelper.createFolder("my/hdfs/folder")
    +
    +// File content helpers:
     HdfsHelper.compressFile("hdfs/path/to/uncompressed_file.txt", classOf[GzipCodec])
     HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1")
     
     // Some Xml/Typesafe helpers for hadoop as well:
    -HdfsHelper.isHdfsXmlCompliantWithXsd(
    -  "my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd"))
    +HdfsHelper.isHdfsXmlCompliantWithXsd("my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd"))
     HdfsHelper.loadXmlFileFromHdfs("my/hdfs/file/path.xml")
     
    -// Very handy to load a config (typesafe format) stored on hdfs at the
    -// begining of a spark job:
    +// Very handy to load a config (typesafe format) stored on hdfs at the beginning of a spark job:
     HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config
     
    -// In order to write small amount of data in a file on hdfs without the
    -// whole spark stack:
    -HdfsHelper.writeToHdfsFile(
    -  Array("some", "relatively small", "text"),
    -  "/some/hdfs/file/path.txt")
    +// In order to write small amount of data in a file on hdfs without the whole spark stack:
    +HdfsHelper.writeToHdfsFile(Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
    +// or:
    +import com.spark_helper.HdfsHelper._
    +Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
    +"hello world".writeToHdfs("/some/hdfs/file/path.txt")
     
    -// Deletes all files/folders in "hdfs/path/to/folder" for which the
    -// timestamp is older than 10 days:
    +// Deletes all files/folders in "hdfs/path/to/folder" for which the timestamp is older than 10 days:
     HdfsHelper.purgeFolder("hdfs/path/to/folder", 10)

    Source HdfsHelper -

    Since

    2017-02

    +

    Since

    2017-02

    To do

    Create a touch method

  • @@ -170,11 +188,11 @@

    Permalink -

    A logger dedicated to Spak jobs.

    A logger dedicated to Spak jobs.

    It's a simple logger/report which contains a report that one can update from +

    A logger dedicated to Spark jobs.

    A logger dedicated to Spark jobs.

    It's a simple logger/report which contains a report that one can update from the driver and a success state. The idea is to persist job executions logs -and errors (and forget about grepping unreadable yarn logs).

    It's designed for perdiodic spark jobs (handles storage and purge of logs) +and errors (and forget about grepping unreadable yarn logs).

    It's designed for periodic spark jobs (handles storage and purge of logs) and provides a way to handle kpis validation.

    Logs are stored on the go which means one can have a direct real time access -of the job logs/status and it's current state (which can overwise be a pain +of the job logs/status and it's current state (which can otherwise be a pain if it means going through yarn logs, or even for certain production environments going through additional layers of software logs to get to yarn logs).

    One of the issues this logger aims at tackling is the handling of exceptions @@ -183,9 +201,9 @@

    want to perform a few actions before letting the job crash. The idea is thus to surround (driver side) a Spark pipeline within a try catch and redirect the exception to the logger for a clean logging.

    This is a "driver-only" logger and is not intended at logging concurrent -actions from executors.

    Produced reports can easily be inserted in a notification email whenerver +actions from executors.

    Produced reports can easily be inserted in a notification email whenever the job fails, which saves a lot of time to maintainers operating on heavy -production environements.

    The produced persisted report is also a way for downstream jobs to know the +production environments.

    The produced persisted report is also a way for downstream jobs to know the status of their input data.

    Let's go through a simple Spark job example monitored with this Monitor facility:

    Monitor.setTitle("My job title")
     Monitor.addDescription(
    @@ -204,7 +222,7 @@ 

    Test("Nbr of output records", processedData.count(), SUPERIOR_THAN, 10e6d, NBR), Test("Some pct of invalid output", your_complex_kpi, INFERIOR_THAN, 3, PCT) ), - "My pipeline descirption" + "My pipeline description" ) if (outputIsValid) @@ -212,9 +230,9 @@

    } catch { case iie: InvalidInputException => - Monitor.error(iie, "My pipeline descirption", diagnostic = "No input data!") + Monitor.error(iie, "My pipeline description", diagnostic = "No input data!") case e: Throwable => - Monitor.error(e, "My pipeline descirption") // whatever unexpected error + Monitor.error(e, "My pipeline description") // whatever unexpected error } if (Monitor.isSuccess()) { @@ -226,7 +244,7 @@

    // HDFS (this saves the logs in the folder set with Monitor.setLogFolder): Monitor.store() -// At the end of the job, if the job isn't successfull, you might want to +// At the end of the job, if the job isn't successful, you might want to // crash it (for instance to get a notification from your scheduler): if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ...

    At any time during the job, logs can be accessed from file path/to/log/folder/current.ongoing

    If we were to read the stored report after this simple pipeline, here are @@ -234,8 +252,8 @@

    My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining -[10:23-10:23] My pipeline descirption: failed +[10:23] Beginning +[10:23-10:23] My pipeline description: failed Diagnostic: No input data! org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285) @@ -245,18 +263,18 @@

    My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining -[10:23-10:36] My pipeline descirption: failed +[10:23] Beginning +[10:23-10:36] My pipeline description: failed java.lang.NumberFormatException: For input string: "a" java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) java.lang.Integer.parseInt(Integer.java:492) ... -[10:36] Duration: 00:13:47

    Another scenario, successfull spark pipeline and KPIs are valid; all good!:

              My job title
    +[10:36] Duration: 00:13:47

    Another scenario, successful spark pipeline and KPIs are valid; all good!:

              My job title
     
     My job description (whatever you want); for instance:
     Documentation: https://github.com/xavierguihot/spark_helper
    -[10:23] Begining
    -[10:23-10:41] My pipeline descirption: success
    +[10:23] Beginning
    +[10:23-10:41] My pipeline description: success
       KPI: Nbr of output records
         Value: 14669071.0
         Must be superior than 10000000.0
    @@ -268,7 +286,7 @@ 

    [10:41-10:42] My second pipeline description: success [10:42] Duration: 00:19:23

    Source Monitor -

    Since

    2017-02

    +

    Since

    2017-02

    To do

    would a State monad be appropriate?

  • @@ -287,19 +305,39 @@

    A facility to deal with RDD/file manipulations based on the Spark API.

    A facility to deal with RDD/file manipulations based on the Spark API.

The goal is to remove as much highly used low-level code as possible from your spark job and replace it with fully tested methods whose names are self-explanatory/readable.

A few examples:

    // Same as SparkContext.saveAsTextFile, but the result is a single file:
    -SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt")
    -// Same as SparkContext.textFile, but instead of reading one record per
    -// line, it reads records spread over several lines.
    -// This way, xml, json, yml or any multi-line record file format can be used
    -// with Spark:
    -SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n")
    -// Same as SparkContext.textFile, but instead of returning an RDD of
    -// records, it returns an RDD of tuples containing both the record and the
    -// path of the file it comes from:
    -SparkHelper.textFileWithFileName("folder", sparkContext)

    Source import com.spark_helper.SparkHelper._ + +// Same as rdd.saveAsTextFile("path"), but the result is a single file (while +// keeping the processing distributed): +rdd.saveAsSingleTextFile("/my/output/file/path.txt") +rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec]) + +// Same as sc.textFile("path"), but instead of reading one record per line (by +// splitting the input with \n), it splits the file in records based on a custom +// delimiter. This way, xml, json, yml or any multi-line record file format can +// be used with Spark: +sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance + +// Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options: +rdd.flatten + +// Equivalent to sc.textFile(), but for each line is tupled with its file path: +sc.textFileWithFileName("/my/input/folder/path") +// which produces: +// RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"), +// ("folder/file_2.txt", "record1fromfile2"), ...) + +// In the given folder, this generates one file per key in the given key/value +// RDD. Within each file (named from the key) are all values for this key: +rdd.saveAsTextFileByKey("/my/output/folder/path") + +// Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)): +rdd.partialMap { case a if a % 2 == 0 => 2 * a } + +// For when input files contain commas and textFile can't handle it: +sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))
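To make the saveAsTextFileByKey pimp listed above concrete, here is a minimal sketch splitting a small key/value dataset into one file per client; the data and the output path are assumptions made up for the example (and sc is assumed to be the job's SparkContext).

```scala
import org.apache.spark.rdd.RDD
import com.spark_helper.SparkHelper._

val eventsByClient: RDD[(String, String)] = sc.parallelize(
  Seq(("client_1", "event_a"), ("client_2", "event_b"), ("client_1", "event_c")))

// Produces one file per key under /my/output/folder/path (client_1, client_2),
// each file containing only the values of its key:
eventsByClient.saveAsTextFileByKey("/my/output/folder/path")
```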

    Source SparkHelper -

    Since

    2017-02

    +

    Since

    2017-02

    To do

    sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T])

  • diff --git a/docs/index.html b/docs/index.html index eb6e8e8..4ec3b1d 100644 --- a/docs/index.html +++ b/docs/index.html @@ -25,7 +25,7 @@
    -
    #ABCDEFGHIJKLMNOPQRSTUVWXYZdeprecated
    +
    #ABCDEFGHIJKLMNOPQRSTUVWXYZdeprecated
    @@ -42,6 +42,18 @@
  • +
  • + org +
      +
      1. + org.apache +
          +
          1. + org.apache.spark +
            1. (object)
              TextFileOverwrite
            +
            +
          +
      2. diff --git a/docs/index.js b/docs/index.js index 8f335a2..6668d87 100644 --- a/docs/index.js +++ b/docs/index.js @@ -1 +1 @@ -Index.PACKAGES = {"com" : [], "com.spark_helper" : [{"object" : "com\/spark_helper\/DateHelper$.html", "name" : "com.spark_helper.DateHelper"}, {"object" : "com\/spark_helper\/HdfsHelper$.html", "name" : "com.spark_helper.HdfsHelper"}, {"object" : "com\/spark_helper\/Monitor$.html", "name" : "com.spark_helper.Monitor"}, {"object" : "com\/spark_helper\/SparkHelper$.html", "name" : "com.spark_helper.SparkHelper"}], "com.spark_helper.monitoring" : [{"object" : "com\/spark_helper\/monitoring\/EQUAL_TO$.html", "name" : "com.spark_helper.monitoring.EQUAL_TO"}, {"object" : "com\/spark_helper\/monitoring\/INFERIOR_THAN$.html", "name" : "com.spark_helper.monitoring.INFERIOR_THAN"}, {"trait" : "com\/spark_helper\/monitoring\/KpiType.html", "name" : "com.spark_helper.monitoring.KpiType"}, {"object" : "com\/spark_helper\/monitoring\/NBR$.html", "name" : "com.spark_helper.monitoring.NBR"}, {"object" : "com\/spark_helper\/monitoring\/PCT$.html", "name" : "com.spark_helper.monitoring.PCT"}, {"object" : "com\/spark_helper\/monitoring\/SUPERIOR_THAN$.html", "name" : "com.spark_helper.monitoring.SUPERIOR_THAN"}, {"case class" : "com\/spark_helper\/monitoring\/Test.html", "name" : "com.spark_helper.monitoring.Test"}, {"trait" : "com\/spark_helper\/monitoring\/ThresholdType.html", "name" : "com.spark_helper.monitoring.ThresholdType"}]}; \ No newline at end of file +Index.PACKAGES = {"com.spark_helper.monitoring" : [{"object" : "com\/spark_helper\/monitoring\/EQUAL_TO$.html", "name" : "com.spark_helper.monitoring.EQUAL_TO"}, {"object" : "com\/spark_helper\/monitoring\/INFERIOR_THAN$.html", "name" : "com.spark_helper.monitoring.INFERIOR_THAN"}, {"trait" : "com\/spark_helper\/monitoring\/KpiType.html", "name" : "com.spark_helper.monitoring.KpiType"}, {"object" : "com\/spark_helper\/monitoring\/NBR$.html", "name" : "com.spark_helper.monitoring.NBR"}, {"object" : "com\/spark_helper\/monitoring\/PCT$.html", "name" : "com.spark_helper.monitoring.PCT"}, {"object" : "com\/spark_helper\/monitoring\/SUPERIOR_THAN$.html", "name" : "com.spark_helper.monitoring.SUPERIOR_THAN"}, {"case class" : "com\/spark_helper\/monitoring\/Test.html", "name" : "com.spark_helper.monitoring.Test"}, {"trait" : "com\/spark_helper\/monitoring\/ThresholdType.html", "name" : "com.spark_helper.monitoring.ThresholdType"}], "org.apache.spark" : [{"object" : "org\/apache\/spark\/TextFileOverwrite$.html", "name" : "org.apache.spark.TextFileOverwrite"}], "org.apache" : [], "com.spark_helper" : [{"object" : "com\/spark_helper\/DateHelper$.html", "name" : "com.spark_helper.DateHelper"}, {"object" : "com\/spark_helper\/HdfsHelper$.html", "name" : "com.spark_helper.HdfsHelper"}, {"object" : "com\/spark_helper\/Monitor$.html", "name" : "com.spark_helper.Monitor"}, {"object" : "com\/spark_helper\/SparkHelper$.html", "name" : "com.spark_helper.SparkHelper"}], "org" : [], "com" : []}; \ No newline at end of file diff --git a/docs/index/index-a.html b/docs/index/index-a.html index 557f98b..5d8c458 100644 --- a/docs/index/index-a.html +++ b/docs/index/index-a.html @@ -16,6 +16,9 @@
        addDescription
        +
        +
        apache
        +
        appendFooter
        diff --git a/docs/index/index-d.html b/docs/index/index-d.html index 860d355..a6dd1d1 100644 --- a/docs/index/index-d.html +++ b/docs/index/index-d.html @@ -19,12 +19,21 @@
        dayOfWeek
        +
        +
        daysAfter
        + +
        +
        daysAgo
        + +
        +
        daysBefore
        +
        daysBetween
        decreaseCoalescence
        - +
        deleteFile
        diff --git a/docs/index/index-f.html b/docs/index/index-f.html index e17cd19..3e21430 100644 --- a/docs/index/index-f.html +++ b/docs/index/index-f.html @@ -19,6 +19,9 @@
        fileModificationDateTime
        +
        +
        flatten
        +
        folderExists
        diff --git a/docs/index/index-i.html b/docs/index/index-i.html index 07f867e..d1f7b53 100644 --- a/docs/index/index-i.html +++ b/docs/index/index-i.html @@ -13,6 +13,15 @@
        INFERIOR_THAN
        +
        +
        IntExtensions
        + +
        +
        int
        + +
        +
        isCompliantWith
        +
        isDateCompliantWithFormat
        diff --git a/docs/index/index-n.html b/docs/index/index-n.html index 914674a..128fca2 100644 --- a/docs/index/index-n.html +++ b/docs/index/index-n.html @@ -13,6 +13,9 @@
        NBR
        +
        +
        nDaysAfter
        +
        nDaysAfterDate
        @@ -36,7 +39,7 @@
        nextDay
        - +
        now
        diff --git a/docs/index/index-o.html b/docs/index/index-o.html new file mode 100644 index 0000000..4a559d5 --- /dev/null +++ b/docs/index/index-o.html @@ -0,0 +1,20 @@ + + + + + + + + + + + + +
        +
        OptionRDDExtensions
        + +
        +
        org
        + +
        + diff --git a/docs/index/index-p.html b/docs/index/index-p.html index c004302..bdee856 100644 --- a/docs/index/index-p.html +++ b/docs/index/index-p.html @@ -13,9 +13,15 @@
        PCT
        +
        +
        PairRDDExtensions
        + +
        +
        partialMap
        +
        previousDay
        - +
        purgeFolder
        diff --git a/docs/index/index-r.html b/docs/index/index-r.html index 8ef3476..317ec21 100644 --- a/docs/index/index-r.html +++ b/docs/index/index-r.html @@ -11,6 +11,12 @@
        +
        RDDExtensions
        + +
        +
        rdd
        + +
        reformatDate
        diff --git a/docs/index/index-s.html b/docs/index/index-s.html index f70ca30..c0e0953 100644 --- a/docs/index/index-s.html +++ b/docs/index/index-s.html @@ -11,32 +11,86 @@
        +
        SPARK_BRANCH
        + +
        +
        SPARK_BUILD_DATE
        + +
        +
        SPARK_BUILD_USER
        + +
        +
        SPARK_REPO_URL
        + +
        +
        SPARK_REVISION
        + +
        +
        SPARK_VERSION
        + +
        SUPERIOR_THAN
        +
        +
        SeqExtensions
        + +
        +
        SeqRDDExtensions
        + +
        +
        SparkContextExtensions
        +
        SparkHelper
        -
        saveAsSingleTextFile
        +
        StringExtensions
        + +
        +
        StringRDDExtensions
        +
        +
        saveAsSingleTextFile
        +
        saveAsTextFileAndCoalesce
        - +
        saveAsTextFileByKey
        - + +
        +
        sc
        + +
        +
        seq
        + +
        +
        setConf
        + +
        +
        setFileSystem
        + +
        +
        setFormat
        +
        setLogFolder
        setTitle
        +
        +
        spark
        +
        spark_helper
        store
        +
        +
        string
        +
        success
        diff --git a/docs/index/index-t.html b/docs/index/index-t.html index bb8d1a4..c58eef1 100644 --- a/docs/index/index-t.html +++ b/docs/index/index-t.html @@ -13,18 +13,24 @@
        Test
        +
        +
        TextFileOverwrite
        +
        ThresholdType
        -
        textFileWithDelimiter
        - +
        textFile
        +
        textFileWithFileName
        - +
        thresholdType
        +
        +
        to
        +
        toString
        diff --git a/docs/index/index-w.html b/docs/index/index-w.html index 231f199..f7f4a02 100644 --- a/docs/index/index-w.html +++ b/docs/index/index-w.html @@ -13,6 +13,9 @@
        withPurge
        +
        +
        writeToHdfs
        +
        writeToHdfsFile
        diff --git a/docs/org/apache/package.html b/docs/org/apache/package.html new file mode 100644 index 0000000..724d793 --- /dev/null +++ b/docs/org/apache/package.html @@ -0,0 +1,118 @@ + + + + apache - org.apache + + + + + + + + + + + + + + + +
        + Package +

        org

        +

        apache

        + + Permalink + + +
        + +

        + + + package + + + apache + +

        + +
        + + +
        +
        + + +
        + Visibility +
        1. Public
        2. All
        +
        +
        + +
        +
        + + + + + + +
        +

        Value Members

        +
        1. + + +

          + + + package + + + spark + +

          + + Permalink + + + +
        +
        + + + + +
        + +
        + + +
        + +
        +
        +

        Ungrouped

        + +
        +
        + +
        + +
        + + + + + + diff --git a/docs/org/apache/spark/TextFileOverwrite$.html b/docs/org/apache/spark/TextFileOverwrite$.html new file mode 100644 index 0000000..908c792 --- /dev/null +++ b/docs/org/apache/spark/TextFileOverwrite$.html @@ -0,0 +1,499 @@ + + + + TextFileOverwrite - org.apache.spark.TextFileOverwrite + + + + + + + + + + + + + + + +
        + Object +

        org.apache.spark

        +

        TextFileOverwrite

        Related Doc: + package spark +

        + + Permalink + + +
        + +

        + + + object + + + TextFileOverwrite + +

        + +
        + Linear Supertypes +
        AnyRef, Any
        +
        + + +
        +
        +
        + Ordering +
          + +
        1. Alphabetic
        2. +
        3. By Inheritance
        4. +
        +
        +
        + Inherited
        +
        +
          +
        1. TextFileOverwrite
        2. AnyRef
        3. Any
        4. +
        +
        + +
          +
        1. Hide All
        2. +
        3. Show All
        4. +
        +
        +
        + Visibility +
        1. Public
        2. All
        +
        +
        + +
        +
        + + + + + + +
        +

        Value Members

        +
        1. + + +

          + + final + def + + + !=(arg0: Any): Boolean + +

          + + Permalink + + +
          Definition Classes
          AnyRef → Any
          +
        2. + + +

          + + final + def + + + ##(): Int + +

          + + Permalink + + +
          Definition Classes
          AnyRef → Any
          +
        3. + + +

          + + final + def + + + ==(arg0: Any): Boolean + +

          + + Permalink + + +
          Definition Classes
          AnyRef → Any
          +
        4. + + +

          + + final + def + + + asInstanceOf[T0]: T0 + +

          + + Permalink + + +
          Definition Classes
          Any
          +
        5. + + +

          + + + def + + + clone(): AnyRef + +

          + + Permalink + + +
          Attributes
          protected[java.lang]
          Definition Classes
          AnyRef
          Annotations
          + @throws( + + ... + ) + +
          +
        6. + + +

          + + final + def + + + eq(arg0: AnyRef): Boolean + +

          + + Permalink + + +
          Definition Classes
          AnyRef
          +
        7. + + +

          + + + def + + + equals(arg0: Any): Boolean + +

          + + Permalink + + +
          Definition Classes
          AnyRef → Any
          +
        8. + + +

          + + + def + + + finalize(): Unit + +

          + + Permalink + + +
          Attributes
          protected[java.lang]
          Definition Classes
          AnyRef
          Annotations
          + @throws( + + classOf[java.lang.Throwable] + ) + +
          +
        9. + + +

          + + final + def + + + getClass(): Class[_] + +

          + + Permalink + + +
          Definition Classes
          AnyRef → Any
          +
        10. + + +

          + + + def + + + hashCode(): Int + +

          + + Permalink + + +
          Definition Classes
          AnyRef → Any
          +
        11. + + +

          + + final + def + + + isInstanceOf[T0]: Boolean + +

          + + Permalink + + +
          Definition Classes
          Any
          +
        12. + + +

          + + final + def + + + ne(arg0: AnyRef): Boolean + +

          + + Permalink + + +
          Definition Classes
          AnyRef
          +
        13. + + +

          + + final + def + + + notify(): Unit + +

          + + Permalink + + +
          Definition Classes
          AnyRef
          +
        14. + + +

          + + final + def + + + notifyAll(): Unit + +

          + + Permalink + + +
          Definition Classes
          AnyRef
          +
        15. + + +

          + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + +

          + + Permalink + + +
          Definition Classes
          AnyRef
          +
        16. + + +

          + + + def + + + textFile(paths: Seq[String], minPartitions: Int, sc: SparkContext): RDD[String] + +

          + + Permalink + + + +
        17. + + +

          + + + def + + + toString(): String + +

          + + Permalink + + +
          Definition Classes
          AnyRef → Any
          +
        18. + + +

          + + final + def + + + wait(): Unit + +

          + + Permalink + + +
          Definition Classes
          AnyRef
          Annotations
          + @throws( + + ... + ) + +
          +
        19. + + +

          + + final + def + + + wait(arg0: Long, arg1: Int): Unit + +

          + + Permalink + + +
          Definition Classes
          AnyRef
          Annotations
          + @throws( + + ... + ) + +
          +
        20. + + +

          + + final + def + + + wait(arg0: Long): Unit + +

          + + Permalink + + +
          Definition Classes
          AnyRef
          Annotations
          + @throws( + + ... + ) + +
          +
        +
        + + + + +
        + +
        +
        +

        Inherited from AnyRef

        +
        +

        Inherited from Any

        +
        + +
        + +
        +
        +

        Ungrouped

        + +
        +
        + +
        + +
        + + + + + + diff --git a/docs/org/apache/spark/package.html b/docs/org/apache/spark/package.html new file mode 100644 index 0000000..9ae6205 --- /dev/null +++ b/docs/org/apache/spark/package.html @@ -0,0 +1,246 @@ + + + + spark - org.apache.spark + + + + + + + + + + + + + + + +
        + Package +

        org.apache

        +

        spark

        + + Permalink + + +
        + +

        + + + package + + + spark + +

        + +
        + Linear Supertypes +
        AnyRef, Any
        +
        + + +
        +
        +
        + Ordering +
          + +
        1. Alphabetic
        2. +
        3. By Inheritance
        4. +
        +
        +
        + Inherited
        +
        +
          +
        1. spark
        2. AnyRef
        3. Any
        4. +
        +
        + +
          +
        1. Hide All
        2. +
        3. Show All
        4. +
        +
        +
        + Visibility +
        1. Public
        2. All
        +
        +
        + +
        +
        + + + + + + +
        +

        Value Members

        +
        1. + + +

          + + + val + + + SPARK_BRANCH: String + +

          + + Permalink + + + +
        2. + + +

          + + + val + + + SPARK_BUILD_DATE: String + +

          + + Permalink + + + +
        3. + + +

          + + + val + + + SPARK_BUILD_USER: String + +

          + + Permalink + + + +
        4. + + +

          + + + val + + + SPARK_REPO_URL: String + +

          + + Permalink + + + +
        5. + + +

          + + + val + + + SPARK_REVISION: String + +

          + + Permalink + + + +
        6. + + +

          + + + val + + + SPARK_VERSION: String + +

          + + Permalink + + + +
        7. + + +

          + + + object + + + TextFileOverwrite + +

          + + Permalink + + + +
        +
        + + + + +
        + +
        +
        +

        Inherited from AnyRef

        +
        +

        Inherited from Any

        +
        + +
        + +
        +
        +

        Ungrouped

        + +
        +
        + +
        + +
diff --git a/docs/org/package.html b/docs/org/package.html new file mode 100644 index 0000000..329e4fd --- /dev/null +++ b/docs/org/package.html @@ -0,0 +1,118 @@
[generated Scaladoc page for package org: lists the apache sub-package]
diff --git a/docs/package.html b/docs/package.html index b941069..a6a120c 100644 --- a/docs/package.html +++ b/docs/package.html @@ -87,6 +87,23 @@
[generated Scaladoc root package index updated to list the new org package]
      4. diff --git a/src/main/scala/com/spark_helper/DateHelper.scala b/src/main/scala/com/spark_helper/DateHelper.scala index c2af405..229a648 100644 --- a/src/main/scala/com/spark_helper/DateHelper.scala +++ b/src/main/scala/com/spark_helper/DateHelper.scala @@ -12,17 +12,34 @@ import scala.util.Try * spark job and replace it with methods fully tested whose name is * self-explanatory/readable. * - * A few exemples: + * A few examples: * * {{{ - * assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101")) - * assert(DateHelper.today() == "20170310") // If today's "20170310" - * assert(DateHelper.yesterday() == "20170309") // If today's "20170310" - * assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327") - * assert(DateHelper.now("HH:mm") == "10:24") - * assert(DateHelper.currentTimestamp() == "1493105229736") - * assert(DateHelper.nDaysBefore(3) == "20170307") // If today's "20170310" - * assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310") + * import com.spark_helper.DateHelper + * + * DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101") + * DateHelper.today // "20170310" + * DateHelper.yesterday // "20170309" + * DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327" + * DateHelper.now("HH:mm") // "10:24" + * DateHelper.currentTimestamp // "1493105229736" + * DateHelper.nDaysBefore(3) // "20170307" + * DateHelper.nDaysAfterDate(3, "20170307") // "20170310" + * DateHelper.nextDay("20170310") // "20170311" + * DateHelper.nbrOfDaysSince("20170302") // 8 + * DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5 + * DateHelper.dayOfWeek("20160614") // 2 + * + * import com.spark_helper.DateHelper._ + * + * 2.daysAgo // "20170308" + * "20161230" to "20170101" // List("20161230", "20161231", "20170101") + * 3.daysBefore("20170310") // "20170307" + * 5.daysAfter // "20170315" + * 4.daysAfter("20170310") // "20170314" + * "20170302".isCompliantWith("yyyyMMdd") + * "20170310".nextDay // "20170311" + * "20170310".previousDay // "20170309" * }}} * * Source HdfsHelper * + * @todo Create a touch method * @author Xavier Guihot * @since 2017-02 */ object HdfsHelper extends Serializable { + private var conf = new Configuration() + private var hdfs = FileSystem.get(conf) + + /** Sets a specific Configuration + * used by the underlying FileSystem + * in case it requires some specificities. + * + * If this setter is not used, the default Configuration is set with + * new Configuration(). + * + * @param configuration the specific Configuration to use + */ + def setConf(configuration: Configuration): Unit = { + conf = configuration + hdfs = FileSystem.get(configuration) + } + + /** Sets a specific FileSystem + * in case it requires some specificities. + * + * If this setter is not used, the default FileSystem is set with + * FileSystem.get(new Configuration()). + * + * @param fileSystem the specific FileSystem to use + */ + def setFileSystem(fileSystem: FileSystem): Unit = hdfs = fileSystem + + implicit class SeqExtensions[T <: Seq[String]: ClassTag](val seq: T) { + + /** Saves list elements in a file on hdfs. + * + * Please only consider this way of storing data when the data set is small + * enough. + * + * Overwrites the file if it already exists. 
+ * + * {{{ + * Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt") + * List("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt") + * }}} + * + * @param filePath the path of the file in which to write the content of + * the List. + */ + def writeToHdfs(filePath: String): Unit = + HdfsHelper.writeToHdfsFile(seq, filePath) + } + + implicit class StringExtensions(val string: String) { + + /** Saves the String in a file on hdfs. + * + * Overwrites the file if it already exists. + * + * {{{ "some\nrelatively small\ntext".writeToHdfsFile("/some/hdfs/file/path.txt") }}} + * + * @param filePath the path of the file in which to write the String + */ + def writeToHdfs(filePath: String): Unit = + HdfsHelper.writeToHdfsFile(string, filePath) + } + /** Deletes a file on HDFS. * * Doesn't throw an exception if the file to delete doesn't exist. @@ -85,17 +151,16 @@ object HdfsHelper extends Serializable { */ def deleteFile(hdfsPath: String): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val fileToDelete = new Path(hdfsPath) - if (fileSystem.exists(fileToDelete)) { + if (hdfs.exists(fileToDelete)) { require( - fileSystem.isFile(fileToDelete), - "to delete a folder, prefer using the deleteFolder() method.") + hdfs.isFile(fileToDelete), + "to delete a folder, prefer using the deleteFolder() method." + ) - fileSystem.delete(fileToDelete, true) + hdfs.delete(fileToDelete, true) } } @@ -107,17 +172,16 @@ object HdfsHelper extends Serializable { */ def deleteFolder(hdfsPath: String): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val folderToDelete = new Path(hdfsPath) - if (fileSystem.exists(folderToDelete)) { + if (hdfs.exists(folderToDelete)) { require( - !fileSystem.isFile(folderToDelete), - "to delete a file, prefer using the deleteFile() method.") + !hdfs.isFile(folderToDelete), + "to delete a file, prefer using the deleteFile() method." + ) - fileSystem.delete(folderToDelete, true) + hdfs.delete(folderToDelete, true) } } @@ -127,8 +191,7 @@ object HdfsHelper extends Serializable { * * @param hdfsPath the path of the folder to create */ - def createFolder(hdfsPath: String): Unit = - FileSystem.get(new Configuration()).mkdirs(new Path(hdfsPath)) + def createFolder(hdfsPath: String): Unit = hdfs.mkdirs(new Path(hdfsPath)) /** Checks if the file exists. * @@ -137,16 +200,15 @@ object HdfsHelper extends Serializable { */ def fileExists(hdfsPath: String): Boolean = { - val fileSystem = FileSystem.get(new Configuration()) - val fileToCheck = new Path(hdfsPath) - if (fileSystem.exists(fileToCheck)) + if (hdfs.exists(fileToCheck)) require( - fileSystem.isFile(fileToCheck), - "to check if a folder exists, prefer using the folderExists() method.") + hdfs.isFile(fileToCheck), + "to check if a folder exists, prefer using the folderExists() method." + ) - fileSystem.exists(fileToCheck) + hdfs.exists(fileToCheck) } /** Checks if the folder exists. @@ -156,16 +218,15 @@ object HdfsHelper extends Serializable { */ def folderExists(hdfsPath: String): Boolean = { - val fileSystem = FileSystem.get(new Configuration()) - val folderToCheck = new Path(hdfsPath) - if (fileSystem.exists(folderToCheck)) + if (hdfs.exists(folderToCheck)) require( - !fileSystem.isFile(folderToCheck), - "to check if a file exists, prefer using the fileExists() method.") + !hdfs.isFile(folderToCheck), + "to check if a file exists, prefer using the fileExists() method." 
+ ) - fileSystem.exists(folderToCheck) + hdfs.exists(folderToCheck) } /** Moves/renames a file. @@ -184,30 +245,30 @@ object HdfsHelper extends Serializable { overwrite: Boolean = false ): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val fileToRename = new Path(oldPath) val renamedFile = new Path(newPath) - if (fileSystem.exists(fileToRename)) + if (hdfs.exists(fileToRename)) require( - fileSystem.isFile(fileToRename), - "to move a folder, prefer using the moveFolder() method.") + hdfs.isFile(fileToRename), + "to move a folder, prefer using the moveFolder() method." + ) if (overwrite) - fileSystem.delete(renamedFile, true) + hdfs.delete(renamedFile, true) else require( - !fileSystem.exists(renamedFile), + !hdfs.exists(renamedFile), "overwrite option set to false, but a file already exists at target " + - "location " + newPath) + "location " + newPath + ) // Before moving the file to its final destination, we check if the folder // where to put the file exists, and if not we create it: val targetContainerFolder = newPath.split("/").init.mkString("/") createFolder(targetContainerFolder) - fileSystem.rename(fileToRename, renamedFile) + hdfs.rename(fileToRename, renamedFile) } /** Moves/renames a folder. @@ -226,36 +287,36 @@ object HdfsHelper extends Serializable { overwrite: Boolean = false ): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val folderToRename = new Path(oldPath) val renamedFolder = new Path(newPath) - if (fileSystem.exists(folderToRename)) + if (hdfs.exists(folderToRename)) require( - !fileSystem.isFile(folderToRename), - "to move a file, prefer using the moveFile() method.") + !hdfs.isFile(folderToRename), + "to move a file, prefer using the moveFile() method." + ) if (overwrite) - fileSystem.delete(renamedFolder, true) + hdfs.delete(renamedFolder, true) else require( - !fileSystem.exists(renamedFolder), + !hdfs.exists(renamedFolder), "overwrite option set to false, but a folder already exists at target " + - "location " + newPath) + "location " + newPath + ) // Before moving the folder to its final destination, we check if the folder // where to put the folder exists, and if not we create it: val targetContainerFolder = newPath.split("/").init.mkString("/") createFolder(targetContainerFolder) - fileSystem.rename(folderToRename, new Path(newPath)) + hdfs.rename(folderToRename, new Path(newPath)) } /** Creates an empty file on hdfs. * - * Might be usefull for token files. For instance a file which is only used - * as a timestamp token of the last update of a processus, or a file which + * Might be useful for token files. For instance a file which is only used + * as a timestamp token of the last update of a process, or a file which * blocks the execution of an other instance of the same job, ... * * Overwrites the file if it already exists. @@ -272,7 +333,7 @@ object HdfsHelper extends Serializable { * @param filePath the path of the empty file to create */ def createEmptyHdfsFile(filePath: String): Unit = - FileSystem.get(new Configuration()).create(new Path(filePath)).close() + hdfs.create(new Path(filePath)).close() /** Saves text in a file when content is too small to really require an RDD. 
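The hunks above replace the repeated FileSystem.get(new Configuration()) calls with the cached hdfs handle that the new setConf / setFileSystem setters reconfigure. A minimal sketch of plugging in a custom Configuration before calling the helpers, assuming for illustration a local-filesystem setup (the "fs.defaultFS" value is only an example):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

import com.spark_helper.HdfsHelper

// Build a custom Hadoop configuration (illustrative value):
val conf = new Configuration()
conf.set("fs.defaultFS", "file:///") // e.g. point the helpers at the local file system in a test

// Every subsequent HdfsHelper call reuses this configuration / file system:
HdfsHelper.setConf(conf)
// or, equivalently, inject the FileSystem directly:
HdfsHelper.setFileSystem(FileSystem.get(conf))

HdfsHelper.createFolder("target/hdfs-helper-test")
HdfsHelper.fileExists("target/hdfs-helper-test/part-00000") // false until something is written there
```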
* @@ -289,10 +350,7 @@ object HdfsHelper extends Serializable { * @param filePath the path of the file in which to write the content */ def writeToHdfsFile(content: String, filePath: String): Unit = { - - val outputFile = - FileSystem.get(new Configuration()).create(new Path(filePath)) - + val outputFile = hdfs.create(new Path(filePath)) outputFile.write(content.getBytes("UTF-8")) outputFile.close() } @@ -311,7 +369,7 @@ object HdfsHelper extends Serializable { * List("some", "relatively small", "text"), "/some/hdfs/file/path.txt") * }}} * - * @param content the array of strings to write in the file as one line per + * @param content the seq of strings to write in the file as one line per * string (this takes care of joining strings with "\n"s). * @param filePath the path of the file in which to write the content */ @@ -337,11 +395,9 @@ object HdfsHelper extends Serializable { onlyName: Boolean = true ): List[String] = { - FileSystem - .get(new Configuration()) + hdfs .listStatus(new Path(hdfsPath)) - .flatMap(status => { - + .flatMap { status => // If it's a file: if (status.isFile) { if (onlyName) List(status.getPath.getName) @@ -351,12 +407,13 @@ object HdfsHelper extends Serializable { else if (recursive) listFileNamesInFolder( hdfsPath + "/" + status.getPath.getName, - true, - onlyName) + recursive = true, + onlyName + ) // If it's a dir and we're not in a recursive option: else Nil - }) + } .toList .sorted } @@ -371,8 +428,7 @@ object HdfsHelper extends Serializable { * @return the list of folder names in the specified folder */ def listFolderNamesInFolder(hdfsPath: String): List[String] = - FileSystem - .get(new Configuration()) + hdfs .listStatus(new Path(hdfsPath)) .filter(!_.isFile) .map(_.getPath.getName) @@ -386,13 +442,9 @@ object HdfsHelper extends Serializable { * @return the joda DateTime of the last modification of the given file */ def fileModificationDateTime(hdfsPath: String): DateTime = - new DateTime( - FileSystem - .get(new Configuration()) - .getFileStatus(new Path(hdfsPath)) - .getModificationTime()) + new DateTime(hdfs.getFileStatus(new Path(hdfsPath)).getModificationTime) - /** Returns the stringified date of the last modification of the given file. + /** Returns the formatted date of the last modification of the given file. * * {{{ * assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306") @@ -402,7 +454,7 @@ object HdfsHelper extends Serializable { * modification date. * @param format (default = "yyyyMMdd") the format under which to get the * modification date. - * @return the stringified date of the last modification of the given file, + * @return the formatted date of the last modification of the given file, * under the provided format. */ def fileModificationDate( @@ -420,7 +472,7 @@ object HdfsHelper extends Serializable { def folderModificationDateTime(hdfsPath: String): DateTime = fileModificationDateTime(hdfsPath) - /** Returns the stringified date of the last modification of the given folder. + /** Returns the formatted date of the last modification of the given folder. * * {{{ * assert(HdfsHelper.folderModificationDate("my/hdfs/folder") == "20170306") @@ -430,7 +482,7 @@ object HdfsHelper extends Serializable { * modification date. * @param format (default = "yyyyMMdd") the format under which to get the * modification date. - * @return the stringified date of the last modification of the given folder, + * @return the formatted date of the last modification of the given folder, * under the provided format. 
*/ def folderModificationDate( @@ -452,17 +504,17 @@ object HdfsHelper extends Serializable { def nbrOfDaysSinceFileWasLastModified(hdfsPath: String): Int = Days .daysBetween(fileModificationDateTime(hdfsPath), new DateTime()) - .getDays() + .getDays /** Appends a header and a footer to a file. * - * Usefull when creating an xml file with spark and you need to add top level + * Useful when creating an xml file with spark and you need to add top level * tags. * * If the workingFolderPath parameter is provided, then the processing is * done in a working/tmp folder and then only, the final file is moved to its * final real location. This way, in case of cluster instability, i.e. in - * case the Spark job is interupted, this avoids having a temporary or + * case the Spark job is interrupted, this avoids having a temporary or * corrupted file in output. * * @param filePath the path of the file for which to add the header and the @@ -485,13 +537,13 @@ object HdfsHelper extends Serializable { /** Appends a header to a file. * - * Usefull when creating a csv file with spark and you need to add a header + * Useful when creating a csv file with spark and you need to add a header * describing the different fields. * * If the workingFolderPath parameter is provided, then the processing is * done in a working/tmp folder and then only, the final file is moved to its * final real location. This way, in case of cluster instability, i.e. in - * case the Spark job is interupted, this avoids having a temporary or + * case the Spark job is interrupted, this avoids having a temporary or * corrupted file in output. * * @param filePath the path of the file for which to add the header @@ -514,7 +566,7 @@ object HdfsHelper extends Serializable { * If the workingFolderPath parameter is provided, then the processing is * done in a working/tmp folder and then only, the final file is moved to its * final real location. This way, in case of cluster instability, i.e. in - * case the Spark job is interupted, this avoids having a temporary or + * case the Spark job is interrupted, this avoids having a temporary or * corrupted file in output. * * @param filePath the path of the file for which to add the footer @@ -546,7 +598,7 @@ object HdfsHelper extends Serializable { validateHdfsXmlWithXsd(hdfsXmlPath, xsdFile) true } catch { - case saxe: SAXException => false + case _: SAXException => false } /** Validates an XML file on hdfs in regard to the given XSD. @@ -562,9 +614,7 @@ object HdfsHelper extends Serializable { */ def validateHdfsXmlWithXsd(hdfsXmlPath: String, xsdFile: URL): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - - val xmlFile = new StreamSource(fileSystem.open(new Path(hdfsXmlPath))) + val xmlFile = new StreamSource(hdfs.open(new Path(hdfsXmlPath))) val schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI) @@ -574,7 +624,7 @@ object HdfsHelper extends Serializable { validator.validate(xmlFile) } - /** Loads a typesafe config from Hdfs. + /** Loads a Typesafe config from Hdfs. * * The best way to load the configuration of your job from hdfs. * @@ -602,15 +652,12 @@ object HdfsHelper extends Serializable { * } * }}} * - * @param hdfsConfigPath the absolute path of the typesafe config file on - * hdfs we want to load as a typesafe Config object. + * @param hdfsConfigPath the absolute path of the Typesafe config file on + * hdfs we want to load as a Typesafe Config object. 
* @return the com.typesafe.config.Config object which contains usable data */ def loadTypesafeConfigFromHdfs(hdfsConfigPath: String): Config = { - - val reader = new InputStreamReader( - FileSystem.get(new Configuration()).open(new Path(hdfsConfigPath))) - + val reader = new InputStreamReader(hdfs.open(new Path(hdfsConfigPath))) try { ConfigFactory.parseReader(reader) } finally { reader.close() } } @@ -623,10 +670,7 @@ object HdfsHelper extends Serializable { * @return the scala.xml.Elem object */ def loadXmlFileFromHdfs(hdfsXmlPath: String): Elem = { - - val reader = new InputStreamReader( - FileSystem.get(new Configuration()).open(new Path(hdfsXmlPath))) - + val reader = new InputStreamReader(hdfs.open(new Path(hdfsXmlPath))) try { XML.load(reader) } finally { reader.close() } } @@ -655,22 +699,19 @@ object HdfsHelper extends Serializable { deleteInputFile: Boolean = true ): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val ClassOfGzip = classOf[GzipCodec] val ClassOfBZip2 = classOf[BZip2Codec] val outputPath = compressionCodec match { - case ClassOfGzip => inputPath + ".gz" - case ClassOfBZip2 => inputPath + ".bz2" + case ClassOfGzip => s"$inputPath.gz" + case ClassOfBZip2 => s"$inputPath.bz2" } - val inputStream = fileSystem.open(new Path(inputPath)) - val outputStream = fileSystem.create(new Path(outputPath)) + val inputStream = hdfs.open(new Path(inputPath)) + val outputStream = hdfs.create(new Path(outputPath)) // The compression code: - val codec = new CompressionCodecFactory(new Configuration()) - .getCodec(new Path(outputPath)) + val codec = new CompressionCodecFactory(conf).getCodec(new Path(outputPath)) // We include the compression codec to the output stream: val compressedOutputStream = codec.createOutputStream(outputStream) @@ -678,7 +719,7 @@ object HdfsHelper extends Serializable { IOUtils.copyBytes( inputStream, compressedOutputStream, - new Configuration(), + conf, false ) } finally { @@ -707,16 +748,16 @@ object HdfsHelper extends Serializable { require( purgeAge >= 0, - "the purgeAge provided \"" + purgeAge.toString + "\" must be superior to 0.") + "the purgeAge provided \"" + purgeAge.toString + "\" must be superior to 0." 
+ ) - FileSystem - .get(new Configuration()) + hdfs .listStatus(new Path(folderPath)) .filter(path => { val fileAgeInDays = Days - .daysBetween(new DateTime(path.getModificationTime()), new DateTime()) - .getDays() + .daysBetween(new DateTime(path.getModificationTime), new DateTime()) + .getDays fileAgeInDays >= purgeAge @@ -744,22 +785,20 @@ object HdfsHelper extends Serializable { workingFolderPath: String ): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val tmpOutputPath = workingFolderPath match { - case "" => filePath + ".tmp" - case _ => workingFolderPath + "/xml.tmp" + case "" => s"$filePath.tmp" + case _ => s"$workingFolderPath/xml.tmp" } deleteFile(tmpOutputPath) - val inputFile = fileSystem.open(new Path(filePath)) - val tmpOutputFile = fileSystem.create(new Path(tmpOutputPath)) + val inputFile = hdfs.open(new Path(filePath)) + val tmpOutputFile = hdfs.create(new Path(tmpOutputPath)) // If there is an header, we add it to the file: header.foreach(h => tmpOutputFile.write((h + "\n").getBytes("UTF-8"))) try { - IOUtils.copyBytes(inputFile, tmpOutputFile, new Configuration(), false) + IOUtils.copyBytes(inputFile, tmpOutputFile, conf, false) } finally { inputFile.close() } diff --git a/src/main/scala/com/spark_helper/Monitor.scala b/src/main/scala/com/spark_helper/Monitor.scala index 3866cbb..060787c 100644 --- a/src/main/scala/com/spark_helper/Monitor.scala +++ b/src/main/scala/com/spark_helper/Monitor.scala @@ -6,19 +6,17 @@ import java.util.Calendar import org.apache.commons.lang3.time.DurationFormatUtils -import java.lang.Throwable - -/** A logger dedicated to Spak jobs. +/** A logger dedicated to Spark jobs. * * It's a simple logger/report which contains a report that one can update from * the driver and a success state. The idea is to persist job executions logs * and errors (and forget about grepping unreadable yarn logs). * - * It's designed for perdiodic spark jobs (handles storage and purge of logs) + * It's designed for periodic spark jobs (handles storage and purge of logs) * and provides a way to handle kpis validation. * * Logs are stored on the go which means one can have a direct real time access - * of the job logs/status and it's current state (which can overwise be a pain + * of the job logs/status and it's current state (which can otherwise be a pain * if it means going through yarn logs, or even for certain production * environments going through additional layers of software logs to get to yarn * logs). @@ -33,9 +31,9 @@ import java.lang.Throwable * This is a "driver-only" logger and is not intended at logging concurrent * actions from executors. * - * Produced reports can easily be inserted in a notification email whenerver + * Produced reports can easily be inserted in a notification email whenever * the job fails, which saves a lot of time to maintainers operating on heavy - * production environements. + * production environments. * * The produced persisted report is also a way for downstream jobs to know the * status of their input data. 
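Since the persisted report doubles as a status flag for downstream jobs (as described just above), a downstream job can probe the log folder before reading the upstream output. A small sketch, assuming the upstream job persisted its report with Monitor.setLogFolder("hdfs/logs/upstream") and that a successful run leaves a current.success file there (the exact file name and extension are assumptions):

```scala
import com.spark_helper.HdfsHelper

// Hypothetical folder where the upstream job's Monitor report is persisted:
val upstreamLogFolder = "hdfs/logs/upstream"

// The "current.success" name is assumed for illustration; adapt it to the real report extension:
val upstreamSucceeded = HdfsHelper.fileExists(s"$upstreamLogFolder/current.success")

if (!upstreamSucceeded)
  throw new IllegalStateException("upstream job did not finish successfully, aborting")
```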
@@ -61,7 +59,7 @@ import java.lang.Throwable * Test("Nbr of output records", processedData.count(), SUPERIOR_THAN, 10e6d, NBR), * Test("Some pct of invalid output", your_complex_kpi, INFERIOR_THAN, 3, PCT) * ), - * "My pipeline descirption" + * "My pipeline description" * ) * * if (outputIsValid) @@ -69,9 +67,9 @@ import java.lang.Throwable * * } catch { * case iie: InvalidInputException => - * Monitor.error(iie, "My pipeline descirption", diagnostic = "No input data!") + * Monitor.error(iie, "My pipeline description", diagnostic = "No input data!") * case e: Throwable => - * Monitor.error(e, "My pipeline descirption") // whatever unexpected error + * Monitor.error(e, "My pipeline description") // whatever unexpected error * } * * if (Monitor.isSuccess()) { @@ -83,7 +81,7 @@ import java.lang.Throwable * // HDFS (this saves the logs in the folder set with Monitor.setLogFolder): * Monitor.store() * - * // At the end of the job, if the job isn't successfull, you might want to + * // At the end of the job, if the job isn't successful, you might want to * // crash it (for instance to get a notification from your scheduler): * if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ... * }}} @@ -100,8 +98,8 @@ import java.lang.Throwable * * My job description (whatever you want); for instance: * Documentation: https://github.com/xavierguihot/spark_helper - * [10:23] Begining - * [10:23-10:23] My pipeline descirption: failed + * [10:23] Beginning + * [10:23-10:23] My pipeline description: failed * Diagnostic: No input data! * org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path * at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285) @@ -116,8 +114,8 @@ import java.lang.Throwable * * My job description (whatever you want); for instance: * Documentation: https://github.com/xavierguihot/spark_helper - * [10:23] Begining - * [10:23-10:36] My pipeline descirption: failed + * [10:23] Beginning + * [10:23-10:36] My pipeline description: failed * java.lang.NumberFormatException: For input string: "a" * java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) * java.lang.Integer.parseInt(Integer.java:492) @@ -125,14 +123,14 @@ import java.lang.Throwable * [10:36] Duration: 00:13:47 * }}} * - * Another scenario, successfull spark pipeline and KPIs are valid; all good!: + * Another scenario, successful spark pipeline and KPIs are valid; all good!: * {{{ * My job title * * My job description (whatever you want); for instance: * Documentation: https://github.com/xavierguihot/spark_helper - * [10:23] Begining - * [10:23-10:41] My pipeline descirption: success + * [10:23] Beginning + * [10:23-10:41] My pipeline description: success * KPI: Nbr of output records * Value: 14669071.0 * Must be superior than 10000000.0 @@ -148,6 +146,7 @@ import java.lang.Throwable * Source Monitor * + * @todo would a State monad be appropriate? 
* @author Xavier Guihot * @since 2017-02 */ @@ -159,13 +158,13 @@ object Monitor { private var logDirectory: Option[String] = None private var purgeWindow: Option[Int] = None - private val jobStart = DateHelper.now("[HH:mm]") + " Begining" + private val jobStart = DateHelper.now("[HH:mm]") + " Beginning" // Join of reportTitle, pointsOfContact, reportDescription, logDirectory and // jobStart: private var reportHeader = buildReportHeader() - private val begining = Calendar.getInstance().getTimeInMillis() + private val beginning = Calendar.getInstance().getTimeInMillis private var lastReportUpdate = DateHelper.now("HH:mm") /** Sets the report's title. @@ -175,7 +174,7 @@ object Monitor { * {{{ * // Using: * Monitor.setReportTitle("My Simple Job") - * // Produces this at the begining of the report: + * // Produces this at the beginning of the report: * " My Simple Job" * "" * }}} @@ -196,7 +195,7 @@ object Monitor { * // Using: * Monitor.setReportTitle("My Simple Job") * Monitor.addContacts(List("x.guihot@gmail.com", "smbdy@gmail.com")) - * // Produces this at the begining of the report: + * // Produces this at the beginning of the report: * " My Simple Job" * "" * "Point of contact: x.guihot@gmail.com, smbdy@gmail.com" @@ -218,7 +217,7 @@ object Monitor { * // Using: * Monitor.setReportTitle("My Simple Job") * Monitor.addDescription("Documentation: https://github.com/xavierguihot/spark_helper") - * // Produces this at the begining of the report: + * // Produces this at the beginning of the report: * " My Simple Job" * "" * "Documentation: https://github.com/xavierguihot/spark_helper" @@ -269,7 +268,7 @@ object Monitor { * * @return if your spark job is successful. */ - def isSuccess(): Boolean = successful + def isSuccess: Boolean = successful /** Returns the current state of the monitoring report. * @@ -286,7 +285,7 @@ object Monitor { * * @param text the text to append to the report */ - def log(text: String): Unit = log(text, true) + def log(text: String): Unit = log(text, withTimestamp = true) /** Updates the report with some text and a success. * @@ -316,7 +315,7 @@ object Monitor { * will result in this to be appended to the report: * {{{ "[10:35-10:37] Some text: failure\n" }}} * - * Once the monitoring is a failure, then whatever following successfull + * Once the monitoring is a failure, then whatever following successful * action won't change the failed status of the monitoring. * * @param taskDescription the text to append to the report @@ -342,12 +341,12 @@ object Monitor { * {{{ * monitor.error( * invalidInputException, - * "My pipeline descirption", + * "My pipeline description", * diagnostic = "No input data!") * }}} * will result in this to be appended to the report: * {{{ - * [10:23-10:24] My pipeline descirption: failed + * [10:23-10:24] My pipeline description: failed * Diagnostic: No input data! 
* org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path * at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285) @@ -370,7 +369,7 @@ object Monitor { successful = false val serializedException = - "\t\t" + exception.toString() + "\n" + + "\t\t" + exception.toString + "\n" + exception.getStackTrace.map(line => s"\t\t$line").mkString("\n") val update = List( @@ -433,14 +432,13 @@ object Monitor { if (!testsAreValid) successful = false - val seriralizedTests = tests.mkString("\n") + val serializedTests = tests.mkString("\n") val update = testSuitName match { - case "" => seriralizedTests - case _ => { + case "" => serializedTests + case _ => val status = if (testsAreValid) "success" else "failed" - s"$testSuitName: $status\n$seriralizedTests" - } + s"$testSuitName: $status\n$serializedTests" } log(update) @@ -506,11 +504,10 @@ object Monitor { logDirectory match { - case Some(logFolder) => { - + case Some(logFolder) => // We add the job duration to the report: val jobDuration = DurationFormatUtils.formatDuration( - Calendar.getInstance().getTimeInMillis() - begining, + Calendar.getInstance().getTimeInMillis - beginning, "HH:mm:ss") var now = DateHelper.now("[HH:mm]") @@ -532,13 +529,13 @@ object Monitor { .writeToHdfsFile(finalReport, s"$logFolder/current.$reportExtension") purgeWindow.foreach(window => purgeOutdatedLogs(logFolder, window)) - } case None => require( logDirectory.nonEmpty, "to save the report, please specify the log folder using " + - "Monitor.setLogFolder(\"hdfs/path/to/log/folder\")") + "Monitor.setLogFolder(\"hdfs/path/to/log/folder\")" + ) } } @@ -583,20 +580,17 @@ object Monitor { /** Updates the current stored version of logs in file * logFolder/current.ongoing */ private def storeCurrent(): Unit = - logDirectory.foreach { - case logFolder => { - - val warning = - "WARNING: If this file exists it does not necessarily mean that " + - "your job is still running. This file might persist if your job " + - "has been killed and thus couldn't reach your call to the " + - "Monitor.store()." + logDirectory.foreach { logFolder => + val warning = + "WARNING: If this file exists it does not necessarily mean that " + + "your job is still running. This file might persist if your job " + + "has been killed and thus couldn't reach your call to the " + + "Monitor.store()." 
- val ongoingReport = - s"$reportHeader\n$report\n$warning" + val ongoingReport = + s"$reportHeader\n$report\n$warning" - HdfsHelper.writeToHdfsFile(ongoingReport, s"$logFolder/current.ongoing") - } + HdfsHelper.writeToHdfsFile(ongoingReport, s"$logFolder/current.ongoing") } private def purgeOutdatedLogs(logFolder: String, window: Int): Unit = { diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 3f612eb..9d05d96 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -1,5 +1,6 @@ package com.spark_helper +import org.apache.spark.TextFileOverwrite import org.apache.spark.{HashPartitioner, SparkContext} import org.apache.spark.rdd.{RDD, HadoopRDD} import org.apache.hadoop.conf.Configuration @@ -10,6 +11,8 @@ import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.hadoop.mapred.{FileSplit, TextInputFormat => TextInputFormat2} +import scala.reflect.ClassTag + import scala.util.Random /** A facility to deal with RDD/file manipulations based on the Spark API. @@ -18,611 +21,768 @@ import scala.util.Random * spark job and replace it with methods fully tested whose name is * self-explanatory/readable. * - * A few exemples: + * A few examples: * * {{{ - * // Same as SparkContext.saveAsTextFile, but the result is a single file: - * SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt") - * // Same as SparkContext.textFile, but instead of reading one record per - * // line, it reads records spread over several lines. - * // This way, xml, json, yml or any multi-line record file format can be used - * // with Spark: - * SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n") - * // Same as SparkContext.textFile, but instead of returning an RDD of - * // records, it returns an RDD of tuples containing both the record and the - * // path of the file it comes from: - * SparkHelper.textFileWithFileName("folder", sparkContext) + * import com.spark_helper.SparkHelper._ + * + * // Same as rdd.saveAsTextFile("path"), but the result is a single file (while + * // keeping the processing distributed): + * rdd.saveAsSingleTextFile("/my/output/file/path.txt") + * rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec]) + * + * // Same as sc.textFile("path"), but instead of reading one record per line (by + * // splitting the input with \n), it splits the file in records based on a custom + * // delimiter. This way, xml, json, yml or any multi-line record file format can + * // be used with Spark: + * sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance + * + * // Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options: + * rdd.flatten + * + * // Equivalent to sc.textFile(), but for each line is tupled with its file path: + * sc.textFileWithFileName("/my/input/folder/path") + * // which produces: + * // RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"), + * // ("folder/file_2.txt", "record1fromfile2"), ...) + * + * // In the given folder, this generates one file per key in the given key/value + * // RDD. 
Within each file (named from the key) are all values for this key: + * rdd.saveAsTextFileByKey("/my/output/folder/path") + * + * // Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)): + * rdd.partialMap { case a if a % 2 == 0 => 2 * a } + * + * // For when input files contain commas and textFile can't handle it: + * sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) * }}} * * Source SparkHelper * + * @todo sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T]) * @author Xavier Guihot * @since 2017-02 */ object SparkHelper extends Serializable { - /** Saves an RDD in exactly one file. - * - * Allows one to save an RDD in one file, while keeping the processing - * parallelized. - * - * {{{ SparkHelper.saveAsSingleTextFile(myRddToStore, "/my/file/path.txt") }}} - * - * @param outputRDD the RDD of strings to store in one file - * @param outputFile the path of the produced file - */ - def saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String): Unit = - saveAsSingleTextFileInternal(outputRDD, outputFile, None) - - /** Saves an RDD in exactly one file. - * - * Allows one to save an RDD in one file, while keeping the processing - * parallelized. - * - * {{{ - * SparkHelper.saveAsSingleTextFile( - * myRddToStore, "/my/file/path.txt", classOf[BZip2Codec]) - * }}} - * - * @param outputRDD the RDD of strings to store in one file - * @param outputFile the path of the produced file - * @param compressionCodec the type of compression to use (for instance - * classOf[BZip2Codec] or classOf[GzipCodec])) - */ - def saveAsSingleTextFile( - outputRDD: RDD[String], - outputFile: String, - compressionCodec: Class[_ <: CompressionCodec] - ): Unit = - saveAsSingleTextFileInternal(outputRDD, outputFile, Some(compressionCodec)) - - /** Saves an RDD in exactly one file. - * - * Allows one to save an RDD in one file, while keeping the processing - * parallelized. - * - * This variant of saveAsSingleTextFile performs the storage in a temporary - * folder instead of directly in the final output folder. This way the - * risks of having corrupted files in the real output folder due to cluster - * interruptions is minimized. - * - * {{{ - * SparkHelper.saveAsSingleTextFile( - * myRddToStore, "/my/file/path.txt", "/my/working/folder/path") - * }}} - * - * @param outputRDD the RDD of strings to store in one file - * @param outputFile the path of the produced file - * @param workingFolder the path where file manipulations will temporarily - * happen. - */ - def saveAsSingleTextFile( - outputRDD: RDD[String], - outputFile: String, - workingFolder: String - ): Unit = - saveAsSingleTextFileWithWorkingFolderInternal( - outputRDD, - outputFile, - workingFolder, - None) - - /** Saves an RDD in exactly one file. - * - * Allows one to save an RDD in one file, while keeping the processing - * parallelized. - * - * This variant of saveAsSingleTextFile performs the storage in a temporary - * folder instead of directly in the final output folder. This way the risks - * of having corrupted files in the real output folder due to cluster - * interruptions is minimized. - * - * {{{ - * SparkHelper.saveAsSingleTextFile( - * myRddToStore, - * "/my/file/path.txt", - * "/my/working/folder/path", - * classOf[BZip2Codec]) - * }}} - * - * @param outputRDD the RDD of strings to store in one file - * @param outputFile the path of the produced file - * @param workingFolder the path where file manipulations will temporarily - * happen. 
- * @param compressionCodec the type of compression to use (for instance - * classOf[BZip2Codec] or classOf[GzipCodec])) - */ - def saveAsSingleTextFile( - outputRDD: RDD[String], - outputFile: String, - workingFolder: String, - compressionCodec: Class[_ <: CompressionCodec] - ): Unit = - saveAsSingleTextFileWithWorkingFolderInternal( - outputRDD, - outputFile, - workingFolder, - Some(compressionCodec)) - - /** Equivalent to sparkContext.textFile(), but for a specific record delimiter. - * - * By default, sparkContext.textFile() will provide one record per line. But - * what if the format you want to read considers that one record (one entity) - * is stored in more than one line (yml, xml, ...)? - * - * For instance in order to read a yml file, which is a format for which a - * record (a single entity) is spread other several lines, you can modify the - * record delimiter with "---\n" instead of "\n". Same goes when reading an - * xml file where a record might be spread over several lines or worse the - * whole xml file is one line. - * - * {{{ - * // Let's say data we want to use with Spark looks like this (one record is - * // a customer, but it's spread over several lines): - * \n - * \n - *
        34 thingy street, someplace, sometown
        \n - *
        \n - * \n - *
        12 thingy street, someplace, sometown
        \n - *
        \n - *
        - * //Then you can use it this way: - * val computedRecords = SparkHelper.textFileWithDelimiter( - * "my/path/to/customers.xml", sparkContext, \n - * ).collect() - * val expectedRecords = Array( - * \n, - * ( - *
        34 thingy street, someplace, sometown
        \n + - *
        \n - * ), - * ( - *
        12 thingy street, someplace, sometown
        \n + - * \n + - * - * ) - * ) - * assert(computedRecords == expectedRecords) - * }}} - * - * @param hdfsPath the path of the file to read (folder or file, '*' works as - * well). - * @param sparkContext the SparkContext - * @param delimiter the specific record delimiter which replaces "\n" - * @param maxRecordLength the max length (not sure which unit) of a record - * before considering the record too long to fit into memory. - * @return the RDD of records - */ - def textFileWithDelimiter( - hdfsPath: String, - sparkContext: SparkContext, - delimiter: String, - maxRecordLength: String = "1000000" - ): RDD[String] = { - - val conf = new Configuration(sparkContext.hadoopConfiguration) - - // This configuration sets the record delimiter: - conf.set("textinputformat.record.delimiter", delimiter) - - // and this one limits the size of one record. This is necessary in order to - // avoid reading from a corrupted file from which a record could be too long - // to fit in memory. This way, when reading a corrupted file, this will - // throw an exception (java.io.IOException - thus catchable) rather than - // having a messy out of memory which will stop the sparkContext: - conf.set("mapreduce.input.linerecordreader.line.maxlength", maxRecordLength) - - sparkContext - .newAPIHadoopFile( - hdfsPath, - classOf[TextInputFormat], - classOf[LongWritable], - classOf[Text], - conf - ) - .map { case (_, text) => text.toString } + implicit class RDDExtensions[T: ClassTag](val rdd: RDD[T]) { + + /** Map an RDD to the same type, by applying a partial function and the + * identity otherwise. + * + * Avoids having case x => x. + * + * Similar idea to .collect, + * but instead of skipping non-matching items, it keeps them as-is. + * + * {{{ + * sc.parallelize(Array(1, 3, 2, 7, 8)).partialMap { case a if a % 2 == 0 => 2 * a } + * // is equivalent to: + * sc.parallelize(Array(1, 3, 2, 7, 8)).map { + * case a if a % 2 == 0 => 2 * a + * case a => a + * } + * // in order to map to: + * sc.parallelize(Array(1, 3, 4, 7, 16)) + * }}} + * + * @param pf the partial function to apply + * @return an rdd of the same type, for which each element is either the + * application of the partial function where defined or the identity. + */ + def partialMap(pf: PartialFunction[T, T]): RDD[T] = + rdd.map { + case x if pf.isDefinedAt(x) => pf(x) + case x => x + } } - /** Saves and repartitions a key/value RDD on files whose name is the key. - * - * Within the provided outputFolder, will be one file per key in your - * keyValueRDD. And within a file for a given key are only values for this - * key. - * - * You need to know the nbr of keys beforehand (in general you use this to - * split your dataset in subsets, or to output one file per client, so you - * know how many keys you have). So you need to put as keyNbr the exact nbr - * of keys you'll have. - * - * This is not scalable. This shouldn't be considered for any data flow with - * normal or big volumes. 
- * - * {{{ - * SparkHelper.saveAsTextFileByKey( - * myKeyValueRddToStore, "/my/output/folder/path", 12) - * }}} - * - * @param keyValueRDD the key/value RDD - * @param outputFolder the foldder where will be storrred key files - * @param keyNbr the nbr of expected keys (which is the nbr of outputed files) - */ - def saveAsTextFileByKey( - keyValueRDD: RDD[(String, String)], - outputFolder: String, - keyNbr: Int - ): Unit = { - - HdfsHelper.deleteFolder(outputFolder) - - keyValueRDD - .partitionBy(new HashPartitioner(keyNbr)) - .saveAsHadoopFile( - outputFolder, - classOf[String], - classOf[String], - classOf[KeyBasedOutput] + implicit class StringRDDExtensions(val rdd: RDD[String]) extends AnyVal { + + /** Saves an RDD in exactly one file. + * + * Allows one to save an RDD in one file, while keeping the processing + * distributed. + * + * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt") }}} + * + * @param path the path of the produced file + */ + def saveAsSingleTextFile(path: String): Unit = + SparkHelper.saveAsSingleTextFileInternal(rdd, path, None) + + /** Saves an RDD in exactly one file. + * + * Allows one to save an RDD in one file, while keeping the processing + * distributed. + * + * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", classOf[BZip2Codec]) }}} + * + * @param path the path of the produced file + * @param codec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def saveAsSingleTextFile( + path: String, + codec: Class[_ <: CompressionCodec] + ): Unit = + SparkHelper.saveAsSingleTextFileInternal(rdd, path, Some(codec)) + + /** Saves an RDD in exactly one file. + * + * Allows one to save an RDD in one file, while keeping the processing + * distributed. + * + * This variant of saveAsSingleTextFile + * performs the storage in a temporary folder instead of directly in the + * final output folder. This way the risks of having corrupted files in the + * real output folder due to cluster interruptions is minimized. + * + * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path") }}} + * + * @param path the path of the produced file + * @param workingFolder the path where file manipulations will temporarily + * happen. + */ + def saveAsSingleTextFile(path: String, workingFolder: String): Unit = + SparkHelper.saveAsSingleTextFileWithWorkingFolderInternal( + rdd, + path, + workingFolder, + None ) - } - /** Saves and repartitions a key/value RDD on files whose name is the key. - * - * Within the provided outputFolder, will be one file per key in your - * keyValueRDD. And within a file for a given key are only values for this - * key. - * - * You need to know the nbr of keys beforehand (in general you use this to - * split your dataset in subsets, or to output one file per client, so you - * know how many keys you have). So you need to put as keyNbr the exact nbr - * of keys you'll have. - * - * This is not scalable. This shouldn't be considered for any data flow with - * normal or big volumes. 
- * - * {{{ - * SparkHelper.saveAsTextFileByKey( - * myKeyValueRddToStore, "/my/output/folder/path", 12, classOf[BZip2Codec]) - * }}} - * - * @param keyValueRDD the key/value RDD - * @param outputFolder the foldder where will be storrred key files - * @param keyNbr the nbr of expected keys (which is the nbr of outputed files) - * @param compressionCodec the type of compression to use (for instance - * classOf[BZip2Codec] or classOf[GzipCodec])) - */ - def saveAsTextFileByKey( - keyValueRDD: RDD[(String, String)], - outputFolder: String, - keyNbr: Int, - compressionCodec: Class[_ <: CompressionCodec] - ): Unit = { + /** Saves an RDD in exactly one file. + * + * Allows one to save an RDD in one file, while keeping the processing + * distributed. + * + * This variant of saveAsSingleTextFile + * performs the storage in a temporary folder instead of directly in the + * final output folder. This way the risks of having corrupted files in the + * real output folder due to cluster interruptions is minimized. + * + * {{{ + * rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path", classOf[BZip2Codec]) + * }}} + * + * @param path the path of the produced file + * @param workingFolder the path where file manipulations will temporarily + * happen. + * @param codec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def saveAsSingleTextFile( + path: String, + workingFolder: String, + codec: Class[_ <: CompressionCodec] + ): Unit = + SparkHelper.saveAsSingleTextFileWithWorkingFolderInternal( + rdd, + path, + workingFolder, + Some(codec) + ) - HdfsHelper.deleteFolder(outputFolder) + /** Saves as text file, but by decreasing the nbr of partitions of the output. + * + * Same as rdd.saveAsTextFile() + * , but decreases the nbr of partitions in the output folder before doing + * so. + * + * The result is equivalent to rdd.coalesce(x).saveAsTextFile() + * , but if x + * is very low, coalesce + * would make the processing time explode, whereas this methods keeps the + * processing distributed, save as text file and then only merges the + * result in a lower nbr of partitions. + * + * {{{ rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30) }}} + * + * @param path the folder where will finally be stored the RDD but spread + * on only 30 files (where 30 is the value of the finalCoalesceLevel + * parameter). + * @param finalCoalesceLevel the nbr of files within the folder at the end + * of this method. + */ + def saveAsTextFileAndCoalesce( + path: String, + finalCoalesceLevel: Int + ): Unit = { + + // We remove folders where to store data in case they already exist: + HdfsHelper.deleteFolder(s"${path}_tmp") + HdfsHelper.deleteFolder(path) + + // We first save the rdd with the level of coalescence used during the + // processing. This way the processing is done with the right level of + // tasks: + rdd.saveAsTextFile(s"${path}_tmp") + + // Then we read back this tmp folder, apply the coalesce and store it back: + SparkHelper.decreaseCoalescenceInternal( + s"${path}_tmp", + path, + finalCoalesceLevel, + rdd.context, + None + ) + } - keyValueRDD - .partitionBy(new HashPartitioner(keyNbr)) - .saveAsHadoopFile( - outputFolder, - classOf[String], - classOf[String], - classOf[KeyBasedOutput], - compressionCodec + /** Saves as text file, and decreases the nbr of output partitions. + * + * Same as rdd.saveAsTextFile() + * , but decreases the nbr of partitions in the output folder before doing + * so. 
+ * + * The result is equivalent to rdd.coalesce(x).saveAsTextFile() + * , but if x + * is very low, coalesce + * would make the processing time explode, whereas this methods keeps the + * processing distributed, save as text file and then only merges the + * result in a lower nbr of partitions. + * + * {{{ rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30, classOf[BZip2Codec]) }}} + * + * @param path the folder where will finally be stored the RDD but spread + * on only 30 files (where 30 is the value of the finalCoalesceLevel + * parameter). + * @param finalCoalesceLevel the nbr of files within the folder at the end + * of this method. + * @param codec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def saveAsTextFileAndCoalesce( + path: String, + finalCoalesceLevel: Int, + codec: Class[_ <: CompressionCodec] + ): Unit = { + + // We remove folders where to store data in case they already exist: + HdfsHelper.deleteFolder(s"${path}_tmp") + HdfsHelper.deleteFolder(path) + + // We first save the rdd with the level of coalescence used during the + // processing. This way the processing is done with the right level of + // tasks: + rdd.saveAsTextFile(s"${path}_tmp") + + // Then we read back this tmp folder, apply the coalesce and store it back: + decreaseCoalescenceInternal( + s"${path}_tmp", + path, + finalCoalesceLevel, + rdd.context, + Some(codec) ) + } } - /** Decreases the nbr of partitions of a folder. - * - * This is often handy when the last step of your job needs to run on - * thousands of files, but you want to store your final output on let's say - * only 300 files. - * - * It's like a FileUtil.copyMerge, but the merging produces more than one - * file. - * - * Be aware that this methods deletes the provided input folder. - * - * {{{ - * SparkHelper.decreaseCoalescence( - * "/folder/path/with/2000/files", - * "/produced/folder/path/with/only/300/files", - * 300, - * sparkContext) - * }}} - * - * @param highCoalescenceLevelFolder the folder which contains 10000 files - * @param lowerCoalescenceLevelFolder the folder which will contain the same - * data as highCoalescenceLevelFolder but spread on only 300 files (where 300 - * is the finalCoalescenceLevel parameter). - * @param finalCoalescenceLevel the nbr of files within the folder at the end - * of this method. - * @param sparkContext the SparkContext - */ - def decreaseCoalescence( - highCoalescenceLevelFolder: String, - lowerCoalescenceLevelFolder: String, - finalCoalescenceLevel: Int, - sparkContext: SparkContext - ): Unit = - decreaseCoalescenceInternal( - highCoalescenceLevelFolder, - lowerCoalescenceLevelFolder, - finalCoalescenceLevel, - sparkContext, - None) - - /** Decreases the nbr of partitions of a folder. - * - * This is often handy when the last step of your job needs to run on - * thousands of files, but you want to store your final output on let's say - * only 300 files. - * - * It's like a FileUtil.copyMerge, but the merging produces more than one - * file. - * - * Be aware that this methods deletes the provided input folder. 
- * - * {{{ - * SparkHelper.decreaseCoalescence( - * "/folder/path/with/2000/files", - * "/produced/folder/path/with/only/300/files", - * 300, - * sparkContext, - * classOf[BZip2Codec]) - * }}} - * - * @param highCoalescenceLevelFolder the folder which contains 10000 files - * @param lowerCoalescenceLevelFolder the folder which will contain the same - * data as highCoalescenceLevelFolder but spread on only 300 files (where 300 - * is the finalCoalescenceLevel parameter). - * @param finalCoalescenceLevel the nbr of files within the folder at the end - * of this method. - * @param sparkContext the SparkContext - * @param compressionCodec the type of compression to use (for instance - * classOf[BZip2Codec] or classOf[GzipCodec])) - */ - def decreaseCoalescence( - highCoalescenceLevelFolder: String, - lowerCoalescenceLevelFolder: String, - finalCoalescenceLevel: Int, - sparkContext: SparkContext, - compressionCodec: Class[_ <: CompressionCodec] - ): Unit = - decreaseCoalescenceInternal( - highCoalescenceLevelFolder, - lowerCoalescenceLevelFolder, - finalCoalescenceLevel, - sparkContext, - Some(compressionCodec)) - - /** Saves as text file, but by decreasing the nbr of partitions of the output. - * - * Same as decreaseCoalescence, but the storage of the RDD in an intermediate - * folder is included. - * - * This still makes the processing parallelized, but the output is coalesced. - * - * {{{ - * SparkHelper.saveAsTextFileAndCoalesce( - * myRddToStore, "/produced/folder/path/with/only/300/files", 300) - * }}} - * - * @param outputRDD the RDD to store, processed for instance on 10000 tasks - * (which would thus be stored as 10000 files). - * @param outputFolder the folder where will finally be stored the RDD but - * spread on only 300 files (where 300 is the value of the - * finalCoalescenceLevel parameter). - * @param finalCoalescenceLevel the nbr of files within the folder at the end - * of this method. - */ - def saveAsTextFileAndCoalesce( - outputRDD: RDD[String], - outputFolder: String, - finalCoalescenceLevel: Int - ): Unit = { - - val sparkContext = outputRDD.context - - // We remove folders where to store data in case they already exist: - HdfsHelper.deleteFolder(outputFolder + "_tmp") - HdfsHelper.deleteFolder(outputFolder) - - // We first save the rdd with the level of coalescence used during the - // processing. This way the processing is done with the right level of - // tasks: - outputRDD.saveAsTextFile(outputFolder + "_tmp") - - // Then we read back this tmp folder, apply the coalesce and store it back: - decreaseCoalescenceInternal( - outputFolder + "_tmp", - outputFolder, - finalCoalescenceLevel, - sparkContext, - None) + implicit class SeqRDDExtensions[T: ClassTag](val rdd: RDD[Seq[T]]) { + + /** Flattens an RDD of Seq[T] + * to RDD[T]. + * + * {{{ sc.parallelize(Array(Seq(1, 2, 3), Nil, Seq(4))).flatten == sc.parallelize(Array(Seq(1, 2, 3, 4))) }}} + * + * @return the flat RDD as RDD.flatMap(identity) + * or List.flatten + * would have. + */ + def flatten: RDD[T] = rdd.flatMap(identity) } - /** Saves as text file, but by decreasing the nbr of partitions of the output. - * - * Same as decreaseCoalescence, but the storage of the RDD in an intermediate - * folder is included. - * - * This still makes the processing parallelized, but the output is coalesced. 
- * - * {{{ - * SparkHelper.saveAsTextFileAndCoalesce( - * myRddToStore, - * "/produced/folder/path/with/only/300/files", - * 300, - * classOf[BZip2Codec]) - * }}} - * - * @param outputRDD the RDD to store, processed for instance on 10000 tasks - * (which would thus be stored as 10000 files). - * @param outputFolder the folder where will finally be stored the RDD but - * spread on only 300 files (where 300 is the value of the - * finalCoalescenceLevel parameter). - * @param finalCoalescenceLevel the nbr of files within the folder at the end - * of this method. - * @param compressionCodec the type of compression to use (for instance - * classOf[BZip2Codec] or classOf[GzipCodec])) - */ - def saveAsTextFileAndCoalesce( - outputRDD: RDD[String], - outputFolder: String, - finalCoalescenceLevel: Int, - compressionCodec: Class[_ <: CompressionCodec] - ): Unit = { + implicit class OptionRDDExtensions[T: ClassTag](val rdd: RDD[Option[T]]) { + + /** Flattens an RDD of Option[T] + * to RDD[T]. + * + * {{{ sc.parallelize(Array(Some(1), None, Some(2))).flatten == sc.parallelize(Array(Seq(1, 2))) }}} + * + * @return the flat RDD as RDD.flatMap(x => x) + * or List.flatten + * would have. + */ + def flatten: RDD[T] = rdd.flatMap(o => o) + } - val sparkContext = outputRDD.context + implicit class PairRDDExtensions(val rdd: RDD[(String, String)]) + extends AnyVal { + + /** Saves and repartitions a key/value RDD on files whose name is the key. + * + * Within the provided path, there will be one file per key in the given + * keyValueRDD. And within a file for a given key are only stored values + * for this key. + * + * As this internally needs to know the nbr of keys, this will have to + * compute it. If this nbr of keys is known beforehand, it would spare + * resources to use saveAsTextFileByKey(path: String, keyNbr: Int) + * instead. + * + * This is not scalable. This shouldn't be considered for any data flow + * with normal or big volumes. + * + * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path") }}} + * + * @param path the folder where will be stored key files + */ + def saveAsTextFileByKey(path: String): Unit = + SparkHelper.saveAsTextFileByKeyInternal(rdd, path, None, None) + + /** Saves and repartitions a key/value RDD on files whose name is the key. + * + * Within the provided path, there will be one file per key in the given + * keyValueRDD. And within a file for a given key are only stored values + * for this key. + * + * This is not scalable. This shouldn't be considered for any data flow + * with normal or big volumes. + * + * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", 12) }}} + * + * @param path the folder where will be stored key files + * @param keyNbr the nbr of expected keys (which is the nbr of output + * files) + */ + def saveAsTextFileByKey(path: String, keyNbr: Int): Unit = + SparkHelper.saveAsTextFileByKeyInternal(rdd, path, Some(keyNbr), None) + + /** Saves and repartitions a key/value RDD on files whose name is the key. + * + * Within the provided path, there will be one file per key in the given + * keyValueRDD. And within a file for a given key are only stored values + * for this key. + * + * As this internally needs to know the nbr of keys, this will have to + * compute it. If this nbr of keys is known beforehand, it would spare + * resources to use + * saveAsTextFileByKey(path: String, keyNbr: Int, codec: Class[_ <: CompressionCodec]) + * instead. + * + * This is not scalable. This shouldn't be considered for any data flow + * with normal or big volumes. 
+ * + * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", classOf[BZip2Codec]) }}} + * + * @param path the folder where will be stored key files + * @param codec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def saveAsTextFileByKey( + path: String, + codec: Class[_ <: CompressionCodec] + ): Unit = + SparkHelper.saveAsTextFileByKeyInternal(rdd, path, None, Some(codec)) + + /** Saves and repartitions a key/value RDD on files whose name is the key. + * + * Within the provided path, there will be one file per key in the given + * keyValueRDD. And within a file for a given key are only stored values + * for this key. + * + * This is not scalable. This shouldn't be considered for any data flow + * with normal or big volumes. + * + * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", 12, classOf[BZip2Codec]) }}} + * + * @param path the folder where will be stored key files + * @param keyNbr the nbr of expected keys (which is the nbr of output + * files) + * @param codec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def saveAsTextFileByKey( + path: String, + keyNbr: Int, + codec: Class[_ <: CompressionCodec] + ): Unit = + SparkHelper + .saveAsTextFileByKeyInternal(rdd, path, Some(keyNbr), Some(codec)) + } - // We remove folders where to store data in case they already exist: - HdfsHelper.deleteFolder(outputFolder + "_tmp") - HdfsHelper.deleteFolder(outputFolder) + implicit class SparkContextExtensions(val sc: SparkContext) extends AnyVal { + + /** Equivalent to sparkContext.textFile() + * , but for a specific record delimiter. + * + * By default, sparkContext.textFile() + * will provide one record per line (per '\n'). + * But what if the format to read considers that one record is stored in + * more than one line (yml, custom format, ...)? + * + * For instance in order to read a yml file, which is a format for which a + * record (a single entity) is spread other several lines, you can modify + * the record delimiter with "---\n" + * instead of "\n". + * Same goes when reading an xml file where a record might be spread over + * several lines or worse the whole xml file is one line. + * + * {{{ + * // Let's say data we want to use with Spark looks like this (one record + * // is a customer, but it's spread over several lines): + * \n + * \n + *
        34 thingy street, someplace, sometown
        \n + *
        \n + * \n + *
        12 thingy street, someplace, sometown
        \n + *
        \n + *
        + * //Then you can use it this way: + * val computedRecords = sc.textFile("my/path/to/customers.xml", "\n") + * val expectedRecords = RDD( + * \n, + * ( + *
        34 thingy street, someplace, sometown
        \n + + *
        \n + * ), + * ( + *
        12 thingy street, someplace, sometown
        \n + + * \n + + * + * ) + * ) + * assert(computedRecords == expectedRecords) + * }}} + * + * @param path the path of the file to read (folder or file, '*' works + * as well). + * @param delimiter the specific record delimiter which replaces "\n" + * @param maxRecordLength the max length (not sure which unit) of a record + * before considering the record too long to fit into memory. + * @return the RDD of records + */ + def textFile( + path: String, + delimiter: String, + maxRecordLength: String = "1000000" + ): RDD[String] = { + + val conf = new Configuration(sc.hadoopConfiguration) + + // This configuration sets the record delimiter: + conf.set("textinputformat.record.delimiter", delimiter) + + // and this one limits the size of one record. This is necessary in order + // to avoid reading from a corrupted file from which a record could be too + // long to fit in memory. This way, when reading a corrupted file, this + // will throw an exception (java.io.IOException - thus catchable) rather + // than having a messy out of memory which will stop the sparkContext: + conf + .set("mapreduce.input.linerecordreader.line.maxlength", maxRecordLength) + + sc.newAPIHadoopFile( + path, + classOf[TextInputFormat], + classOf[LongWritable], + classOf[Text], + conf + ) + .map { case (_, text) => text.toString } + } - // We first save the rdd with the level of coalescence used during the - // processing. This way the processing is done with the right level of - // tasks: - outputRDD.saveAsTextFile(outputFolder + "_tmp") + /** Equivalent to sparkContext.textFile() + * , but each record is associated with the file path it comes from. + * + * Produces an RDD[(file_name, line)] + * which provides a way to know from which file a given line comes from. + * + * {{{ + * // Considering this folder: + * // folder/file_1.txt whose content is data1\ndata2\ndata3 + * // folder/file_2.txt whose content is data4\ndata4 + * // folder/folder_1/file_3.txt whose content is data6\ndata7 + * // then: + * sc.textFileWithFileName("folder") + * // will return: + * RDD( + * ("file:/path/on/machine/folder/file_1.txt", "data1"), + * ("file:/path/on/machine/folder/file_1.txt", "data2"), + * ("file:/path/on/machine/folder/file_1.txt", "data3"), + * ("file:/path/on/machine/folder/file_2.txt", "data4"), + * ("file:/path/on/machine/folder/file_2.txt", "data5"), + * ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"), + * ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7") + * ) + * }}} + * + * @param path the path of the folder (or structure of folders) to read + * @return the RDD of records where a record is a tuple containing the path + * of the file the record comes from and the record itself. 
+ */ + def textFileWithFileName(path: String): RDD[(String, String)] = { + + // In order to go through the folder structure recursively: + sc.hadoopConfiguration + .set("mapreduce.input.fileinputformat.input.dir.recursive", "true") + + sc.hadoopFile( + path, + classOf[TextInputFormat2], + classOf[LongWritable], + classOf[Text], + sc.defaultMinPartitions + ) + .asInstanceOf[HadoopRDD[LongWritable, Text]] + .mapPartitionsWithInputSplit { + case (inputSplit, iterator) => + val file = inputSplit.asInstanceOf[FileSplit] + iterator.map(tpl => (file.getPath.toString, tpl._2.toString)) + } + + // An other way of doing would be: + // + // import org.apache.spark.sql.functions.input_file_name + // import spark.implicits._ + // + // spark.read + // .text(testFolder) + // .select(input_file_name, $"value") + // .as[(String, String)] + // .rdd + } - // Then we read back this tmp folder, apply the coalesce and store it back: - decreaseCoalescenceInternal( - outputFolder + "_tmp", - outputFolder, - finalCoalescenceLevel, - sparkContext, - Some(compressionCodec)) - } + /** A replacement for sc.textFile() + * when files contains commas in their name. + * + * As sc.textFile() + * allows to provide several files at once by giving them as a string which + * is a list of strings joined with ,, + * we can't give it files containing commas in their name. + * + * This method aims at bypassing this limitation by passing paths as a + * sequence of strings. + * + * {{{ sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) }}} + * + * @param paths the paths of the file(s)/folder(s) to read + */ + def textFile(paths: Seq[String]): RDD[String] = + TextFileOverwrite.textFile(paths, sc.defaultMinPartitions, sc) + + /** A replacement for sc.textFile() + * when files contains commas in their name. + * + * As sc.textFile() + * allows to provide several files at once by giving them as a string which + * is a list of strings joined with ,, + * we can't give it files containing commas in their name. + * + * This method aims at bypassing this limitation by passing paths as a + * sequence of strings. + * + * {{{ sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) }}} + * + * @param paths the paths of the file(s)/folder(s) to read + * @param minPartitions the nbr of partitions in which to split the input + */ + def textFile(paths: Seq[String], minPartitions: Int): RDD[String] = + TextFileOverwrite.textFile(paths, minPartitions, sc) + + /** Decreases the nbr of partitions of a folder. + * + * This comes in handy when the last step of your job needs to run on + * thousands of files, but you want to store your final output on let's say + * only 30 files. + * + * It's like a FileUtil.copyMerge() + * , but the merging produces more than one file. + * + * Be aware that this methods deletes the provided input folder. + * + * {{{ + * sc.decreaseCoalescence( + * "/folder/path/with/2000/files", + * "/produced/folder/path/with/only/30/files", + * 30 + * ) + * }}} + * + * @param highCoalescenceLevelFolder the folder which contains 10000 files + * @param lowerCoalescenceLevelFolder the folder which will contain the same + * data as highCoalescenceLevelFolder but spread on only 30 files (where 30 + * is the finalCoalesceLevel parameter). + * @param finalCoalesceLevel the nbr of files within the folder at the end + * of this method. 
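A hedged sketch of how the three readers above are meant to be called; the paths and file layouts are illustrative and `sc` is assumed to be an existing SparkContext (spark-shell style):

```scala
import com.spark_helper.SparkHelper._

// 1) Custom record delimiter: one record per "---\n"-separated block
//    (e.g. a yml-like file) instead of one record per line:
val docs = sc.textFile("hdfs:///tmp/docs.yml", "---\n")

// 2) Each record keeps the path of the file it comes from:
val linesWithOrigin = sc.textFileWithFileName("hdfs:///tmp/input_folder")
val csvLines =
  linesWithOrigin.filter { case (file, _) => file.endsWith(".csv") }.values

// 3) Paths passed as a Seq, so a comma in a file name is not a separator:
val lines =
  sc.textFile(Seq("hdfs:///tmp/hello,world.txt", "hdfs:///tmp/hello_world.txt"))
```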
+     */
+    def decreaseCoalescence(
+        highCoalescenceLevelFolder: String,
+        lowerCoalescenceLevelFolder: String,
+        finalCoalesceLevel: Int
+    ): Unit =
+      SparkHelper.decreaseCoalescenceInternal(
+        highCoalescenceLevelFolder,
+        lowerCoalescenceLevelFolder,
+        finalCoalesceLevel,
+        sc,
+        None
+      )
-  /** Equivalent to sparkContext.textFile(), but for each line is associated
-    * with its file path.
-    *
-    * Produces a RDD[(file_name, line)] which provides a way to know from which
-    * file a given line comes from.
-    *
-    * {{{
-    * // Considering this folder:
-    * // folder/file_1.txt whose content is data1\ndata2\ndata3
-    * // folder/file_2.txt whose content is data4\ndata4
-    * // folder/folder_1/file_3.txt whose content is data6\ndata7
-    * // then:
-    * SparkHelper.textFileWithFileName("folder", sparkContext)
-    * // will return:
-    * RDD(
-    *   ("file:/path/on/machine/folder/file_1.txt", "data1"),
-    *   ("file:/path/on/machine/folder/file_1.txt", "data2"),
-    *   ("file:/path/on/machine/folder/file_1.txt", "data3"),
-    *   ("file:/path/on/machine/folder/file_2.txt", "data4"),
-    *   ("file:/path/on/machine/folder/file_2.txt", "data5"),
-    *   ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"),
-    *   ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7")
-    * )
-    * }}}
-    *
-    * @param hdfsPath the path of the folder (or structure of folders) to read
-    * @param sparkContext the SparkContext
-    * @return the RDD of records where a record is a tuple containing the path
-    * of the file the record comes from and the record itself.
-    */
-  def textFileWithFileName(
-      hdfsPath: String,
-      sparkContext: SparkContext
-  ): RDD[(String, String)] = {
-
-    // In order to go through the folder structure recursively:
-    sparkContext.hadoopConfiguration
-      .set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
-
-    sparkContext
-      .hadoopFile(
-        hdfsPath,
-        classOf[TextInputFormat2],
-        classOf[LongWritable],
-        classOf[Text],
-        sparkContext.defaultMinPartitions
+    /** Decreases the nbr of partitions of a folder.
+      *
+      * This comes in handy when the last step of your job needs to run on
+      * thousands of files, but you want to store your final output on let's say
+      * only 30 files.
+      *
+      * It's like a FileUtil.copyMerge(), but the merging produces more than
+      * one file.
+      *
+      * Be aware that this method deletes the provided input folder.
+      *
+      * {{{
+      * sc.decreaseCoalescence(
+      *   "/folder/path/with/2000/files",
+      *   "/produced/folder/path/with/only/30/files",
+      *   30,
+      *   classOf[BZip2Codec]
+      * )
+      * }}}
+      *
+      * @param highCoalescenceLevelFolder the folder which contains 10000 files
+      * @param lowerCoalescenceLevelFolder the folder which will contain the same
+      * data as highCoalescenceLevelFolder but spread on only 30 files (where 30
+      * is the finalCoalesceLevel parameter).
+      * @param finalCoalesceLevel the nbr of files within the folder at the end
+      * of this method.
+ * @param codec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def decreaseCoalescence( + highCoalescenceLevelFolder: String, + lowerCoalescenceLevelFolder: String, + finalCoalesceLevel: Int, + codec: Class[_ <: CompressionCodec] + ): Unit = + SparkHelper.decreaseCoalescenceInternal( + highCoalescenceLevelFolder, + lowerCoalescenceLevelFolder, + finalCoalesceLevel, + sc, + Some(codec) ) - .asInstanceOf[HadoopRDD[LongWritable, Text]] - .mapPartitionsWithInputSplit { - case (inputSplit, iterator) => - val file = inputSplit.asInstanceOf[FileSplit] - iterator.map(tpl => (file.getPath.toString, tpl._2.toString)) - } } // Internal core: private def saveAsSingleTextFileWithWorkingFolderInternal( outputRDD: RDD[String], - outputFile: String, + path: String, workingFolder: String, - compressionCodec: Option[Class[_ <: CompressionCodec]] + codec: Option[Class[_ <: CompressionCodec]] ): Unit = { // We chose a random name for the temporary file: val temporaryName = Random.alphanumeric.take(10).mkString("") - val temporaryFile = workingFolder + "/" + temporaryName + val temporaryFile = s"$workingFolder/$temporaryName" // We perform the merge into a temporary single text file: - saveAsSingleTextFileInternal(outputRDD, temporaryFile, compressionCodec) + saveAsSingleTextFileInternal(outputRDD, temporaryFile, codec) // And then only we put the resulting file in its final real location: - HdfsHelper.moveFile(temporaryFile, outputFile, overwrite = true) + HdfsHelper.moveFile(temporaryFile, path, overwrite = true) } /** Saves RDD in exactly one file. * * Allows one to save an RDD as one text file, but at the same time to keep - * the processing parallelized. + * the processing distributed. * * @param outputRDD the RDD of strings to save as text file - * @param outputFile the path where to save the file - * @param compression the compression codec to use (can be left to None) + * @param path the path where to save the file + * @param codec the compression codec to use (can be left to None) */ private def saveAsSingleTextFileInternal( outputRDD: RDD[String], - outputFile: String, - compressionCodec: Option[Class[_ <: CompressionCodec]] + path: String, + codec: Option[Class[_ <: CompressionCodec]] ): Unit = { - val fileSystem = FileSystem.get(new Configuration()) + val hadoopConfiguration = outputRDD.sparkContext.hadoopConfiguration + val fileSystem = FileSystem.get(hadoopConfiguration) // Classic saveAsTextFile in a temporary folder: - HdfsHelper.deleteFolder(outputFile + ".tmp") - compressionCodec match { - case Some(compressionCodec) => - outputRDD.saveAsTextFile(outputFile + ".tmp", compressionCodec) + HdfsHelper.deleteFolder(s"$path.tmp") + codec match { + case Some(compression) => + outputRDD.saveAsTextFile(s"$path.tmp", compression) case None => - outputRDD.saveAsTextFile(outputFile + ".tmp") + outputRDD.saveAsTextFile(s"$path.tmp") } // Merge the folder into a single file: - HdfsHelper.deleteFile(outputFile) + HdfsHelper.deleteFile(path) FileUtil.copyMerge( fileSystem, - new Path(outputFile + ".tmp"), + new Path(s"$path.tmp"), fileSystem, - new Path(outputFile), + new Path(path), true, - new Configuration(), + hadoopConfiguration, null) - HdfsHelper.deleteFolder(outputFile + ".tmp") + HdfsHelper.deleteFolder(s"$path.tmp") + } + + private def saveAsTextFileByKeyInternal( + rdd: RDD[(String, String)], + path: String, + optKeyNbr: Option[Int], + codec: Option[Class[_ <: CompressionCodec]] + ): Unit = { + + HdfsHelper.deleteFolder(path) + + // Whether the 
rdd was already cached or not (used to unpersist it if we + // have to get the nbr of keys): + val isCached = rdd.getStorageLevel.useMemory + + // If the nbr of keys isn't provided, we have to get it ourselves: + val keyNbr = optKeyNbr.getOrElse { + if (!isCached) + rdd.cache() + rdd.keys.distinct.count.toInt + } + + val prdd = rdd.partitionBy(new HashPartitioner(keyNbr)) + + codec match { + case Some(compression) => + prdd.saveAsHadoopFile( + path, + classOf[String], + classOf[String], + classOf[KeyBasedOutput], + compression + ) + case None => + prdd.saveAsHadoopFile( + path, + classOf[String], + classOf[String], + classOf[KeyBasedOutput] + ) + } + + if (optKeyNbr.isEmpty && !isCached) + rdd.unpersist() } private def decreaseCoalescenceInternal( highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, - finalCoalescenceLevel: Int, - sparkContext: SparkContext, - compressionCodec: Option[Class[_ <: CompressionCodec]] + finalCoalesceLevel: Int, + sc: SparkContext, + codec: Option[Class[_ <: CompressionCodec]] ): Unit = { - val intermediateRDD = sparkContext + val intermediateRDD = sc .textFile(highCoalescenceLevelFolder) - .coalesce(finalCoalescenceLevel) + .coalesce(finalCoalesceLevel) - compressionCodec match { - case Some(compressionCodec) => - intermediateRDD - .saveAsTextFile(lowerCoalescenceLevelFolder, compressionCodec) + codec match { + case Some(compression) => + intermediateRDD.saveAsTextFile(lowerCoalescenceLevelFolder, compression) case None => intermediateRDD.saveAsTextFile(lowerCoalescenceLevelFolder) } diff --git a/src/main/scala/com/spark_helper/monitoring/Test.scala b/src/main/scala/com/spark_helper/monitoring/Test.scala index 97942e2..80b3ad7 100644 --- a/src/main/scala/com/spark_helper/monitoring/Test.scala +++ b/src/main/scala/com/spark_helper/monitoring/Test.scala @@ -7,7 +7,7 @@ import java.lang.Math.abs * This is intended to be used as parameter of Monitor.updateByKpiValidation * and Monitor.updateByKpisValidation methods. * - * Some exemples of Test objects: + * Some examples of Test objects: * {{{ * Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT) * Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT) @@ -19,7 +19,7 @@ import java.lang.Math.abs * * @constructor Creates a Test object. 
* - * Some exemples of Test objects: + * Some examples of Test objects: * {{{ * Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT) * Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT) @@ -42,22 +42,22 @@ final case class Test( kpiType: KpiType ) { - private[spark_helper] def isSuccess(): Boolean = thresholdType match { + private[spark_helper] def isSuccess: Boolean = thresholdType match { case EQUAL_TO => kpiValue == appliedThreshold case SUPERIOR_THAN => abs(kpiValue) >= appliedThreshold case INFERIOR_THAN => abs(kpiValue) <= appliedThreshold } - override def toString(): String = + override def toString: String = List( "\tKPI: " + description, "\t\tValue: " + kpiValue.toString + kpiType.name, "\t\tMust be " + thresholdType.name + " " + appliedThreshold.toString + kpiType.name, - "\t\tValidated: " + isSuccess().toString + "\t\tValidated: " + isSuccess.toString ).mkString("\n") } -/** An enumeration which represents the type of threshol to use (EQUAL_TO, +/** An enumeration which represents the type of threshold to use (EQUAL_TO, * SUPERIOR_THAN or INFERIOR_THAN) */ sealed trait ThresholdType { def name: String } diff --git a/src/main/scala/org/apache/spark/TextFileOverwrite.scala b/src/main/scala/org/apache/spark/TextFileOverwrite.scala new file mode 100644 index 0000000..28935ea --- /dev/null +++ b/src/main/scala/org/apache/spark/TextFileOverwrite.scala @@ -0,0 +1,54 @@ +package org.apache.spark + +import org.apache.spark.rdd.{RDD, HadoopRDD} +import org.apache.spark.util.SerializableConfiguration +import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat} +import org.apache.hadoop.io.{LongWritable, Text} +import org.apache.hadoop.fs.Path + +object TextFileOverwrite { + + def textFile( + paths: Seq[String], + minPartitions: Int, + sc: SparkContext + ): RDD[String] = { + + /* Private notes: + * + * * Compared to sc.textFile(), the only difference in the implementation is + * the call to FileInputFormat.setInputPaths which takes Paths in input + * instead of a comma-separated String. + * + * * I use the package org.apache.spark to store this function, because + * SerializableConfiguration has the visibility private[spark] in spark's + * code base. + * + * * I would have preferred giving Seq[Path] instead of Seq[String] as an + * input of this method, but Path is not yet Serializable in the current + * version of hadoop-common used by Spark (it will become Serializable + * starting version 3 of hadoop-common). + * + * * I don't use String* (instead of Seq[String]) as for 1 String only it + * would confuse the compiler as to which sc.textFile to use (the default + * one or this one). + */ + + val confBroadcast = + sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration)) + + val setInputPathsFunc = + (jobConf: JobConf) => + FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*) + + new HadoopRDD( + sc, + confBroadcast, + Some(setInputPathsFunc), + classOf[TextInputFormat], + classOf[LongWritable], + classOf[Text], + minPartitions + ).map(pair => pair._2.toString) + } +} diff --git a/src/test/scala/com/spark_helper/DateHelperTest.scala b/src/test/scala/com/spark_helper/DateHelperTest.scala index 7154831..5c68404 100644 --- a/src/test/scala/com/spark_helper/DateHelperTest.scala +++ b/src/test/scala/com/spark_helper/DateHelperTest.scala @@ -1,7 +1,11 @@ package com.spark_helper +import com.spark_helper.DateHelper._ + import org.scalatest.FunSuite +import com.spark_helper.{DateHelper => DH} + /** Testing facility for date helpers. 
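A hedged sketch of calling the low-level `TextFileOverwrite.textFile` added above directly; in practice the `sc.textFile(Seq(...))` pimp wraps it, `sc` is assumed to be an existing SparkContext, and the paths are illustrative:

```scala
import org.apache.spark.TextFileOverwrite
import org.apache.spark.rdd.RDD

// Reads both files into one RDD; since paths are given as a Seq (not a
// comma-joined string), the comma in the first file name is preserved:
val lines: RDD[String] =
  TextFileOverwrite.textFile(
    Seq("hdfs:///tmp/hello,world.txt", "hdfs:///tmp/plain.txt"),
    sc.defaultMinPartitions,
    sc
  )
```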
* * @author Xavier Guihot @@ -23,7 +27,11 @@ class DateHelperTest extends FunSuite { ) assert(dates === expectedDates) - // 2: With a custom formatter: + // 2: Same as 1, but using the pimped String: + dates = "20161229" to "20170103" + assert(dates === expectedDates) + + // 3: With a custom formatter: dates = DateHelper.daysBetween("29Dec16", "03Jan17", "ddMMMyy") expectedDates = List( "29Dec16", @@ -37,25 +45,20 @@ class DateHelperTest extends FunSuite { } test("Reformat date") { - assert( - DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") === "170327") - assert( - DateHelper.reformatDate("20170327", "yyyyMMdd", "MMddyy") === "032717") + assert(DH.reformatDate("20170327", "yyyyMMdd", "yyMMdd") === "170327") + assert(DH.reformatDate("20170327", "yyyyMMdd", "MMddyy") === "032717") } test("Next day") { - assert(DateHelper.nextDay("20170310") === "20170311") - assert(DateHelper.nextDay("170310", "yyMMdd") === "170311") - assert( - DateHelper.nextDay("20170310_0000", "yyyyMMdd_HHmm") === "20170311_0000") + assert(DH.nextDay("20170310") === "20170311") + assert(DH.nextDay("170310", "yyMMdd") === "170311") + assert(DH.nextDay("20170310_0000", "yyyyMMdd_HHmm") === "20170311_0000") } test("Previous day") { - assert(DateHelper.previousDay("20170310") === "20170309") - assert(DateHelper.previousDay("170310", "yyMMdd") === "170309") - assert( - DateHelper - .previousDay("20170310_0000", "yyyyMMdd_HHmm") === "20170309_0000") + assert(DH.previousDay("20170310") === "20170309") + assert(DH.previousDay("170310", "yyMMdd") === "170309") + assert(DH.previousDay("20170310_0000", "yyyyMMdd_HHmm") === "20170309_0000") } test("Nbr of days between two dates") { @@ -78,7 +81,7 @@ class DateHelperTest extends FunSuite { assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") === "170305") } - test("Date it will be N days affter date") { + test("Date it will be N days after date") { assert(DateHelper.nDaysAfterDate(3, "20170307") === "20170310") assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") === "170310") } @@ -88,6 +91,7 @@ class DateHelperTest extends FunSuite { } test("Date versus provided format") { + assert(DateHelper.isDateCompliantWithFormat("20170302", "yyyyMMdd")) assert(!DateHelper.isDateCompliantWithFormat("20170333", "yyyyMMdd")) assert(DateHelper.isDateCompliantWithFormat("20170228", "yyyyMMdd")) @@ -96,5 +100,14 @@ class DateHelperTest extends FunSuite { assert(!DateHelper.isDateCompliantWithFormat("", "yyyyMMdd")) assert(!DateHelper.isDateCompliantWithFormat("a", "yyyyMMdd")) assert(!DateHelper.isDateCompliantWithFormat("24JAN17", "yyyyMMdd")) + + assert("20170302".isCompliantWith("yyyyMMdd")) + assert(!"20170333".isCompliantWith("yyyyMMdd")) + assert("20170228".isCompliantWith("yyyyMMdd")) + assert(!"20170229".isCompliantWith("yyyyMMdd")) + assert(!"170228".isCompliantWith("yyyyMMdd")) + assert(!"".isCompliantWith("yyyyMMdd")) + assert(!"a".isCompliantWith("yyyyMMdd")) + assert(!"24JAN17".isCompliantWith("yyyyMMdd")) } } diff --git a/src/test/scala/com/spark_helper/HdfsHelperTest.scala b/src/test/scala/com/spark_helper/HdfsHelperTest.scala index 1935e89..b64306d 100644 --- a/src/test/scala/com/spark_helper/HdfsHelperTest.scala +++ b/src/test/scala/com/spark_helper/HdfsHelperTest.scala @@ -1,5 +1,9 @@ package com.spark_helper +import com.spark_helper.HdfsHelper._ + +import org.apache.hadoop.io.compress.GzipCodec + import com.holdenkarau.spark.testing.SharedSparkContext import org.scalatest.FunSuite @@ -11,60 +15,67 @@ import org.scalatest.FunSuite */ class HdfsHelperTest extends 
FunSuite with SharedSparkContext { + val resourceFolder = "src/test/resources" + val testFolder = s"$resourceFolder/folder" + test("Delete file/folder") { + val filePath = s"$testFolder/file.txt" + // Let's try to delete a file: - HdfsHelper.writeToHdfsFile("", "src/test/resources/file_to_delete.txt") + HdfsHelper.createEmptyHdfsFile(filePath) // 1: Let's try to delete it with the deleteFolder method: var messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.deleteFolder("src/test/resources/file_to_delete.txt") + HdfsHelper.deleteFolder(filePath) } var expectedMessage = "requirement failed: to delete a file, prefer using the " + "deleteFile() method." assert(messageThrown.getMessage === expectedMessage) - assert(HdfsHelper.fileExists("src/test/resources/file_to_delete.txt")) + assert(HdfsHelper.fileExists(filePath)) // 2: Let's delete it with the deleteFile method: - HdfsHelper.deleteFile("src/test/resources/file_to_delete.txt") - assert(!HdfsHelper.fileExists("src/test/resources/file_to_delete.txt")) + HdfsHelper.deleteFile(filePath) + assert(!HdfsHelper.fileExists(filePath)) // Let's try to delete a folder: - HdfsHelper - .writeToHdfsFile("", "src/test/resources/folder_to_delete/file.txt") + HdfsHelper.createEmptyHdfsFile(s"$testFolder/file.txt") // 3: Let's try to delete it with the deleteFile method: messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.deleteFile("src/test/resources/folder_to_delete") + HdfsHelper.deleteFile(testFolder) } expectedMessage = "requirement failed: to delete a folder, prefer using the " + "deleteFolder() method." assert(messageThrown.getMessage === expectedMessage) - assert(HdfsHelper.folderExists("src/test/resources/folder_to_delete")) + assert(HdfsHelper.folderExists(testFolder)) // 4: Let's delete it with the deleteFolder method: - HdfsHelper.deleteFolder("src/test/resources/folder_to_delete") - assert(!HdfsHelper.folderExists("src/test/resources/folder_to_delete")) + HdfsHelper.deleteFolder(testFolder) + assert(!HdfsHelper.folderExists(testFolder)) } test("File/folder exists") { - HdfsHelper.deleteFile("src/test/resources/file_to_check.txt") - HdfsHelper.deleteFolder("src/test/resources/folder_to_check") + val folderPath = s"$resourceFolder/folder" + val filePath = s"$folderPath/file.txt" + + HdfsHelper.deleteFile(filePath) + HdfsHelper.deleteFolder(folderPath) // Let's try to check if a file exists: - assert(!HdfsHelper.fileExists("src/test/resources/file_to_check.txt")) + assert(!HdfsHelper.fileExists(filePath)) - HdfsHelper.writeToHdfsFile("", "src/test/resources/file_to_check.txt") + HdfsHelper.createEmptyHdfsFile(filePath) // 1: Let's try to check it exists with the folderExists method: var messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.folderExists("src/test/resources/file_to_check.txt") + HdfsHelper.folderExists(filePath) } var expectedMessage = "requirement failed: to check if a file exists, prefer using the " + @@ -72,18 +83,18 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { assert(messageThrown.getMessage === expectedMessage) // 2: Let's try to check it exists with the fileExists method: - assert(HdfsHelper.fileExists("src/test/resources/file_to_check.txt")) + assert(HdfsHelper.fileExists(filePath)) // Let's try to check if a folder exists: - assert(!HdfsHelper.folderExists("src/test/resources/folder_to_check")) + HdfsHelper.deleteFolder(folderPath) + assert(!HdfsHelper.folderExists(folderPath)) - HdfsHelper - .writeToHdfsFile("", "src/test/resources/folder_to_check/file.txt") + 
HdfsHelper.createEmptyHdfsFile(filePath) // 3: Let's try to check it exists with the fileExists method: messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.fileExists("src/test/resources/folder_to_check") + HdfsHelper.fileExists(folderPath) } expectedMessage = "requirement failed: to check if a folder exists, prefer using " + @@ -91,377 +102,325 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { assert(messageThrown.getMessage === expectedMessage) // 2: Let's try to check it exists with the folderExists method: - assert(HdfsHelper.folderExists("src/test/resources/folder_to_check")) + assert(HdfsHelper.folderExists(folderPath)) - HdfsHelper.deleteFile("src/test/resources/file_to_check.txt") - HdfsHelper.deleteFolder("src/test/resources/folder_to_check") + HdfsHelper.deleteFile(filePath) + HdfsHelper.deleteFolder(folderPath) } test("Create an empty file on hdfs") { - HdfsHelper.deleteFile("src/test/resources/empty_file.token") + val filePath = s"$testFolder/empty_file.token" - HdfsHelper.createEmptyHdfsFile("src/test/resources/empty_file.token") + HdfsHelper.deleteFile(filePath) - assert(HdfsHelper.fileExists("src/test/resources/empty_file.token")) + HdfsHelper.createEmptyHdfsFile(filePath) - val tokenContent = sc - .textFile("src/test/resources/empty_file.token") - .collect() - .sorted - .mkString("\n") + assert(HdfsHelper.fileExists(filePath)) + val tokenContent = sc.textFile(filePath).collect().sorted.mkString("\n") assert(tokenContent === "") - HdfsHelper.deleteFile("src/test/resources/empty_file.token") + HdfsHelper.deleteFile(filePath) } test( "Save text in HDFS file with the fileSystem API instead of the Spark API") { - // 1: Stores using a "\n"-joined string: + val filePath = s"$testFolder/small_file.txt" - HdfsHelper.deleteFile("src/test/resources/folder/small_file.txt") + HdfsHelper.deleteFolder(testFolder) - val contentToStore = "Hello World\nWhatever" + // 1: Stores using a "\n"-joined string: - HdfsHelper.writeToHdfsFile( - contentToStore, - "src/test/resources/folder/small_file.txt") + val contentToStore = "Hello World\nWhatever" - assert(HdfsHelper.fileExists("src/test/resources/folder/small_file.txt")) + HdfsHelper.writeToHdfsFile(contentToStore, filePath) - var storedContent = sc - .textFile("src/test/resources/folder/small_file.txt") - .collect() - .sorted - .mkString("\n") + assert(HdfsHelper.fileExists(filePath)) + var storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") assert(storedContent === contentToStore) - HdfsHelper.deleteFolder("src/test/resources/folder") + HdfsHelper.deleteFolder(testFolder) // 2: Stores using a list of strings to be "\n"-joined: - HdfsHelper.deleteFile("src/test/resources/folder/small_file.txt") - val listToStore = List("Hello World", "Whatever") + HdfsHelper.writeToHdfsFile(listToStore, filePath) - HdfsHelper - .writeToHdfsFile(listToStore, "src/test/resources/folder/small_file.txt") + assert(HdfsHelper.fileExists(filePath)) - assert(HdfsHelper.fileExists("src/test/resources/folder/small_file.txt")) + storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") + assert(storedContent === listToStore.mkString("\n")) - storedContent = sc - .textFile("src/test/resources/folder/small_file.txt") - .collect() - .sorted - .mkString("\n") + HdfsHelper.deleteFolder(testFolder) - assert(storedContent === listToStore.mkString("\n")) + // 3: Using the pimped Seq/String: - HdfsHelper.deleteFolder("src/test/resources/folder") + val seqToStore = Seq("Hello World", "Whatever") + 
seqToStore.writeToHdfs(filePath) + assert(HdfsHelper.fileExists(filePath)) + storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") + assert(storedContent === contentToStore) + HdfsHelper.deleteFolder(testFolder) + + listToStore.writeToHdfs(filePath) + assert(HdfsHelper.fileExists(filePath)) + storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") + assert(storedContent === contentToStore) + HdfsHelper.deleteFolder(testFolder) + + contentToStore.writeToHdfs(filePath) + assert(HdfsHelper.fileExists(filePath)) + storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") + assert(storedContent === contentToStore) + HdfsHelper.deleteFolder(testFolder) } test("List file names in Hdfs folder") { - HdfsHelper.writeToHdfsFile("", "src/test/resources/folder_1/file_1.txt") - HdfsHelper.writeToHdfsFile("", "src/test/resources/folder_1/file_2.csv") - HdfsHelper - .writeToHdfsFile("", "src/test/resources/folder_1/folder_2/file_3.txt") + val folder1 = s"$resourceFolder/folder_1" + + HdfsHelper.createEmptyHdfsFile(s"$folder1/file_1.txt") + HdfsHelper.createEmptyHdfsFile(s"$folder1/file_2.csv") + HdfsHelper.createEmptyHdfsFile(s"$folder1/folder_2/file_3.txt") // 1: Not recursive, names only: - var fileNames = - HdfsHelper.listFileNamesInFolder("src/test/resources/folder_1") + var fileNames = HdfsHelper.listFileNamesInFolder(folder1) var expectedFileNames = List("file_1.txt", "file_2.csv") assert(fileNames === expectedFileNames) // 2: Not recursive, full paths: - fileNames = HdfsHelper - .listFileNamesInFolder("src/test/resources/folder_1", onlyName = false) - expectedFileNames = List( - "src/test/resources/folder_1/file_1.txt", - "src/test/resources/folder_1/file_2.csv" - ) + fileNames = HdfsHelper.listFileNamesInFolder(folder1, onlyName = false) + expectedFileNames = List(s"$folder1/file_1.txt", s"$folder1/file_2.csv") assert(fileNames === expectedFileNames) // 3: Recursive, names only: - fileNames = HdfsHelper - .listFileNamesInFolder("src/test/resources/folder_1", recursive = true) + fileNames = HdfsHelper.listFileNamesInFolder(folder1, recursive = true) expectedFileNames = List("file_1.txt", "file_2.csv", "file_3.txt") assert(fileNames === expectedFileNames) // 4: Recursive, full paths: - fileNames = HdfsHelper.listFileNamesInFolder( - "src/test/resources/folder_1", - recursive = true, - onlyName = false) + fileNames = HdfsHelper + .listFileNamesInFolder(folder1, recursive = true, onlyName = false) expectedFileNames = List( - "src/test/resources/folder_1/file_1.txt", - "src/test/resources/folder_1/file_2.csv", - "src/test/resources/folder_1/folder_2/file_3.txt" + s"$folder1/file_1.txt", + s"$folder1/file_2.csv", + s"$folder1/folder_2/file_3.txt" ) assert(fileNames === expectedFileNames) - HdfsHelper.deleteFolder("src/test/resources/folder_1") + HdfsHelper.deleteFolder(folder1) } test("List folder names in Hdfs folder") { - HdfsHelper.writeToHdfsFile("", "src/test/resources/folder_1/file_1.txt") - HdfsHelper - .writeToHdfsFile("", "src/test/resources/folder_1/folder_2/file_2.txt") - HdfsHelper - .writeToHdfsFile("", "src/test/resources/folder_1/folder_3/file_3.txt") + val folder1 = s"$resourceFolder/folder_1" - val folderNames = HdfsHelper.listFolderNamesInFolder( - "src/test/resources/folder_1" - ) + HdfsHelper.createEmptyHdfsFile(s"$folder1/file_1.txt") + HdfsHelper.createEmptyHdfsFile(s"$folder1/folder_2/file_2.txt") + HdfsHelper.createEmptyHdfsFile(s"$folder1/folder_3/file_3.txt") + + val folderNames = HdfsHelper.listFolderNamesInFolder(folder1) val 
expectedFolderNames = List("folder_2", "folder_3") assert(folderNames === expectedFolderNames) - HdfsHelper.deleteFolder("src/test/resources/folder_1") + HdfsHelper.deleteFolder(folder1) } test("Move file") { + val filePath = s"$testFolder/some_file.txt" + val renamedPath = s"$testFolder/renamed_file.txt" + // Let's remove possible previous stuff: - HdfsHelper.deleteFile("src/test/resources/some_file.txt") - HdfsHelper.deleteFile("src/test/resources/renamed_file.txt") + HdfsHelper.deleteFolder(testFolder) // Let's create the file to rename: - HdfsHelper.writeToHdfsFile("whatever", "src/test/resources/some_file.txt") + HdfsHelper.writeToHdfsFile("whatever", filePath) // 1: Let's try to move the file on a file which already exists without // the overwrite option: - assert(HdfsHelper.fileExists("src/test/resources/some_file.txt")) - assert(!HdfsHelper.fileExists("src/test/resources/renamed_file.txt")) + assert(HdfsHelper.fileExists(filePath)) + assert(!HdfsHelper.fileExists(renamedPath)) // Let's create the existing file where we want to move our file: - HdfsHelper.writeToHdfsFile("", "src/test/resources/renamed_file.txt") + HdfsHelper.createEmptyHdfsFile(renamedPath) // Let's rename the file to the path where a file already exists: val ioExceptionThrown = intercept[IllegalArgumentException] { - HdfsHelper.moveFile( - "src/test/resources/some_file.txt", - "src/test/resources/renamed_file.txt") + HdfsHelper.moveFile(filePath, renamedPath) } var expectedMessage = "requirement failed: overwrite option set to false, but a file " + - "already exists at target location src/test/resources/renamed_file.txt" + "already exists at target location " + + "src/test/resources/folder/renamed_file.txt" assert(ioExceptionThrown.getMessage === expectedMessage) - assert(HdfsHelper.fileExists("src/test/resources/some_file.txt")) - assert(HdfsHelper.fileExists("src/test/resources/renamed_file.txt")) + assert(HdfsHelper.fileExists(filePath)) + assert(HdfsHelper.fileExists(renamedPath)) - HdfsHelper.deleteFile("src/test/resources/renamed_file.txt") + HdfsHelper.deleteFile(renamedPath) // 2: Let's fail to move the file with the moveFolder() method: - assert(HdfsHelper.fileExists("src/test/resources/some_file.txt")) - assert(!HdfsHelper.fileExists("src/test/resources/renamed_file.txt")) + assert(HdfsHelper.fileExists(filePath)) + assert(!HdfsHelper.fileExists(renamedPath)) // Let's rename the file: val illegalArgExceptionThrown = intercept[IllegalArgumentException] { - HdfsHelper.moveFolder( - "src/test/resources/some_file.txt", - "src/test/resources/renamed_file.txt") + HdfsHelper.moveFolder(filePath, renamedPath) } expectedMessage = "requirement failed: to move a file, prefer using the " + "moveFile() method." 
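The two move tests around this hunk pin down the HdfsHelper contract; a hedged recap with illustrative paths:

```scala
import com.spark_helper.HdfsHelper

// moveFile refuses to overwrite an existing target unless asked to:
HdfsHelper.moveFile("old/path/file.txt", "new/path/file.txt", overwrite = true)

// Folders go through the dedicated method; calling moveFile on a folder (or
// moveFolder on a file) throws an IllegalArgumentException, as these tests check:
HdfsHelper.moveFolder("old/folder", "new/folder")
```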
assert(illegalArgExceptionThrown.getMessage === expectedMessage) - assert(HdfsHelper.fileExists("src/test/resources/some_file.txt")) - assert(!HdfsHelper.fileExists("src/test/resources/renamed_file.txt")) + assert(HdfsHelper.fileExists(filePath)) + assert(!HdfsHelper.fileExists(renamedPath)) - // 3: Let's successfuly move the file with the moveFile() method: + // 3: Let's successfully move the file with the moveFile() method: // Let's rename the file: - HdfsHelper.moveFile( - "src/test/resources/some_file.txt", - "src/test/resources/renamed_file.txt") + HdfsHelper.moveFile(filePath, renamedPath) - assert(!HdfsHelper.fileExists("src/test/resources/some_file.txt")) - assert(HdfsHelper.fileExists("src/test/resources/renamed_file.txt")) - - val newContent = sc.textFile("src/test/resources/renamed_file.txt").collect + assert(!HdfsHelper.fileExists(filePath)) + assert(HdfsHelper.fileExists(renamedPath)) + val newContent = sc.textFile(renamedPath).collect assert(Array("whatever") === newContent) - HdfsHelper.deleteFile("src/test/resources/renamed_file.txt") + HdfsHelper.deleteFolder(testFolder) } test("Move folder") { + val folderToMove = s"$testFolder/folder_to_move" + val renamedFolder = s"$testFolder/renamed_folder" + // Let's remove possible previous stuff: - HdfsHelper.deleteFolder("src/test/resources/some_folder_to_move") - HdfsHelper.deleteFolder("src/test/resources/renamed_folder") + HdfsHelper.deleteFolder(testFolder) // Let's create the folder to rename: - HdfsHelper.writeToHdfsFile( - "whatever", - "src/test/resources/some_folder_to_move/file_1.txt") - HdfsHelper.writeToHdfsFile( - "something", - "src/test/resources/some_folder_to_move/file_2.txt") + HdfsHelper.writeToHdfsFile("whatever", s"$folderToMove/file_1.txt") + HdfsHelper.writeToHdfsFile("something", s"$folderToMove/file_2.txt") // 1: Let's fail to move the folder with the moveFile() method: - assert( - HdfsHelper.fileExists( - "src/test/resources/some_folder_to_move/file_1.txt")) - assert( - HdfsHelper.fileExists( - "src/test/resources/some_folder_to_move/file_2.txt")) - assert(!HdfsHelper.folderExists("src/test/resources/renamed_folder")) + assert(HdfsHelper.fileExists(s"$folderToMove/file_1.txt")) + assert(HdfsHelper.fileExists(s"$folderToMove/file_2.txt")) + assert(!HdfsHelper.folderExists(renamedFolder)) // Let's rename the folder: val messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.moveFile( - "src/test/resources/some_folder_to_move", - "src/test/resources/renamed_folder") + HdfsHelper.moveFile(folderToMove, renamedFolder) } val expectedMessage = "requirement failed: to move a folder, prefer using the " + "moveFolder() method." 
assert(messageThrown.getMessage === expectedMessage) - assert( - HdfsHelper.fileExists( - "src/test/resources/some_folder_to_move/file_1.txt")) - assert( - HdfsHelper.fileExists( - "src/test/resources/some_folder_to_move/file_2.txt")) - assert(!HdfsHelper.folderExists("src/test/resources/renamed_folder")) + assert(HdfsHelper.fileExists(s"$folderToMove/file_1.txt")) + assert(HdfsHelper.fileExists(s"$folderToMove/file_2.txt")) + assert(!HdfsHelper.folderExists(renamedFolder)) - // 2: Let's successfuly move the folder with the moveFolder() method: + // 2: Let's successfully move the folder with the moveFolder() method: // Let's rename the folder: - HdfsHelper.moveFolder( - "src/test/resources/some_folder_to_move", - "src/test/resources/renamed_folder") + HdfsHelper.moveFolder(folderToMove, renamedFolder) - assert(!HdfsHelper.folderExists("src/test/resources/some_folder_to_move")) - assert( - HdfsHelper.fileExists("src/test/resources/renamed_folder/file_1.txt")) - assert( - HdfsHelper.fileExists("src/test/resources/renamed_folder/file_2.txt")) - - val newContent = - sc.textFile("src/test/resources/renamed_folder").collect().sorted + assert(!HdfsHelper.folderExists(folderToMove)) + assert(HdfsHelper.fileExists(s"$renamedFolder/file_1.txt")) + assert(HdfsHelper.fileExists(s"$renamedFolder/file_2.txt")) + val newContent = sc.textFile(renamedFolder).collect().sorted assert(newContent === Array("something", "whatever")) - HdfsHelper.deleteFolder("src/test/resources/renamed_folder") + HdfsHelper.deleteFolder(testFolder) } test("Append header and footer to file") { + val filePath = s"$testFolder/header_footer_file.txt" + val tmpFolder = s"$testFolder/header_footer_tmp" + // 1: Without the tmp/working folder: - HdfsHelper.deleteFile("src/test/resources/header_footer_file.txt") + HdfsHelper.deleteFolder(testFolder) // Let's create the file for which to add header and footer: - HdfsHelper.writeToHdfsFile( - "whatever\nsomething else\n", - "src/test/resources/header_footer_file.txt") + HdfsHelper.writeToHdfsFile("whatever\nsomething else\n", filePath) - HdfsHelper.appendHeaderAndFooter( - "src/test/resources/header_footer_file.txt", - "my_header", - "my_footer") + HdfsHelper.appendHeaderAndFooter(filePath, "my_header", "my_footer") - var newContent = sc - .textFile("src/test/resources/header_footer_file.txt") - .collect - .mkString("\n") + var newContent = sc.textFile(filePath).collect.mkString("\n") - var expectedNewContent = ( + var expectedNewContent = "my_header\n" + "whatever\n" + "something else\n" + "my_footer" - ) assert(newContent === expectedNewContent) - HdfsHelper.deleteFile("src/test/resources/header_footer_file.txt") + HdfsHelper.deleteFile(filePath) // 2: With the tmp/working folder: // Let's create the file for which to add header and footer: - HdfsHelper.writeToHdfsFile( - "whatever\nsomething else\n", - "src/test/resources/header_footer_file.txt") + HdfsHelper.writeToHdfsFile("whatever\nsomething else\n", filePath) - HdfsHelper.appendHeaderAndFooter( - "src/test/resources/header_footer_file.txt", - "my_header", - "my_footer", - workingFolderPath = "src/test/resources/header_footer_tmp") + HdfsHelper + .appendHeaderAndFooter(filePath, "my_header", "my_footer", tmpFolder) - assert(HdfsHelper.folderExists("src/test/resources/header_footer_tmp")) - assert( - !HdfsHelper.fileExists("src/test/resources/header_footer_tmp/xml.tmp")) + assert(HdfsHelper.folderExists(tmpFolder)) + assert(!HdfsHelper.fileExists(s"$tmpFolder/xml.tmp")) - newContent = sc - 
.textFile("src/test/resources/header_footer_file.txt") - .collect - .mkString("\n") + newContent = sc.textFile(filePath).collect.mkString("\n") - expectedNewContent = ( + expectedNewContent = "my_header\n" + "whatever\n" + "something else\n" + "my_footer" - ) assert(newContent === expectedNewContent) - HdfsHelper.deleteFile("src/test/resources/header_footer_file.txt") - HdfsHelper.deleteFolder("src/test/resources/header_footer_tmp") + HdfsHelper.deleteFolder(testFolder) } test("Validate Xml Hdfs file with Xsd") { + val xmlPath = s"$testFolder/file.xml" + // 1: Valid xml: - HdfsHelper.deleteFile("src/test/resources/xml_file.txt") + HdfsHelper.deleteFolder(testFolder) HdfsHelper.writeToHdfsFile( "\n" + " 24\n" + "
        34 thingy street, someplace, sometown
        \n" + "
        ", - "src/test/resources/xml_file.txt" + xmlPath ) - var xsdFile = getClass.getResource("/some_xml.xsd") - - var isValid = HdfsHelper - .isHdfsXmlCompliantWithXsd("src/test/resources/xml_file.txt", xsdFile) - - assert(isValid) + assert(HdfsHelper.isHdfsXmlCompliantWithXsd(xmlPath, xsdFile)) // 2: Invalid xml: - HdfsHelper.deleteFile("src/test/resources/xml_file.txt") + HdfsHelper.deleteFolder(testFolder) HdfsHelper.writeToHdfsFile( "\n" + - " trente\n" + + " thirty\n" + "
        34 thingy street, someplace, sometown
        \n" + "
        ", - "src/test/resources/xml_file.txt" + xmlPath ) - xsdFile = getClass.getResource("/some_xml.xsd") + assert(!HdfsHelper.isHdfsXmlCompliantWithXsd(xmlPath, xsdFile)) - isValid = HdfsHelper - .isHdfsXmlCompliantWithXsd("src/test/resources/xml_file.txt", xsdFile) - - assert(!isValid) - - HdfsHelper.deleteFile("src/test/resources/xml_file.txt") + HdfsHelper.deleteFolder(testFolder) } test("Load Typesafe Config from Hdfs") { @@ -484,58 +443,73 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { test("Load Xml file from Hdfs") { - HdfsHelper.deleteFile("src/test/resources/folder/xml_to_load.xml") + val xmlPath = s"$testFolder/file.xml" + + HdfsHelper.deleteFolder(testFolder) HdfsHelper.writeToHdfsFile( "\n" + " whatever\n" + "", - "src/test/resources/folder/xml_to_load.xml" + xmlPath ) - val xmlContent = HdfsHelper - .loadXmlFileFromHdfs("src/test/resources/folder/xml_to_load.xml") + val xmlContent = HdfsHelper.loadXmlFileFromHdfs(xmlPath) assert((xmlContent \ "sometag" \ "@value").text === "something") assert((xmlContent \ "sometag").text === "whatever") - HdfsHelper.deleteFolder("src/test/resources/folder/") + HdfsHelper.deleteFolder(testFolder) } test("Purge folder from too old files/folders") { - HdfsHelper.deleteFolder("src/test/resources/folder_to_purge") - HdfsHelper - .createEmptyHdfsFile("src/test/resources/folder_to_purge/file.txt") - HdfsHelper - .createEmptyHdfsFile("src/test/resources/folder_to_purge/folder/file.txt") - assert(HdfsHelper.fileExists("src/test/resources/folder_to_purge/file.txt")) - assert(HdfsHelper.folderExists("src/test/resources/folder_to_purge/folder")) - - HdfsHelper.purgeFolder("src/test/resources/folder_to_purge", 63) + val folderToPurge = s"$testFolder/folder_to_purge" - assert(HdfsHelper.fileExists("src/test/resources/folder_to_purge/file.txt")) - assert(HdfsHelper.folderExists("src/test/resources/folder_to_purge/folder")) + HdfsHelper.deleteFolder(testFolder) + HdfsHelper.createEmptyHdfsFile(s"$folderToPurge/file.txt") + HdfsHelper.createEmptyHdfsFile(s"$folderToPurge/folder/file.txt") + assert(HdfsHelper.fileExists(s"$folderToPurge/file.txt")) + assert(HdfsHelper.folderExists(s"$folderToPurge/folder")) - HdfsHelper.purgeFolder("src/test/resources/folder_to_purge", 1) + HdfsHelper.purgeFolder(folderToPurge, 63) + assert(HdfsHelper.fileExists(s"$folderToPurge/file.txt")) + assert(HdfsHelper.folderExists(s"$folderToPurge/folder")) - assert(HdfsHelper.fileExists("src/test/resources/folder_to_purge/file.txt")) - assert(HdfsHelper.folderExists("src/test/resources/folder_to_purge/folder")) + HdfsHelper.purgeFolder(folderToPurge, 1) + assert(HdfsHelper.fileExists(s"$folderToPurge/file.txt")) + assert(HdfsHelper.folderExists(s"$folderToPurge/folder")) val messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.purgeFolder("src/test/resources/folder_to_purge", -3) + HdfsHelper.purgeFolder(folderToPurge, -3) } val expectedMessage = "requirement failed: the purgeAge provided \"-3\" must be superior to 0." 
assert(messageThrown.getMessage === expectedMessage) - HdfsHelper.purgeFolder("src/test/resources/folder_to_purge", 0) + HdfsHelper.purgeFolder(folderToPurge, 0) + assert(!HdfsHelper.fileExists(s"$folderToPurge/file.txt")) + assert(!HdfsHelper.folderExists(s"$folderToPurge/folder")) + + HdfsHelper.deleteFolder(testFolder) + } + + test("Compress hdfs file") { + + val filePath = s"$testFolder/file.txt" + + HdfsHelper.deleteFile(filePath) + + HdfsHelper.writeToHdfsFile("hello\nworld", filePath) + HdfsHelper.compressFile(filePath, classOf[GzipCodec]) + + assert(HdfsHelper.fileExists(s"$filePath.gz")) - assert( - !HdfsHelper.fileExists("src/test/resources/folder_to_purge/file.txt")) - assert( - !HdfsHelper.folderExists("src/test/resources/folder_to_purge/folder")) + // Easy to test with spark, as reading a file with the ".gz" extension + // forces the read with the compression codec: + val content = sc.textFile(s"$filePath.gz").collect.sorted + assert(content === Array("hello", "world")) - HdfsHelper.deleteFolder("src/test/resources/folder_to_purge") + HdfsHelper.deleteFolder(testFolder) } } diff --git a/src/test/scala/com/spark_helper/MonitorTest.scala b/src/test/scala/com/spark_helper/MonitorTest.scala index 45381ac..bda85da 100644 --- a/src/test/scala/com/spark_helper/MonitorTest.scala +++ b/src/test/scala/com/spark_helper/MonitorTest.scala @@ -17,12 +17,12 @@ class MonitorTest extends FunSuite with SharedSparkContext { test("Basic monitoring testing") { - // Monitor is initialy successful: - assert(Monitor.isSuccess()) + // Monitor is initially successful: + assert(Monitor.isSuccess) // Here is what a report generated without any additional settings should // look like: var report = removeTimeStamps(Monitor.logs()) - assert(report === "[..:..] Begining\n") + assert(report === "[..:..] Beginning\n") // Include additional info which are placed in the report's header: Monitor.setTitle("Processing of whatever") @@ -30,33 +30,30 @@ class MonitorTest extends FunSuite with SharedSparkContext { Monitor.addDescription( "Documentation: https://github.com/xavierguihot/spark_helper") report = removeTimeStamps(Monitor.logs()) - var expectedReport = ( + var expectedReport = " Processing of whatever\n" + "\n" + "Point of contact: x.guihot@gmail.com, smbdy@gmail.com\n" + "Documentation: https://github.com/xavierguihot/spark_helper\n" + - "[..:..] Begining\n" - ) + "[..:..] Beginning\n" assert(report === expectedReport) // Simple text update without success modification: Monitor.reset() Monitor.log("My First Stage") report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] My First Stage\n" - ) assert(report === expectedReport) // Let's call .log() another time: Monitor.log("My Second Stage") report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] My First Stage\n" + "[..:..-..:..] My Second Stage\n" - ) assert(report === expectedReport) // Successive updates: @@ -64,33 +61,30 @@ class MonitorTest extends FunSuite with SharedSparkContext { Monitor.reset() Monitor.success("My First Stage") report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] 
My First Stage: success\n" - ) assert(report === expectedReport) - assert(Monitor.isSuccess()) + assert(Monitor.isSuccess) // Update report with a failure: Monitor.error("My Second Stage") report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] My First Stage: success\n" + "[..:..-..:..] My Second Stage: failed\n" - ) assert(report === expectedReport) - assert(!Monitor.isSuccess()) + assert(!Monitor.isSuccess) // A success after a failure, which must not overwrite the failure: Monitor.success("My Third Stage") report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] My First Stage: success\n" + "[..:..-..:..] My Second Stage: failed\n" + "[..:..-..:..] My Third Stage: success\n" - ) assert(report === expectedReport) - assert(!Monitor.isSuccess()) + assert(!Monitor.isSuccess) } test("Check current.ongoing live monitoring") { @@ -112,18 +106,17 @@ class MonitorTest extends FunSuite with SharedSparkContext { .toList .mkString("\n") - val expectedReport = ( + val expectedReport = " My Processing\n" + "\n" + "Point of contact: x.guihot@gmail.com, smbdy@gmail.com\n" + "Documentation: https://github.com/xavierguihot/spark_helper\n" + - "[..:..] Begining\n" + + "[..:..] Beginning\n" + "[..:..-..:..] Doing something\n" + "\n" + "WARNING: If this file exists it does not necessarily mean that " + "your job is still running. This file might persist if your job has " + "been killed and thus couldn't reach your call to the Monitor.store()." - ) assert(removeTimeStamps(reportStoredLines) === expectedReport) } @@ -132,7 +125,7 @@ class MonitorTest extends FunSuite with SharedSparkContext { Monitor.reset() // Explanation to someone running tests and seeing an error stack trace - // even though tests are actually successfull: + // even though tests are actually successful: println( "README: The following stack trace is NOT a test failure. This " + "is the logging/print of the tested stack trace error as it would " + @@ -146,14 +139,13 @@ class MonitorTest extends FunSuite with SharedSparkContext { Monitor.error(nfe, "Parse to integer", "my diagnostic") } // Warning, here I remove the stack trace because it depends on the - // java/scala version! And yes this test is a bit less usefull. + // java/scala version! And yes this test is a bit less useful. val report = removeTimeStamps(Monitor.logs()).split("\n").take(3).mkString("\n") - val expectedReport = ( - "[..:..] Begining\n" + + val expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] Parse to integer: failed\n" + " Diagnostic: my diagnostic" - ) assert(report === expectedReport) } @@ -171,11 +163,11 @@ class MonitorTest extends FunSuite with SharedSparkContext { ) assert(!success) - assert(!Monitor.isSuccess()) + assert(!Monitor.isSuccess) var report = removeTimeStamps(Monitor.logs()) - var expectedReport = ( - "[..:..] Begining\n" + + var expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] 
Tests for whatever: failed\n" + " KPI: pctOfWhatever\n" + " Value: 0.06%\n" + @@ -189,7 +181,6 @@ class MonitorTest extends FunSuite with SharedSparkContext { " Value: 1235.0\n" + " Must be equal to 1235.0\n" + " Validated: true\n" - ) assert(report === expectedReport) // 2: Single test: @@ -199,17 +190,16 @@ class MonitorTest extends FunSuite with SharedSparkContext { "Tests for whatever") assert(success) - assert(Monitor.isSuccess()) + assert(Monitor.isSuccess) report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] Tests for whatever: success\n" + " KPI: someNbr\n" + " Value: 5.5E7\n" + " Must be superior than 5.0E7\n" + " Validated: true\n" - ) assert(report === expectedReport) } @@ -235,15 +225,14 @@ class MonitorTest extends FunSuite with SharedSparkContext { .mkString("\n") .dropRight(2) + "00" // removes the seconds of the job duration - val expectedReport = ( + val expectedReport = " My Processing\n" + "\n" + "Point of contact: x.guihot@gmail.com\n" + "Documentation: https://github.com/xavierguihot/spark_helper\n" + - "[..:..] Begining\n" + + "[..:..] Beginning\n" + "[..:..-..:..] Doing something: success\n" + "[..:..] Duration: 00:00:00" - ) assert(removeTimeStamps(reportStoredLines) === expectedReport) } @@ -296,7 +285,7 @@ class MonitorTest extends FunSuite with SharedSparkContext { timeStampFreeLogs.substring(0, index) + "[..:..-..:..]" + timeStampFreeLogs.substring(index + 13) - index = timeStampFreeLogs.indexOf("[", index + 1); + index = timeStampFreeLogs.indexOf("[", index + 1) } timeStampFreeLogs diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index b3578df..70d706d 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -1,5 +1,9 @@ package com.spark_helper +import com.spark_helper.SparkHelper._ + +import org.apache.hadoop.io.compress.GzipCodec + import com.holdenkarau.spark.testing.{SharedSparkContext, RDDComparisons} import org.scalatest.FunSuite @@ -14,105 +18,100 @@ class SparkHelperTest with SharedSparkContext with RDDComparisons { + val resourceFolder = "src/test/resources" + test("Save as single text file") { - // 1: Without an intermediate working dir: + val testFolder = s"$resourceFolder/folder" + val singleTextFilePath = s"$testFolder/single_text_file.txt" + val tmpFolder = s"$resourceFolder/tmp" + + HdfsHelper.deleteFolder(testFolder) + HdfsHelper.deleteFolder(tmpFolder) + + val rddToStore = + sc.parallelize(Array("data_a", "data_b", "data_c")).repartition(3) - var repartitionedDataToStore = sc - .parallelize(Array("data_a", "data_b", "data_c")) - .repartition(3) + // 1: Without an intermediate working dir: - HdfsHelper.deleteFile("src/test/resources/single_text_file.txt") - SparkHelper.saveAsSingleTextFile( - repartitionedDataToStore, - "src/test/resources/single_text_file.txt") + rddToStore.saveAsSingleTextFile(singleTextFilePath) - var singleFileStoredData = sc - .textFile("src/test/resources/single_text_file.txt") - .collect() - .sorted + var singleFileStoredData = sc.textFile(singleTextFilePath).collect().sorted assert(singleFileStoredData === Array("data_a", "data_b", "data_c")) - HdfsHelper.deleteFile("src/test/resources/single_text_file.txt") + HdfsHelper.deleteFolder(testFolder) // 2: With an intermediate working dir: // Notice as well that we test by moving the single file in a folder // which doesn't exists. 
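This test walks through the call shapes of the pimped `saveAsSingleTextFile`; as a hedged quick reference (illustrative hdfs paths, `sc` assumed available):

```scala
import com.spark_helper.SparkHelper._
import org.apache.hadoop.io.compress.GzipCodec

val rdd = sc.parallelize(Array("data_a", "data_b", "data_c")).repartition(3)

// Merge all partitions into exactly one text file:
rdd.saveAsSingleTextFile("hdfs:///tmp/out/single_text_file.txt")

// Same, but the merge goes through an intermediate working folder:
rdd.saveAsSingleTextFile(
  "hdfs:///tmp/out/single_text_file.txt",
  workingFolder = "hdfs:///tmp/working_dir")

// Same, gzip-compressed:
rdd.saveAsSingleTextFile(
  "hdfs:///tmp/out/single_text_file.txt.gz",
  classOf[GzipCodec])
```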
 
-    repartitionedDataToStore = sc
-      .parallelize(Array("data_a", "data_b", "data_c"))
-      .repartition(3)
+    rddToStore.saveAsSingleTextFile(
+      singleTextFilePath,
+      workingFolder = tmpFolder
+    )
+
+    singleFileStoredData = sc.textFile(singleTextFilePath).collect().sorted
+
+    assert(singleFileStoredData === Array("data_a", "data_b", "data_c"))
+
+    HdfsHelper.deleteFolder(testFolder)
+    HdfsHelper.deleteFolder(tmpFolder)
 
-    HdfsHelper.deleteFile("src/test/resources/folder/single_text_file.txt")
-    HdfsHelper.deleteFolder("src/test/resources/folder")
-    SparkHelper.saveAsSingleTextFile(
-      repartitionedDataToStore,
-      "src/test/resources/folder/single_text_file.txt",
-      workingFolder = "src/test/resources/tmp")
-    assert(
-      HdfsHelper.fileExists("src/test/resources/folder/single_text_file.txt"))
+    // 3: With a compression codec:
 
-    singleFileStoredData = sc
-      .textFile("src/test/resources/folder/single_text_file.txt")
-      .collect()
-      .sorted
+    rddToStore
+      .saveAsSingleTextFile(s"$singleTextFilePath.gz", classOf[GzipCodec])
+
+    singleFileStoredData =
+      sc.textFile(s"$singleTextFilePath.gz").collect().sorted
 
     assert(singleFileStoredData === Array("data_a", "data_b", "data_c"))
 
-    HdfsHelper.deleteFolder("src/test/resources/folder")
-    HdfsHelper.deleteFolder("src/test/resources/tmp")
+    HdfsHelper.deleteFolder(testFolder)
   }
 
   test("Read text file with specific record delimiter") {
 
-    // 1: Let's read a file where a record begins with a line begining with
-    // 3 and other lines begining by 4:
+    val weirdFormatFilePath = s"$resourceFolder/some_weird_format.txt"
+
+    // 1: Let's read a file where a record begins with a line beginning with
+    // 3 and other lines beginning by 4:
 
-    HdfsHelper.deleteFile("src/test/resources/some_weird_format.txt")
+    HdfsHelper.deleteFile(weirdFormatFilePath)
 
-    val textContent = (
+    val textContent =
       "3 first line of the first record\n" +
        "4 another line of the first record\n" +
        "4 and another one for the first record\n" +
        "3 first line of the second record\n" +
        "3 first line of the third record\n" +
        "4 another line for the third record"
-    )
 
-    HdfsHelper
-      .writeToHdfsFile(textContent, "src/test/resources/some_weird_format.txt")
+    HdfsHelper.writeToHdfsFile(textContent, weirdFormatFilePath)
 
-    var computedRecords = SparkHelper
-      .textFileWithDelimiter(
-        "src/test/resources/some_weird_format.txt",
-        sc,
-        "\n3"
-      )
-      .collect()
+    var computedRecords = sc.textFile(weirdFormatFilePath, "\n3").collect()
 
     var expectedRecords = Array(
-      (
-        "3 first line of the first record\n" +
-          "4 another line of the first record\n" +
-          "4 and another one for the first record"
-      ),
+      "3 first line of the first record\n" +
+        "4 another line of the first record\n" +
+        "4 and another one for the first record",
      " first line of the second record",
-      (
-        " first line of the third record\n" +
-          "4 another line for the third record"
-      )
+      " first line of the third record\n" +
+        "4 another line for the third record"
     )
 
     assert(computedRecords === expectedRecords)
 
-    HdfsHelper.deleteFile("src/test/resources/some_weird_format.txt")
+    HdfsHelper.deleteFile(weirdFormatFilePath)
 
     // 2: Let's read an xml file:
 
-    HdfsHelper.deleteFile("src/test/resources/some_basic_xml.xml")
+    val xmlFilePath = s"$resourceFolder/some_basic_xml.xml"
+
+    HdfsHelper.deleteFile(xmlFilePath)
 
-    val xmlTextContent = (
+    val xmlTextContent =
      "<Customers>\n" +
        "<Customer>\n" +
        "<Address>34 thingy street, someplace, sometown</Address>\n" +
@@ -121,40 +120,47 @@ class SparkHelperTest
        "<Address>12 thingy street, someplace, sometown</Address>\n" +
        "</Customer>\n" +
        "</Customers>"
-    )
 
-    HdfsHelper
-      .writeToHdfsFile(xmlTextContent, "src/test/resources/some_basic_xml.xml")
+    HdfsHelper.writeToHdfsFile(xmlTextContent, xmlFilePath)
 
-    computedRecords = SparkHelper
-      .textFileWithDelimiter(
-        "src/test/resources/some_basic_xml.xml",
-        sc,
-        "<Customer>\n"
-      )
-      .collect()
+    computedRecords = sc.textFile(xmlFilePath, "<Customer>\n").collect()
 
     expectedRecords = Array(
      "<Customers>\n",
-      (
-        "<Address>34 thingy street, someplace, sometown</Address>\n" +
-          "</Customer>\n"
-      ),
-      (
-        "<Address>12 thingy street, someplace, sometown</Address>\n" +
-          "</Customer>\n" +
-          "</Customers>"
-      )
+      "<Address>34 thingy street, someplace, sometown</Address>\n" +
+        "</Customer>\n",
+      "<Address>12 thingy street, someplace, sometown</Address>\n" +
+        "</Customer>\n" +
+        "</Customers>"
     )
 
     assert(computedRecords === expectedRecords)
 
-    HdfsHelper.deleteFile("src/test/resources/some_basic_xml.xml")
+    HdfsHelper.deleteFile(xmlFilePath)
+  }
+
+  test("Flatten RDD") {
+
+    var in = sc.parallelize(Array(Seq(1, 2, 3), Seq(), Nil, Seq(4), Seq(5, 6)))
+    var out = sc.parallelize(Array(1, 2, 3, 4, 5, 6))
+    assertRDDEquals(in.flatten, out)
+
+    in = sc.parallelize(Array(List(1, 2, 3), List(), Nil, List(4), List(5, 6)))
+    out = sc.parallelize(Array(1, 2, 3, 4, 5, 6))
+    assertRDDEquals(in.flatten, out)
+
+    val in2 = sc.parallelize(Array(Option(1), None, Option(2)))
+    val out2 = sc.parallelize(Array(1, 2))
+    assertRDDEquals(in2.flatten, out2)
   }
 
   test("Save as text file by key") {
 
-    HdfsHelper.deleteFolder("src/test/resources/key_value_storage")
+    val keyValueFolder = s"$resourceFolder/key_value_storage"
+
+    // 1: Let's store key values per file:
+
+    HdfsHelper.deleteFolder(keyValueFolder)
 
     val someKeyValueRdd = sc.parallelize[(String, String)](
       Array(
@@ -168,79 +174,129 @@ class SparkHelperTest
       )
     )
 
-    SparkHelper.saveAsTextFileByKey(
-      someKeyValueRdd,
-      "src/test/resources/key_value_storage",
-      3)
+    someKeyValueRdd.saveAsTextFileByKey(keyValueFolder, 3)
+
+    // The folder key_value_storage has been created:
+    assert(HdfsHelper.folderExists(keyValueFolder))
+
+    // And it contains one file per key:
+    var generatedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder)
+    var expectedKeyFiles = List("_SUCCESS", "key_1", "key_2", "key_3")
+    assert(generatedKeyFiles === expectedKeyFiles)
+
+    var valuesForKey1 = sc.textFile(s"$keyValueFolder/key_1").collect().sorted
+    assert(valuesForKey1 === Array("value_a", "value_b"))
+
+    val valuesForKey2 = sc.textFile(s"$keyValueFolder/key_2").collect().sorted
+    assert(valuesForKey2 === Array("value_b", "value_c", "value_d"))
+
+    val valuesForKey3 = sc.textFile(s"$keyValueFolder/key_3").collect().sorted
+    assert(valuesForKey3 === Array("value_a", "value_b"))
+
+    // 2: Let's store key values per file; but without providing the nbr of
+    // keys:
+
+    HdfsHelper.deleteFolder(keyValueFolder)
+
+    someKeyValueRdd.saveAsTextFileByKey(keyValueFolder)
 
     // The folder key_value_storage has been created:
-    assert(HdfsHelper.folderExists("src/test/resources/key_value_storage"))
+    assert(HdfsHelper.folderExists(keyValueFolder))
 
     // And it contains one file per key:
-    val genratedKeyFiles = HdfsHelper
-      .listFileNamesInFolder("src/test/resources/key_value_storage")
-    val expectedKeyFiles = List("_SUCCESS", "key_1", "key_2", "key_3")
-    assert(genratedKeyFiles === expectedKeyFiles)
+    generatedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder)
+    expectedKeyFiles = List("_SUCCESS", "key_1", "key_2", "key_3")
+    assert(generatedKeyFiles === expectedKeyFiles)
+
+    valuesForKey1 = sc.textFile(s"$keyValueFolder/key_1").collect().sorted
+    assert(valuesForKey1 === Array("value_a", "value_b"))
+
+    // 3: Let's store key values per file and compress these files:
+
+    HdfsHelper.deleteFolder(keyValueFolder)
+
+    someKeyValueRdd.saveAsTextFileByKey(keyValueFolder, 3, classOf[GzipCodec])
 
-    val valuesForKey1 = sc
-      .textFile("src/test/resources/key_value_storage/key_1")
-      .collect()
-      .sorted
+    // The folder key_value_storage has been created:
+    assert(HdfsHelper.folderExists(keyValueFolder))
+
+    // And it contains one file per key:
+    generatedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder)
+    expectedKeyFiles = List("_SUCCESS", "key_1.gz", "key_2.gz", "key_3.gz")
+    assert(generatedKeyFiles === expectedKeyFiles)
+    valuesForKey1 =
+      sc.textFile(s"$keyValueFolder/key_1.gz").collect().sorted
     assert(valuesForKey1 === Array("value_a", "value_b"))
 
-    val valuesForKey2 = sc
-      .textFile("src/test/resources/key_value_storage/key_2")
-      .collect()
-      .sorted
+    HdfsHelper.deleteFolder(keyValueFolder)
+  }
 
-    assert(valuesForKey2 === Array("value_b", "value_c", "value_d"))
+  test("Save as text file and reduce nbr of partitions") {
 
-    val valuesForKey3 = sc
-      .textFile("src/test/resources/key_value_storage/key_3")
-      .collect()
-      .sorted
+    val testFolder = s"$resourceFolder/folder"
 
-    assert(valuesForKey3 === Array("value_a", "value_b"))
+    HdfsHelper.deleteFolder(testFolder)
+
+    val rddToStore =
+      sc.parallelize(Array("data_a", "data_b", "data_c")).repartition(3)
+
+    // 1: Without compressing:
+
+    rddToStore.saveAsTextFileAndCoalesce(testFolder, 2)
+
+    // Let's check the nbr of partitions:
+    var generatedKeyFiles = HdfsHelper.listFileNamesInFolder(testFolder)
+    var expectedKeyFiles = List("_SUCCESS", "part-00000", "part-00001")
+    assert(generatedKeyFiles === expectedKeyFiles)
+
+    // And let's check the content:
+    var singleFileStoredData = sc.textFile(testFolder).collect().sorted
+    assert(singleFileStoredData === Array("data_a", "data_b", "data_c"))
+
+    HdfsHelper.deleteFolder(testFolder)
+
+    // 2: By compressing:
+
+    rddToStore.saveAsTextFileAndCoalesce(testFolder, 2, classOf[GzipCodec])
+
+    // Let's check the nbr of partitions:
+    generatedKeyFiles = HdfsHelper.listFileNamesInFolder(testFolder)
+    expectedKeyFiles = List("_SUCCESS", "part-00000.gz", "part-00001.gz")
+    assert(generatedKeyFiles === expectedKeyFiles)
+
+    // And let's check the content:
+    singleFileStoredData = sc.textFile(testFolder).collect().sorted
+    assert(singleFileStoredData === Array("data_a", "data_b", "data_c"))
 
-    HdfsHelper.deleteFolder("src/test/resources/key_value_storage")
+    HdfsHelper.deleteFolder(testFolder)
   }
 
   test("Decrease coalescence level") {
 
-    HdfsHelper.deleteFolder("src/test/resources/re_coalescence_test_input")
-    HdfsHelper.deleteFolder("src/test/resources/re_coalescence_test_output")
+    val inputTestFolder = s"$resourceFolder/re_coalescence_test_input"
+    val outputTestFolder = s"$resourceFolder/re_coalescence_test_output"
+
+    HdfsHelper.deleteFolder(inputTestFolder)
+    HdfsHelper.deleteFolder(outputTestFolder)
 
     // Let's create the folder with high level of coalescence (3 files):
-    SparkHelper.saveAsSingleTextFile(
-      sc.parallelize[String](Array("data_1_a", "data_1_b", "data_1_c")),
-      "src/test/resources/re_coalescence_test_input/input_file_1")
-    SparkHelper.saveAsSingleTextFile(
-      sc.parallelize[String](Array("data_2_a", "data_2_b")),
-      "src/test/resources/re_coalescence_test_input/input_file_2")
-    SparkHelper.saveAsSingleTextFile(
-      sc.parallelize[String](Array("data_3_a", "data_3_b", "data_3_c")),
-      "src/test/resources/re_coalescence_test_input/input_file_3")
+    sc.parallelize(Array("data_1_a", "data_1_b", "data_1_c"))
+      .saveAsSingleTextFile(s"$inputTestFolder/input_file_1")
+    sc.parallelize(Array("data_2_a", "data_2_b"))
+      .saveAsSingleTextFile(s"$inputTestFolder/input_file_2")
+    sc.parallelize(Array("data_3_a", "data_3_b", "data_3_c"))
+      .saveAsSingleTextFile(s"$inputTestFolder/input_file_3")
 
     // Let's decrease the coalescence level in order to only have 2 files:
-    SparkHelper.decreaseCoalescence(
-      "src/test/resources/re_coalescence_test_input",
-      "src/test/resources/re_coalescence_test_output",
-      2,
-      sc)
+    sc.decreaseCoalescence(inputTestFolder, outputTestFolder, 2)
 
     // And we check we have two files in output:
-    val outputFileList = HdfsHelper
-      .listFileNamesInFolder("src/test/resources/re_coalescence_test_output")
+    val outputFileList = HdfsHelper.listFileNamesInFolder(outputTestFolder)
     val expectedFileList = List("_SUCCESS", "part-00000", "part-00001")
     assert(outputFileList === expectedFileList)
 
     // And that all input data is in the output:
-    val outputData = sc
-      .textFile("src/test/resources/re_coalescence_test_output")
-      .collect
-      .sorted
-
+    val outputData = sc.textFile(outputTestFolder).collect.sorted
     val expectedOutputData = Array(
       "data_1_a",
       "data_1_b",
@@ -253,38 +309,45 @@ class SparkHelperTest
     )
     assert(outputData === expectedOutputData)
 
-    HdfsHelper.deleteFolder("src/test/resources/re_coalescence_test_output")
+    HdfsHelper.deleteFolder(inputTestFolder)
+    HdfsHelper.deleteFolder(outputTestFolder)
   }
 
   test(
     "Extract lines of files to an RDD of tuple containing the line and file " +
      "the line comes from") {
 
-    HdfsHelper.deleteFolder("src/test/resources/with_file_name")
+    val testFolder = s"$resourceFolder/with_file_name"
+
+    HdfsHelper.deleteFolder(testFolder)
+
     HdfsHelper.writeToHdfsFile(
       "data_1_a\ndata_1_b\ndata_1_c",
-      "src/test/resources/with_file_name/file_1.txt")
+      s"$testFolder/file_1.txt"
+    )
     HdfsHelper.writeToHdfsFile(
       "data_2_a\ndata_2_b",
-      "src/test/resources/with_file_name/file_2.txt")
+      s"$testFolder/file_2.txt"
+    )
     HdfsHelper.writeToHdfsFile(
       "data_3_a\ndata_3_b\ndata_3_c\ndata_3_d",
-      "src/test/resources/with_file_name/folder_1/file_3.txt")
+      s"$testFolder/folder_1/file_3.txt"
+    )
 
-    val computedRdd = SparkHelper
-      .textFileWithFileName("src/test/resources/with_file_name", sc)
+    val computedRdd = sc
+      .textFileWithFileName(testFolder)
      // We remove the part of the path which is specific to the local machine
      // on which the test run:
      .map {
        case (filePath, line) =>
          val nonLocalPath = filePath.split("src/test/") match {
-            case Array(localPartOfPath, projectRelativePath) =>
+            case Array(_, projectRelativePath) =>
              "file:/.../src/test/" + projectRelativePath
          }
          (nonLocalPath, line)
      }
 
-    val expectedRDD = sc.parallelize(
+    val expectedRdd = sc.parallelize(
      Array(
        ("file:/.../src/test/resources/with_file_name/file_1.txt", "data_1_a"),
        ("file:/.../src/test/resources/with_file_name/file_1.txt", "data_1_b"),
@@ -305,8 +368,41 @@ class SparkHelperTest
        ("file:/.../src/test/resources/with_file_name/file_2.txt", "data_2_b")
      ))
 
-    assertRDDEquals(computedRdd, expectedRDD)
+    assertRDDEquals(computedRdd, expectedRdd)
+
+    HdfsHelper.deleteFolder(testFolder)
+  }
+
+  test("textFile with files containing commas in their path") {
+
+    val testFolder = s"$resourceFolder/files_containing_commas"
+
+    HdfsHelper.deleteFolder(testFolder)
+
+    HdfsHelper.writeToHdfsFile(
+      "data_1_a\ndata_1_b",
+      s"$testFolder/file,1.txt"
+    )
+    HdfsHelper.writeToHdfsFile(
+      "data_2_a\ndata_2_b",
+      s"$testFolder/file_2.txt"
+    )
+
+    val computedRdd =
+      sc.textFile(List(s"$testFolder/file,1.txt", s"$testFolder/file_2.txt"))
+    val expectedRdd =
+      sc.parallelize("data_1_a\ndata_1_b\ndata_2_a\ndata_2_b".split("\n"))
+
+    assertRDDEquals(computedRdd, expectedRdd)
+
+    HdfsHelper.deleteFolder(testFolder)
+  }
+
+  test("Partial map") {
 
-    HdfsHelper.deleteFolder("src/test/resources/with_file_name")
+    val in = sc.parallelize(Array(1, 3, 2, 7, 8))
+    val computedOut = in.partialMap { case a if a % 2 == 0 => 2 * a }
+    val expectedOut = sc.parallelize(Array(1, 3, 4, 7, 16))
+    assertRDDEquals(computedOut, expectedOut)
   }
 }