diff --git a/.gitignore b/.gitignore
index d838934..b76ffde 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@ project/target
target
*.crc
+
+.idea
diff --git a/README.md b/README.md
index e36dad4..90015ab 100644
--- a/README.md
+++ b/README.md
@@ -5,8 +5,6 @@
## Overview
-Version: 1.1.1
-
API Scaladoc: [SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$)
This library contains a bunch of low-level basic methods for data processing
@@ -18,14 +16,14 @@ names are self-explanatory and readable.
This also provides a monitoring/logger tool.
-This is a bunch of 4 modules:
+This is a set of 4 modules:
-* [HdfsHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.HdfsHelper$): Wrapper around [apache Hadoop FileSystem API](https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileSystem.html) for file manipulations on hdfs.
-* [SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$): Hdfs file manipulations through the Spark API.
+* [HdfsHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.HdfsHelper$): Wrapper around the [apache Hadoop FileSystem API](https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileSystem.html) for file manipulations on hdfs.
+* [SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$): Hdfs file manipulations through the Spark API (pimped RDDs and SparkContext).
* [DateHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.DateHelper$): Wrapper around [joda-time](http://www.joda.org/joda-time/apidocs/) for usual data mining dates manipulations.
* [Monitor](http://xavierguihot.com/spark_helper/#com.spark_helper.Monitor$): Spark custom monitoring/logger and kpi validator.
-Compatible with Spark 2.
+Compatible with Spark 2.x
### HdfsHelper:
@@ -36,21 +34,21 @@ The full list of methods is available at
Contains basic file-related methods mostly based on hdfs apache Hadoop
FileSystem API [org.apache.hadoop.fs.FileSystem](https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileSystem.html).
-For instance, one don't want to remove a file from hdfs using 3 lines of code
-and thus could instead just use `HdfsHelper.deleteFile("my/hdfs/file/path.csv")`.
-
-A non-exhaustive list of exemples:
+A non-exhaustive list of examples:
```scala
import com.spark_helper.HdfsHelper
// A bunch of methods wrapping the FileSystem API, such as:
-HdfsHelper.fileExists("my/hdfs/file/path.txt")
-assert(HdfsHelper.listFileNamesInFolder("my/folder/path") == List("file_name_1.txt", "file_name_2.csv"))
-assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
-assert(HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") == 3)
-HdfsHelper.deleteFile("my/hdfs/file/path.csv")
-HdfsHelper.moveFolder("my/hdfs/folder")
+HdfsHelper.fileExists("my/hdfs/file/path.txt") // HdfsHelper.folderExists("my/hdfs/folder")
+HdfsHelper.listFileNamesInFolder("my/folder/path") // List("file_name_1.txt", "file_name_2.csv")
+HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") // "20170306"
+HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") // 3
+HdfsHelper.deleteFile("my/hdfs/file/path.csv") // HdfsHelper.deleteFolder("my/hdfs/folder")
+HdfsHelper.moveFolder("old/path", "new/path") // HdfsHelper.moveFile("old/path.txt", "new/path.txt")
+HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token") // HdfsHelper.createFolder("my/hdfs/folder")
+
+// File content helpers:
HdfsHelper.compressFile("hdfs/path/to/uncompressed_file.txt", classOf[GzipCodec])
HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1")
@@ -58,46 +56,71 @@ HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1")
HdfsHelper.isHdfsXmlCompliantWithXsd("my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd"))
HdfsHelper.loadXmlFileFromHdfs("my/hdfs/file/path.xml")
-// Very handy to load a config (typesafe format) stored on hdfs at the begining of a spark job:
+// Very handy to load a config (typesafe format) stored on hdfs at the beginning of a spark job:
HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config
// In order to write small amount of data in a file on hdfs without the whole spark stack:
HdfsHelper.writeToHdfsFile(Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
+// or:
+import com.spark_helper.HdfsHelper._
+Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
+"hello world".writeToHdfs("/some/hdfs/file/path.txt")
// Deletes all files/folders in "hdfs/path/to/folder" for which the timestamp is older than 10 days:
HdfsHelper.purgeFolder("hdfs/path/to/folder", 10)
```
+In case a specific configuration is needed to access the file system, these
+setters are available:
+
+```scala
+// To use a specific conf FileSystem.get(whateverConf) instead of FileSystem.get(new Configuration()):
+HdfsHelper.setConf(whateverConf)
+// Or directly the FileSystem:
+HdfsHelper.setFileSystem(whateverFileSystem)
+```
+
### SparkHelper:
The full list of methods is available at
[SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$).
-Contains basic file/RRD-related methods based on the Spark APIs.
+Contains basic RDD-related methods.
-A non-exhaustive list of exemples:
+A non-exhaustive list of examples:
```scala
-import com.spark_helper.SparkHelper
+import com.spark_helper.SparkHelper._
-// Same as SparkContext.saveAsTextFile, but the result is a single file:
-SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt")
+// Same as rdd.saveAsTextFile("path"), but the result is a single file (while
+// keeping the processing distributed):
+rdd.saveAsSingleTextFile("/my/output/file/path.txt")
+rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec])
-// Same as SparkContext.textFile, but instead of reading one record per line,
-// it reads records spread over several lines. This way, xml, json, yml or
-// any multi-line record file format can be used with Spark:
-SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n")
+// Same as sc.textFile("path"), but instead of reading one record per line (by
+// splitting the input with \n), it splits the file in records based on a custom
+// delimiter. This way, xml, json, yml or any multi-line record file format can
+// be used with Spark:
+sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance
-// Equivalent to sparkContext.textFile(), but for each line is tupled with its
-// file path:
-SparkHelper.textFileWithFileName("folder", sparkContext)
+// Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options:
+rdd.flatten
+
+// Equivalent to sc.textFile(), but each line is tupled with its file path:
+sc.textFileWithFileName("/my/input/folder/path")
// which produces:
-RDD(
- ("file:/path/on/machine/folder/file_1.txt", "record1fromfile1"),
- ("file:/path/on/machine/folder/file_1.txt", "record2fromfile1"),
- ("file:/path/on/machine/folder/file_2.txt", "record1fromfile2"),
- ...
-)
+// RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"),
+// ("folder/file_2.txt", "record1fromfile2"), ...)
+
+// In the given folder, this generates one file per key in the given key/value
+// RDD. Within each file (named from the key) are all values for this key:
+rdd.saveAsTextFileByKey("/my/output/folder/path")
+
+// Partial map: elements matched by the partial function are transformed, the
+// others are kept as is (this example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)):
+rdd.partialMap { case a if a % 2 == 0 => 2 * a }
+
+// For when input file names contain commas and sc.textFile can't handle them:
+sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))
```
### DateHelper:
@@ -106,21 +129,43 @@ The full list of methods is available at
[DateHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.DateHelper$).
Wrapper around [joda-time](http://www.joda.org/joda-time/apidocs/) for
-data-mining classic dates manipulations.
+data-mining classic dates manipulations and job scheduling.
-A non-exhaustive list of exemples:
+A non-exhaustive list of examples:
```scala
import com.spark_helper.DateHelper
-assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
-assert(DateHelper.today() == "20170310") // If today's "20170310"
-assert(DateHelper.yesterday() == "20170309") // If today's "20170310"
-assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327")
-assert(DateHelper.now("HH:mm") == "10:24")
-assert(DateHelper.currentTimestamp() == "1493105229736")
-assert(DateHelper.nDaysBefore(3) == "20170307") // If today's "20170310"
-assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310")
+DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101")
+DateHelper.today // "20170310"
+DateHelper.yesterday // "20170309"
+DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327"
+DateHelper.now("HH:mm") // "10:24"
+DateHelper.currentTimestamp // "1493105229736"
+DateHelper.nDaysBefore(3) // "20170307"
+DateHelper.nDaysAfterDate(3, "20170307") // "20170310"
+DateHelper.nextDay("20170310") // "20170311"
+DateHelper.nbrOfDaysSince("20170302") // 8
+DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5
+DateHelper.dayOfWeek("20160614") // 2
+
+import com.spark_helper.DateHelper._
+
+2.daysAgo // "20170308"
+"20161230" to "20170101" // List("20161230", "20161231", "20170101")
+3.daysBefore("20170310") // "20170307"
+5.daysAfter // "20170315"
+4.daysAfter("20170310") // "20170314"
+"20170302".isCompliantWith("yyyyMMdd")
+"20170310".nextDay // "20170311"
+"20170310".previousDay // "20170309"
+```
+
+The default format (when no format is specified) is "yyyyMMdd" (20170327). It
+can be modified globally with:
+
+```scala
+DateHelper.setFormat("ddMMMyy")
```
### Monitor:
@@ -128,15 +173,15 @@ assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310")
The full list of methods is available at
[Monitor](http://xavierguihot.com/spark_helper/#com.spark_helper.Monitor$)
-It's a simple logger/report which contains a report that one can update from
-the driver and a success state. The idea is to persist job executions logs and
-errors (and forget about grepping unreadable yarn logs).
+It's a simple logger/report which contains a report and a state that one can
+update from the driver. The idea is to persist job executions logs and errors
+(and forget about grepping unreadable yarn logs).
-It's designed for perdiodic spark jobs (handles storage and purge of logs) and
+It's designed for periodic spark jobs (handles storage and purge of logs) and
provides a way to handle kpis validation.
Logs are stored on the go which means one can have a direct real time access of
-the job logs/status and it's current state (which can overwise be a pain if it
+the job logs/status and its current state (which can otherwise be a pain if it
means going through yarn logs, or even for certain production environments going
through additional layers of software logs to get to yarn logs).
@@ -150,9 +195,9 @@ the logger for a clean logging.
This is a "driver-only" logger and is not intended at logging concurrent actions
from executors.
-Produced reports can easily be inserted in a notification email whenerver the
+Produced reports can easily be inserted in a notification email whenever the
job fails, which saves a lot of time to maintainers operating on heavy
-production environements.
+production environments.
The produced persisted report is also a way for downstream jobs to know the
status of their input data.
@@ -190,7 +235,7 @@ try {
Monitor.error(e, "My pipeline descirption") // whatever unexpected error
}
-if (Monitor.isSuccess()) {
+if (Monitor.isSuccess) {
val doMore = "Let's do some more stuff!"
Monitor.log("My second pipeline description: success")
}
@@ -199,9 +244,9 @@ if (Monitor.isSuccess()) {
// HDFS (this saves the logs in the folder set with Monitor.setLogFolder):
Monitor.store()
-// At the end of the job, if the job isn't successfull, you might want to
+// At the end of the job, if the job isn't successful, you might want to
// crash it (for instance to get a notification from your scheduler):
-if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ...
+if (!Monitor.isSuccess) throw new Exception() // or send an email, or ...
```
At any time during the job, logs can be accessed from file
@@ -214,7 +259,7 @@ Here are some possible reports generated by the previous pipeline:
My job description (whatever you want); for instance:
Documentation: https://github.com/xavierguihot/spark_helper
-[10:23] Begining
+[10:23] Beginning
[10:23-10:23] My pipeline descirption: failed
Diagnostic: No input data!
org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path
@@ -231,7 +276,7 @@ or
My job description (whatever you want); for instance:
Documentation: https://github.com/xavierguihot/spark_helper
-[10:23] Begining
+[10:23] Beginning
[10:23-10:41] My pipeline descirption: success
KPI: Nbr of output records
Value: 14669071.0
@@ -248,15 +293,15 @@ Documentation: https://github.com/xavierguihot/spark_helper
## Including spark_helper to your dependencies:
-With sbt, add these lines to your build.sbt:
+With sbt:
```scala
resolvers += "jitpack" at "https://jitpack.io"
-libraryDependencies += "com.github.xavierguihot" % "spark_helper" % "v1.1.1"
+libraryDependencies += "com.github.xavierguihot" % "spark_helper" % "2.0.0"
```
-With maven, add these lines to your pom.xml:
+With maven:
```xml
Returns which date it will be x days after the given date under the
+default format. Returns which date it will be x days after the given date under the
+default format. If the given date is "20170122" and we request the date it will be 3
+days after, we'll return "20170125". the date under the default format for which we want the date
+for nbrOfDaysAfter days after. the date it was nbrOfDaysAfter after date under the default
+format. Returns which date it will be x days after today under the default format. Returns which date it will be x days after today under the default format. If we're "20170125" and we request for 3 days after, we'll return
+"20170127". today's date plus the given nbr of days Returns which date it was x days before today under the default format. Returns which date it was x days before today under the default format. If we're "20170125" and we request for 3 days before, we'll return
+"20170122". today's date minus the given nbr of days Returns which date it was x days before the given date. Returns which date it was x days before the given date. If the given date is "20170125" and we request the date it was 3 days
+before, this will return "20170122". the date under the default format for which we want the date
+for nbrOfDaysBefore days before. the date it was nbrOfDaysBefore before date under the default
+format. Validates the formatted date is compliant with the provided format. Validates the formatted date is compliant with the provided format. if the provided date is under the provided format Returns the date one day after the given date. Returns the date one day after the given date. the date of the day after the given date Returns the date one day before the given date. Returns the date one day before the given date. the date of the day before the given date Creates the list of dates between the two given dates. Creates the list of dates between the two given dates. the last date the list of dates between this string and the lastDate in the
+default format. A facility which deals with usual date needs (wrapper around
joda-time). The goal is to remove the maximum of highly used low-level code from your
spark job and replace it with methods fully tested whose name is
-self-explanatory/readable. A few exemples: Source import com.spark_helper.DateHelper
+
+DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101")
+DateHelper.today // "20170310"
+DateHelper.yesterday // "20170309"
+DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327"
+DateHelper.now("HH:mm") // "10:24"
+DateHelper.currentTimestamp // "1493105229736"
+DateHelper.nDaysBefore(3) // "20170307"
+DateHelper.nDaysAfterDate(3, "20170307") // "20170310"
+DateHelper.nextDay("20170310") // "20170311"
+DateHelper.nbrOfDaysSince("20170302") // 8
+DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5
+DateHelper.dayOfWeek("20160614") // 2
+
+import com.spark_helper.DateHelper._
+
+2.daysAgo // "20170308"
+"20161230" to "20170101" // List("20161230", "20161231", "20170101")
+3.daysBefore("20170310") // "20170307"
+5.daysAfter // "20170315"
+4.daysAfter("20170310") // "20170314"
+"20170302".isCompliantWith("yyyyMMdd")
+"20170310".nextDay // "20170311"
+"20170310".previousDay // "20170309" Source DateHelper
2017-02 Returns the date associated to the given UTC timestamp. Returns the date associated to the given UTC timestamp. the UTC timestamps (nbr of millis since 1970-01-01) for
-which to get the associated date. (default = "yyyyMMdd") the format of the provided dates the associated date under the requested format the format of the provided dates the associated date under the requested format Returns the day of week for a date under the given format. Returns the day of week for a date under the given format. A Monday is 1 and a Sunday is 7. the date for which to get the day of week (default = "yyyyMMdd") the format under which the date is
-provided. the associated day of week, such as 2 for Tuesday Returns the day of week for a date under the given format. Returns the day of week for a date under the given format. A Monday is 1 and a Sunday is 7. the date for which to get the day of week the format under which the date is provided the associated day of week, such as 2 for Tuesday Finds the list of dates between the two given dates. Finds the list of dates between the two given dates. the first date (in the given format) the last date (in the given format) (default = "yyyyMMdd") the format to use for firstDate and
-lastDate and for the returned list of dates. the list of dates between firstDate and lastDate in the given
+ Finds the list of dates between the two given dates. Finds the list of dates between the two given dates. the first date (in the given format) the last date (in the given format) the format to use for firstDate and lastDate and for the
+returned list of dates. the list of dates between firstDate and lastDate in the given
format. the stringified date if the provided date is under the provided format the formatted date if the provided date is under the provided format
+
+ implicit
+ class
+
+
+ IntExtensions extends AnyRef
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Instance Constructors
+
+
+
+
+ new
+
+
+ IntExtensions(int: Int)
+
+
+
+
+
+
+
+ Value Members
+
+
+
+ final
+ def
+
+
+ !=(arg0: Any): Boolean
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ ##(): Int
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ ==(arg0: Any): Boolean
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ asInstanceOf[T0]: T0
+
+
+
+
+
+
+
+
+
+ def
+
+
+ clone(): AnyRef
+
+
+
+
+
+
+
+
+
+ def
+
+
+ daysAfter(date: String): String
+
+
+
+
+
+
+ assert(5.daysAfter("20170305") == "20170310")
+
+
+ def
+
+
+ daysAfter: String
+
+
+
+
+
+
+ // If today's "20170310":
+assert(3.daysAfter == "20170313")
+
+
+ def
+
+
+ daysAgo: String
+
+
+
+
+
+
+ // If today's "20170310":
+assert(3.daysAgo == "20170307")
+
+
+ def
+
+
+ daysBefore(date: String): String
+
+
+
+
+
+
+ assert(3.daysBefore("20170310") == "20170307")
+
+ final
+ def
+
+
+ eq(arg0: AnyRef): Boolean
+
+
+
+
+
+
+
+
+
+ def
+
+
+ equals(arg0: Any): Boolean
+
+
+
+
+
+
+
+
+
+ def
+
+
+ finalize(): Unit
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ getClass(): Class[_]
+
+
+
+
+
+
+
+
+
+ def
+
+
+ hashCode(): Int
+
+
+
+
+
+
+
+
+
+ val
+
+
+ int: Int
+
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ isInstanceOf[T0]: Boolean
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ ne(arg0: AnyRef): Boolean
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ notify(): Unit
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ notifyAll(): Unit
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ synchronized[T0](arg0: ⇒ T0): T0
+
+
+
+
+
+
+
+
+
+ def
+
+
+ toString(): String
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ wait(): Unit
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ wait(arg0: Long, arg1: Int): Unit
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ wait(arg0: Long): Unit
+
+
+
+
+
+
+
Inherited from AnyRef
+ Inherited from Any
+ Ungrouped
+
+
+
+ implicit
+ class
+
+
+ StringExtensions extends AnyRef
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Instance Constructors
+
+
+
+
+ new
+
+
+ StringExtensions(string: String)
+
+
+
+
+
+
+
+ Value Members
+
+
+
+ final
+ def
+
+
+ !=(arg0: Any): Boolean
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ ##(): Int
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ ==(arg0: Any): Boolean
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ asInstanceOf[T0]: T0
+
+
+
+
+
+
+
+
+
+ def
+
+
+ clone(): AnyRef
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ eq(arg0: AnyRef): Boolean
+
+
+
+
+
+
+
+
+
+ def
+
+
+ equals(arg0: Any): Boolean
+
+
+
+
+
+
+
+
+
+ def
+
+
+ finalize(): Unit
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ getClass(): Class[_]
+
+
+
+
+
+
+
+
+
+ def
+
+
+ hashCode(): Int
+
+
+
+
+
+
+
+
+
+ def
+
+
+ isCompliantWith(format: String): Boolean
+
+
+
+
+
+
+ assert("20170302".isCompliantWith("yyyyMMdd"))
+assert(!"20170333".isCompliantWith("yyyyMMdd"))
+assert("20170228".isCompliantWith("yyyyMMdd"))
+assert(!"20170229".isCompliantWith("yyyyMMdd"))
+assert(!"170228".isCompliantWith("yyyyMMdd"))
+assert(!"".isCompliantWith("yyyyMMdd"))
+assert(!"a".isCompliantWith("yyyyMMdd"))
+assert(!"24JAN17".isCompliantWith("yyyyMMdd"))
+
+ final
+ def
+
+
+ isInstanceOf[T0]: Boolean
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ ne(arg0: AnyRef): Boolean
+
+
+
+
+
+
+
+
+
+ def
+
+
+ nextDay: String
+
+
+
+
+
+
+ assert("20170310".nextDay == "20170311")
+
+ final
+ def
+
+
+ notify(): Unit
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ notifyAll(): Unit
+
+
+
+
+
+
+
+
+
+ def
+
+
+ previousDay: String
+
+
+
+
+
+
+ assert("20170310".previousDay == "20170309")
+
+
+ val
+
+
+ string: String
+
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ synchronized[T0](arg0: ⇒ T0): T0
+
+
+
+
+
+
+
+
+
+ def
+
+
+ to(lastDate: String): List[String]
+
+
+
+
+
+
+ assert(("20161230" to "20170101") == List("20161230", "20161231", "20170101"))
+
+
+ def
+
+
+ toString(): String
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ wait(): Unit
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ wait(arg0: Long, arg1: Int): Unit
+
+
+
+
+
+
+
+
+ final
+ def
+
+
+ wait(arg0: Long): Unit
+
+
+
+
+
+
+
Inherited from AnyRef
+ Inherited from Any
+ Ungrouped
+
+
assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
-assert(DateHelper.today() == "20170310") // If today's "20170310"
-assert(DateHelper.yesterday() == "20170309") // If today's "20170310"
-assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327")
-assert(DateHelper.now("HH:mm") == "10:24")
-assert(DateHelper.currentTimestamp() == "1493105229736")
-assert(DateHelper.nDaysBefore(3) == "20170307") // If today's "20170310"
-assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310")
Type Members
+
+
+
+ implicit
+ class
+
+
+ IntExtensions extends AnyRef
+
+
+
+
+
+
+
+
+
+ implicit
+ class
+
+
+ StringExtensions extends AnyRef
+
+
+
+
+
+
+
+
def
- currentTimestamp(): String
+ currentTimestamp: String
-
+
@@ -241,7 +295,7 @@
def
- dateFromTimestamp(timestamp: Long, format: String = "yyyyMMdd"): String
+ dateFromTimestamp(timestamp: Long, format: String = defaultFormat): String
@@ -250,7 +304,7 @@
assert(DateHelper.dateFromTimestamp(1496074819L) == "20170529")
assert(DateHelper.dateFromTimestamp(1496074819L, "yyMMdd") == "170529")
def
- dayOfWeek(date: String, format: String = "yyyyMMdd"): Int
+ dayOfWeek(date: String, format: String = defaultFormat): Int
- assert(DateHelper.dayOfWeek("20160614") == 2)
assert(DateHelper.dayOfWeek("20160614") == 2)
def
- daysBetween(firstDate: String, lastDate: String, format: String = "yyyyMMdd"): List[String]
+ daysBetween(firstDate: String, lastDate: String, format: String = defaultFormat): List[String]
- assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
assert(!DateHelper.isDateCompliantWithFormat("170228", "yyyyMMdd"))
assert(!DateHelper.isDateCompliantWithFormat("", "yyyyMMdd"))
assert(!DateHelper.isDateCompliantWithFormat("a", "yyyyMMdd"))
-assert(!DateHelper.isDateCompliantWithFormat("24JAN17", "yyyyMMdd"))
returned list of dates and thus prefer getting a list of Joda DateTime
objects instead of String dates.
the joda DateTime first date
the joda DateTime last date
the list of joda DateTime between jodaFirstDate and jodaLastDate
Returns which date it will be x days after today under the default format.
Returns which date it will be x days after today under the default format.
// If today's "20170310": +assert(DateHelper.nDaysAfter(5) == "20170315")
the nbr of days after today
today's date plus the nbrOfDaysAfter under the default format
Returns which date it will be x days after today under the requested format.
Returns which date it will be x days after today under the requested format.
// If today's "20170310": +assert(DateHelper.nDaysAfter(5, "yyMMdd") == "170315")
the nbr of days after today
the format for the returned date
today's date plus the nbrOfDaysAfter under the requested format
Returns which date it will be x days after the given date under the +default format.
Returns which date it will be x days after the given date under the +default format.
If the given date is "20170122" and we request the date it will be 3 days +after, we'll return "20170125".
assert(DateHelper.nDaysAfterDate(5, "20170305") == "20170310")
the nbr of days after the given date
the date under the default format for which we want the date +for nbrOfDaysAfter days after.
the date it was nbrOfDaysAfter after date under the default +format.
Returns which date it will be x days after the given date.
Returns which date it will be x days after the given date.
If the given date is "20170122" and we request the date it will be 3 days -after, we'll return "20170125".
assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310") -assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") == "170310")
the nbr of days after the given date
the date under the provided format for which we want the date -for nbrOfDaysAfter days after.
(default = "yyyyMMdd") the format for the provided and -returned dates.
the date it was nbrOfDaysAfter after date under the requested +after, we'll return "20170125".
assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") == "170310")
the nbr of days after the given date
the date under the provided format for which we want the date +for nbrOfDaysAfter days after.
the format for the provided and returned dates.
the date it was nbrOfDaysAfter after date under the requested format.
Returns which date it was x days before today.
Returns which date it was x days before today.
// If today's "20170310": +assert(DateHelper.nDaysBefore(5) == "20170305")
the nbr of days before today
today's date minus the nbrOfDaysBefore under the default format
Returns which date it was x days before today under the requested format.
Returns which date it was x days before today under the requested format.
If we're "20170125" and we request for 3 days before, we'll return -"20170122".
// If today's "20170310": -assert(DateHelper.nDaysBefore(3) == "20170307") -assert(DateHelper.nDaysBefore(5, "yyMMdd") == "170305")
the nbr of days before today
(default = "yyyyMMdd") the format for the returned date
today's date minus the nbrOfDaysBefore under the requested format
Returns which date it was x days before today under the requested format.
Returns which date it was x days before today under the requested format.
// If today's "20170310": +assert(DateHelper.nDaysBefore(5, "yyMMdd") == "170305")
the nbr of days before today
the format for the returned date
today's date minus the nbrOfDaysBefore under the requested format
Returns which date it was x days before the given date.
Returns which date it was x days before the given date.
If the given date is "20170125" and we request the date it was 3 days +before, this will return "20170122".
assert(DateHelper.nDaysBeforeDate(5, "20170310") == "20170305")
the nbr of days before the given date
the date under the default format for which we want the date +for nbrOfDaysBefore days before.
the date it was nbrOfDaysBefore before date under the default +format.
Returns which date it was x days before the given date.
Returns which date it was x days before the given date.
If the given date is "20170125" and we request the date it was 3 days -before, we'll return "20170122".
assert(DateHelper.nDaysBeforeDate(3, "20170310") == "20170307") -assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") == "170305")
the nbr of days before the given date
the date under the provided format for which we want the date -for nbrOfDaysBefore days before.
(default = "yyyyMMdd") the format for the provided and -returned dates.
the date it was nbrOfDaysBefore before date under the requested +before, this will return "20170122".
assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") == "170305")
the nbr of days before the given date
the date under the provided format for which we want the date +for nbrOfDaysBefore days before.
the format for the provided and returned dates.
the date it was nbrOfDaysBefore before date under the requested format.
This expects the first date to be before the last date.
the first date of the range for which to get the nbr of days.
the last date of the range for which to get the nbr of -days.
(default = "yyyyMMdd") the format of the provided dates
the nbr of days between the two given dates
the format of the provided dates
the nbr of days between the two given dates
Returns the nbr of days between today and the given date.
Returns the nbr of days between today and the given date.
// If today is "20170327": assert(DateHelper.nbrOfDaysSince("20170310") == 17) -assert(DateHelper.nbrOfDaysSince("170310", "yyMMdd") == 17)
the date for which to find the nbr of days of diff with today
(default = "yyyyMMdd") the format of the provided date
the nbr of days between today and the given date
the date for which to find the nbr of days of diff with today
the format of the provided date
the nbr of days between today and the given date
Returns for a date the date one day later.
Returns for a date the date one day later.
// If the given date is "20170310": -assert(DateHelper.nextDay("20170310") == "20170311") -assert(DateHelper.nextDay("170310", "yyMMdd") == "170311")
the date for which to find the date of the day after
(default = "yyyyMMdd") the format of the provided and the -returned dates.
the date of the day after the given date
Returns for a date the date one day later.
Returns for a date the date one day later.
assert(DateHelper.nextDay("20170310") == "20170311") +assert(DateHelper.nextDay("170310", "yyMMdd") == "170311")
the date for which to find the date of the day after
the format of the provided and the returned dates
the date of the day after the given date
Returns for a date the date one day before.
Returns for a date the date one day before.
// If the given date is "20170310": -assert(DateHelper.previousDay("20170310") == "20170309") -assert(DateHelper.previousDay("170310", "yyMMdd") == "170309")
the date for which to find the date of the day before
(default = "yyyyMMdd") the format of the provided and the -returned dates.
the date of the day before the given date
Returns for a date the date one day before.
Returns for a date the date one day before.
assert(DateHelper.previousDay("20170310") == "20170309") +assert(DateHelper.previousDay("170310", "yyMMdd") == "170309")
the date for which to find the date of the day before
the format of the provided and the returned dates
the date of the day before the given date
Reformats a date from one format to another.
Reformats a date from one format to another.
assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327")
the date to reformat
the format in which the date to reformat is provided
the format in which to format the provided date
the date under the new format
Sets the default date format used by these functions when no date format +is specified.
Sets the default date format used by these functions when no date format +is specified.
// By default, yyyyMMdd is used: +assert(3.daysBefore == "20170307") +// But this can be modified globally: +DateHelper.setFormat("ddMMMyy") +assert(3.daysBefore == "07Mar17")
the new default format
Returns today's date/time under the default format.
Returns today's date/time under the default format.
// If today's "20170310": +assert(DateHelper.today() == "20170310")
today's date under the default format
Returns today's date/time under the requested format.
Returns today's date/time under the requested format.
// If today's "20170310": -assert(DateHelper.today() == "20170310") -assert(DateHelper.today("yyMMdd") == "170310")
(default = "yyyyMMdd") the format for the current date
today's date under the requested format
the format for the current date
today's date under the requested format
Returns which date it was 2 days before today under the default format.
Returns which date it was 2 days before today under the default format.
// If today's "20170310": +assert(DateHelper.twoDaysAgo() == "20170308")
the date of two days ago under the default format
Returns which date it was 2 days before today under the requested format.
Returns which date it was 2 days before today under the requested format.
// If today's "20170310": -assert(DateHelper.twoDaysAgo() == "20170308") -assert(DateHelper.twoDaysAgo("yyMMdd") == "170308")
(default = "yyyyMMdd") the format in which to output the -date of two days ago.
the date of two days ago under the requested format
the format in which to output the date of two days ago
the date of two days ago under the requested format
Returns yesterday's date/time under the default format.
Returns yesterday's date/time under the default format.
// If today's "20170310": +assert(DateHelper.yesterday() == "20170309")
yesterday's date under the default format
Returns yesterday's date/time under the requested format.
Returns yesterday's date/time under the requested format.
// If today's "20170310": -assert(DateHelper.yesterday() == "20170309") -assert(DateHelper.yesterday("yyMMdd") == "170309")
(default = "yyyyMMdd") the format in which to output the -date of yesterday.
yesterday's date under the requested format
the format in which to output the date of yesterday
yesterday's date under the requested format
Saves list elements in a file on hdfs.
Saves list elements in a file on hdfs.
Please only consider this way of storing data when the data set is small +enough.
Overwrites the file if it already exists.
Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt") +List("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
the path of the file in which to write the content of +the List.
Saves the String in a file on hdfs.
Saves the String in a file on hdfs.
Overwrites the file if it already exists.
"some\nrelatively small\ntext".writeToHdfsFile("/some/hdfs/file/path.txt")
the path of the file in which to write the String
For instance, one doesn't want to remove a file from hdfs using 3 lines of code and thus could instead just use -HdfsHelper.deleteFile("my/hdfs/file/path.csv").
A few exemples:
import com.spark_helper.HdfsHelper +HdfsHelper.deleteFile("my/hdfs/file/path.csv").A few examples:
import com.spark_helper.HdfsHelper // A bunch of methods wrapping the FileSystem API, such as: -HdfsHelper.fileExists("my/hdfs/file/path.txt") -assert(HdfsHelper.listFileNamesInFolder("my/folder/path") == List("file_name_1.txt", "file_name_2.csv")) -assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306") -assert(HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") == 3) -HdfsHelper.deleteFile("my/hdfs/file/path.csv") -HdfsHelper.moveFolder("my/hdfs/folder") +HdfsHelper.fileExists("my/hdfs/file/path.txt") // HdfsHelper.folderExists("my/hdfs/folder") +HdfsHelper.listFileNamesInFolder("my/folder/path") // List("file_name_1.txt", "file_name_2.csv") +HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") // "20170306" +HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") // 3 +HdfsHelper.deleteFile("my/hdfs/file/path.csv") // HdfsHelper.deleteFolder("my/hdfs/folder") +HdfsHelper.moveFolder("old/path", "new/path") // HdfsHelper.moveFile("old/path.txt", "new/path.txt") +HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token") // HdfsHelper.createFolder("my/hdfs/folder") + +// File content helpers: HdfsHelper.compressFile("hdfs/path/to/uncompressed_file.txt", classOf[GzipCodec]) HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1") // Some Xml/Typesafe helpers for hadoop as well: -HdfsHelper.isHdfsXmlCompliantWithXsd( - "my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd")) +HdfsHelper.isHdfsXmlCompliantWithXsd("my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd")) HdfsHelper.loadXmlFileFromHdfs("my/hdfs/file/path.xml") -// Very handy to load a config (typesafe format) stored on hdfs at the -// begining of a spark job: +// Very handy to load a config (typesafe format) stored on hdfs at the beginning of a spark job: HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config -// In order to write small amount of data in a file on hdfs without the -// whole spark stack: 
-HdfsHelper.writeToHdfsFile( - Array("some", "relatively small", "text"), - "/some/hdfs/file/path.txt") +// In order to write small amount of data in a file on hdfs without the whole spark stack: +HdfsHelper.writeToHdfsFile(Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt") +// or: +import com.spark_helper.HdfsHelper._ +Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt") +"hello world".writeToHdfs("/some/hdfs/file/path.txt") -// Deletes all files/folders in "hdfs/path/to/folder" for which the -// timestamp is older than 10 days: +// Deletes all files/folders in "hdfs/path/to/folder" for which the timestamp is older than 10 days: HdfsHelper.purgeFolder("hdfs/path/to/folder", 10)Source HdfsHelper -
2017-02
2017-02
Create a touch method
Appends a footer to a file.
Appends a footer to a file.
If the workingFolderPath parameter is provided, then the processing is done in a working/tmp folder and then only, the final file is moved to its final real location. This way, in case of cluster instability, i.e. in -case the Spark job is interupted, this avoids having a temporary or +case the Spark job is interrupted, this avoids having a temporary or corrupted file in output.
the path of the file for which to add the footer
the footer to add
the path where file manipulations will happen
Appends a header to a file.
Appends a header to a file.
Usefull when creating a csv file with spark and you need to add a header +
Appends a header to a file.
Appends a header to a file.
Useful when creating a csv file with spark and you need to add a header describing the different fields.
If the workingFolderPath parameter is provided, then the processing is done in a working/tmp folder and then only, the final file is moved to its final real location. This way, in case of cluster instability, i.e. in -case the Spark job is interupted, this avoids having a temporary or +case the Spark job is interrupted, this avoids having a temporary or corrupted file in output.
the path of the file for which to add the header
the header to add
the path where file manipulations will happen
Appends a header and a footer to a file.
Appends a header and a footer to a file.
Usefull when creating an xml file with spark and you need to add top level +
Appends a header and a footer to a file.
Appends a header and a footer to a file.
Useful when creating an xml file with spark and you need to add top level tags.
If the workingFolderPath parameter is provided, then the processing is done in a working/tmp folder and then only, the final file is moved to its final real location. This way, in case of cluster instability, i.e. in -case the Spark job is interupted, this avoids having a temporary or +case the Spark job is interrupted, this avoids having a temporary or corrupted file in output.
the path of the file for which to add the header and the footer.
the header to add
the footer to add
the path where file manipulations will happen
Creates an empty file on hdfs.
Creates an empty file on hdfs.
Might be usefull for token files. For instance a file which is only used -as a timestamp token of the last update of a processus, or a file which +
Creates an empty file on hdfs.
Creates an empty file on hdfs.
Might be useful for token files. For instance a file which is only used +as a timestamp token of the last update of a process, or a file which blocks the execution of another instance of the same job, ...
Overwrites the file if it already exists.
HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token")
In case this is used as a timestamp container, you can then use the following methods to retrieve its timestamp:
val fileAge = HdfsHelper.nbrOfDaysSinceFileWasLastModified("/some/hdfs/file/path.token") val lastModificationDate = HdfsHelper.fileModificationDate("/some/hdfs/file/path.token")
the path of the empty file to create
Returns the stringified date of the last modification of the given file.
Returns the stringified date of the last modification of the given file.
assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
the path of the file for which to get the last +
Returns the formatted date of the last modification of the given file.
Returns the formatted date of the last modification of the given file.
assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
the path of the file for which to get the last modification date.
(default = "yyyyMMdd") the format under which to get the -modification date.
the stringified date of the last modification of the given file, +modification date.
the formatted date of the last modification of the given file, under the provided format.
Returns the stringified date of the last modification of the given folder.
Returns the stringified date of the last modification of the given folder.
assert(HdfsHelper.folderModificationDate("my/hdfs/folder") == "20170306")
the path of the folder for which to get the last +
Returns the formatted date of the last modification of the given folder.
Returns the formatted date of the last modification of the given folder.
assert(HdfsHelper.folderModificationDate("my/hdfs/folder") == "20170306")
the path of the folder for which to get the last modification date.
(default = "yyyyMMdd") the format under which to get the -modification date.
the stringified date of the last modification of the given folder, +modification date.
the formatted date of the last modification of the given folder, under the provided format.
Loads a typesafe config from Hdfs.
Loads a typesafe config from Hdfs.
The best way to load the configuration of your job from hdfs.
Typesafe is a config format which looks like this:
config { +Loads a Typesafe config from Hdfs.
+}Loads a Typesafe config from Hdfs.
The best way to load the configuration of your job from hdfs.
Typesafe is a config format which looks like this:
config { airlines = [ { code = QF @@ -702,8 +740,8 @@} } ] -}
- hdfsConfigPath
the absolute path of the typesafe config file on -hdfs we want to load as a typesafe Config object.
- returns
the com.typesafe.config.Config object which contains usable data
the absolute path of the Typesafe config file on +hdfs we want to load as a Typesafe Config object.
the com.typesafe.config.Config object which contains usable data
the path of the folder on hdfs to purge
the threshold (in nbr of days) above which a file is considered too old and thus deleted/purged.
Sets a specific Configuration
+used by the underlying FileSystem
+in case it requires some specificities.
Sets a specific Configuration
+used by the underlying FileSystem
+in case it requires some specificities.
If this setter is not used, the default Configuration is set with
+new Configuration()
.
+
the specific Configuration to use
Sets a specific FileSystem
+in case it requires some specificities.
Sets a specific FileSystem
+in case it requires some specificities.
If this setter is not used, the default FileSystem is set with
+FileSystem.get(new Configuration())
.
+
the specific FileSystem to use
Overwrites the file if it already exists.
HdfsHelper.writeToHdfsFile( Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt") HdfsHelper.writeToHdfsFile( - List("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
the array of strings to write in the file as one line per + List("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
the seq of strings to write in the file as one line per string (this takes care of joining strings with "\n"s).
the path of the file in which to write the content
A logger dedicated to Spak jobs.
It's a simple logger/report which contains a report that one can update from +
A logger dedicated to Spark jobs.
It's a simple logger/report which contains a report that one can update from the driver and a success state. The idea is to persist job executions logs -and errors (and forget about grepping unreadable yarn logs).
It's designed for perdiodic spark jobs (handles storage and purge of logs) +and errors (and forget about grepping unreadable yarn logs).
It's designed for periodic spark jobs (handles storage and purge of logs) and provides a way to handle kpis validation.
Logs are stored on the go which means one can have a direct real time access -of the job logs/status and it's current state (which can overwise be a pain +of the job logs/status and its current state (which can otherwise be a pain if it means going through yarn logs, or even for certain production environments going through additional layers of software logs to get to yarn logs).
One of the issues this logger aims at tackling is the handling of exceptions @@ -62,9 +62,9 @@
This is a "driver-only" logger and is not intended at logging concurrent -actions from executors.
Produced reports can easily be inserted in a notification email whenerver +actions from executors.
Produced reports can easily be inserted in a notification email whenever the job fails, which saves a lot of time to maintainers operating on heavy -production environements.
The produced persisted report is also a way for downstream jobs to know the +production environments.
The produced persisted report is also a way for downstream jobs to know the status of their input data.
Let's go through a simple Spark job example monitored with this Monitor facility:
Monitor.setTitle("My job title") Monitor.addDescription( @@ -83,7 +83,7 @@Test("Nbr of output records", processedData.count(), SUPERIOR_THAN, 10e6d, NBR), Test("Some pct of invalid output", your_complex_kpi, INFERIOR_THAN, 3, PCT) ), - "My pipeline descirption" + "My pipeline description" ) if (outputIsValid) @@ -91,9 +91,9 @@
} catch { case iie: InvalidInputException => - Monitor.error(iie, "My pipeline descirption", diagnostic = "No input data!") + Monitor.error(iie, "My pipeline description", diagnostic = "No input data!") case e: Throwable => - Monitor.error(e, "My pipeline descirption") // whatever unexpected error + Monitor.error(e, "My pipeline description") // whatever unexpected error } if (Monitor.isSuccess()) { @@ -105,7 +105,7 @@
// HDFS (this saves the logs in the folder set with Monitor.setLogFolder): Monitor.store() -// At the end of the job, if the job isn't successfull, you might want to +// At the end of the job, if the job isn't successful, you might want to // crash it (for instance to get a notification from your scheduler): if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ...
At any time during the job, logs can be accessed from file path/to/log/folder/current.ongoing
If we were to read the stored report after this simple pipeline, here are @@ -113,8 +113,8 @@
Another scenario, successfull spark pipeline and KPIs are valid; all good!:
My job title +[10:36] Duration: 00:13:47
Another scenario, successful spark pipeline and KPIs are valid; all good!:
My job title My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining -[10:23-10:41] My pipeline descirption: success +[10:23] Beginning +[10:23-10:41] My pipeline description: success KPI: Nbr of output records Value: 14669071.0 Must be superior than 10000000.0 @@ -147,7 +147,7 @@[10:41-10:42] My second pipeline description: success [10:42] Duration: 00:19:23
Source Monitor -
2017-02
2017-02
would a State monad be appropriate?
Sets the report's contact list.
Sets the report's contact list.
This will appear within the first lines of the report:
// Using: Monitor.setReportTitle("My Simple Job") Monitor.addContacts(List("x.guihot@gmail.com", "smbdy@gmail.com")) -// Produces this at the begining of the report: +// Produces this at the beginning of the report: " My Simple Job" "" "Point of contact: x.guihot@gmail.com, smbdy@gmail.com"
the list of points of contact
Sets the report's description.
Sets the report's description.
This will appear within the first lines of the report:
// Using: Monitor.setReportTitle("My Simple Job") Monitor.addDescription("Documentation: https://github.com/xavierguihot/spark_helper") -// Produces this at the begining of the report: +// Produces this at the beginning of the report: " My Simple Job" "" "Documentation: https://github.com/xavierguihot/spark_helper"
the description of the Spark job (or whatever)
Catching an error like this:
monitor.error( invalidInputException, - "My pipeline descirption", - diagnostic = "No input data!")
will result in this to be appended to the report:
[10:23-10:24] My pipeline descirption: failed + "My pipeline description", + diagnostic = "No input data!")
will result in this to be appended to the report:
[10:23-10:24] My pipeline description: failed Diagnostic: No input data! org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285) @@ -410,7 +410,7 @@
Updates the report with some text and a failure.
Updates the report with some text and a failure.
This sets the status of the monitoring to false. After that the status -will never be success again, even if you update the report with success().
Using this method like this:
monitor.error("Some text")
will result in this to be appended to the report:
"[10:35-10:37] Some text: failure\n"
Once the monitoring is a failure, then whatever following successfull +will never be success again, even if you update the report with success().
Using this method like this:
monitor.error("Some text")
will result in this to be appended to the report:
"[10:35-10:37] Some text: failure\n"
Once the monitoring is a failure, then whatever following successful action won't change the failed status of the monitoring.
the text to append to the report
false since it's a failure
Sets the report's title.
Sets the report's title.
This will be the first line of the report:
// Using: Monitor.setReportTitle("My Simple Job") -// Produces this at the begining of the report: +// Produces this at the beginning of the report: " My Simple Job" ""
the title of the report
Flattens an RDD of Option[T]
+to RDD[T]
.
Flattens an RDD of Option[T]
+to RDD[T]
.
sc.parallelize(Array(Some(1), None, Some(2))).flatten == sc.parallelize(Array(1, 2))
the flat RDD as RDD.flatMap(x => x)
+or List.flatten
+would have.
Saves and repartitions a key/value RDD on files whose name is the key.
Saves and repartitions a key/value RDD on files whose name is the key.
Within the provided path, there will be one file per key in the given +keyValueRDD. And within a file for a given key are only stored values +for this key.
This is not scalable. This shouldn't be considered for any data flow +with normal or big volumes.
rdd.saveAsTextFileByKey("/my/output/folder/path", 12, classOf[BZip2Codec])
the folder where key files will be stored
the nbr of expected keys (which is the nbr of output +files)
the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))
Saves and repartitions a key/value RDD on files whose name is the key.
Saves and repartitions a key/value RDD on files whose name is the key.
Within the provided path, there will be one file per key in the given +keyValueRDD. And within a file for a given key are only stored values +for this key.
As this internally needs to know the nbr of keys, this will have to
+compute it. If this nbr of keys is known beforehand, it would spare
+resources to use
+saveAsTextFileByKey(path: String, keyNbr: Int, codec: Class[_ <: CompressionCodec])
+instead.
This is not scalable. This shouldn't be considered for any data flow +with normal or big volumes.
rdd.saveAsTextFileByKey("/my/output/folder/path", classOf[BZip2Codec])
the folder where key files will be stored
the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))
Saves and repartitions a key/value RDD on files whose name is the key.
Saves and repartitions a key/value RDD on files whose name is the key.
Within the provided path, there will be one file per key in the given +keyValueRDD. And within a file for a given key are only stored values +for this key.
This is not scalable. This shouldn't be considered for any data flow +with normal or big volumes.
rdd.saveAsTextFileByKey("/my/output/folder/path", 12)
the folder where key files will be stored
the nbr of expected keys (which is the nbr of output +files)
Saves and repartitions a key/value RDD on files whose name is the key.
Saves and repartitions a key/value RDD on files whose name is the key.
Within the provided path, there will be one file per key in the given +keyValueRDD. And within a file for a given key are only stored values +for this key.
As this internally needs to know the nbr of keys, this will have to
+compute it. If this nbr of keys is known beforehand, it would spare
+resources to use saveAsTextFileByKey(path: String, keyNbr: Int)
+instead.
This is not scalable. This shouldn't be considered for any data flow +with normal or big volumes.
rdd.saveAsTextFileByKey("/my/output/folder/path")
the folder where key files will be stored
Map an RDD to the same type, by applying a partial function and the +identity otherwise.
Map an RDD to the same type, by applying a partial function and the +identity otherwise.
Avoids having case x => x
.
Similar idea to .collect
,
+but instead of skipping non-matching items, it keeps them as-is.
sc.parallelize(Array(1, 3, 2, 7, 8)).partialMap { case a if a % 2 == 0 => 2 * a } +// is equivalent to: +sc.parallelize(Array(1, 3, 2, 7, 8)).map { + case a if a % 2 == 0 => 2 * a + case a => a +} +// in order to map to: +sc.parallelize(Array(1, 3, 4, 7, 16))
the partial function to apply
an rdd of the same type, for which each element is either the +application of the partial function where defined or the identity.
Flattens an RDD of Seq[T]
+to RDD[T]
.
Flattens an RDD of Seq[T]
+to RDD[T]
.
sc.parallelize(Array(Seq(1, 2, 3), Nil, Seq(4))).flatten == sc.parallelize(Array(1, 2, 3, 4))
the flat RDD as RDD.flatMap(identity)
+or List.flatten
+would have.
Decreases the nbr of partitions of a folder.
Decreases the nbr of partitions of a folder.
This comes in handy when the last step of your job needs to run on +thousands of files, but you want to store your final output on let's say +only 30 files.
It's like a FileUtil.copyMerge()
+, but the merging produces more than one file.
Be aware that this method deletes the provided input folder.
sc.decreaseCoalescence( + "/folder/path/with/2000/files", + "/produced/folder/path/with/only/30/files", + 30, + classOf[BZip2Codec] +)
the folder which contains 10000 files
the folder which will contain the same +data as highCoalescenceLevelFolder but spread on only 30 files (where 30 +is the finalCoalesceLevel parameter).
the nbr of files within the folder at the end +of this method.
the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))
Decreases the nbr of partitions of a folder.
Decreases the nbr of partitions of a folder.
This comes in handy when the last step of your job needs to run on +thousands of files, but you want to store your final output on let's say +only 30 files.
It's like a FileUtil.copyMerge()
+, but the merging produces more than one file.
Be aware that this method deletes the provided input folder.
sc.decreaseCoalescence( + "/folder/path/with/2000/files", + "/produced/folder/path/with/only/30/files", + 30 +)
the folder which contains 10000 files
the folder which will contain the same +data as highCoalescenceLevelFolder but spread on only 30 files (where 30 +is the finalCoalesceLevel parameter).
the nbr of files within the folder at the end +of this method.
A replacement for sc.textFile()
+when files contain commas in their name.
A replacement for sc.textFile()
+when files contain commas in their name.
As sc.textFile()
+allows to provide several files at once by giving them as a string which
+is a list of strings joined with ,
,
+we can't give it files containing commas in their name.
This method aims at bypassing this limitation by passing paths as a +sequence of strings.
sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))
the paths of the file(s)/folder(s) to read
the nbr of partitions in which to split the input
A replacement for sc.textFile()
+when files contain commas in their name.
A replacement for sc.textFile()
+when files contain commas in their name.
As sc.textFile()
+allows to provide several files at once by giving them as a string which
+is a list of strings joined with ,
,
+we can't give it files containing commas in their name.
This method aims at bypassing this limitation by passing paths as a +sequence of strings.
sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))
the paths of the file(s)/folder(s) to read
Equivalent to sparkContext.textFile()
+, but for a specific record delimiter.
Equivalent to sparkContext.textFile()
+, but for a specific record delimiter.
By default, sparkContext.textFile()
+will provide one record per line (per '\n'
).
+But what if the format to read considers that one record is stored in
+more than one line (yml, custom format, ...)?
For instance in order to read a yml file, which is a format for which a
+record (a single entity) is spread over several lines, you can modify
+the record delimiter with "---\n"
+instead of "\n"
.
+Same goes when reading an xml file where a record might be spread over
+several lines or worse the whole xml file is one line.
// Let's say data we want to use with Spark looks like this (one record +// is a customer, but it's spread over several lines): +<Customers>\n +<Customer>\n +<Address>34 thingy street, someplace, sometown</Address>\n +</Customer>\n +<Customer>\n +<Address>12 thingy street, someplace, sometown</Address>\n +</Customer>\n +</Customers> +//Then you can use it this way: +val computedRecords = sc.textFile("my/path/to/customers.xml", "\n" ) +val expectedRecords = RDD( + <Customers>\n, + ( + <Address>34 thingy street, someplace, sometown</Address>\n + + </Customer>\n + ), + ( + <Address>12 thingy street, someplace, sometown</Address>\n + + </Customer>\n + + </Customers> + ) +) +assert(computedRecords == expectedRecords)
the path of the file to read (folder or file, '*' works +as well).
the specific record delimiter which replaces "\n"
the max length (not sure which unit) of a record +before considering the record too long to fit into memory.
the RDD of records
Equivalent to sparkContext.textFile()
+, but each record is associated with the file path it comes from.
Equivalent to sparkContext.textFile()
+, but each record is associated with the file path it comes from.
Produces an RDD[(file_name, line)]
+which provides a way to know which file a given line comes from.
// Considering this folder: +// folder/file_1.txt whose content is data1\ndata2\ndata3 +// folder/file_2.txt whose content is data4\ndata5 +// folder/folder_1/file_3.txt whose content is data6\ndata7 +// then: +sc.textFileWithFileName("folder") +// will return: +RDD( + ("file:/path/on/machine/folder/file_1.txt", "data1"), + ("file:/path/on/machine/folder/file_1.txt", "data2"), + ("file:/path/on/machine/folder/file_1.txt", "data3"), + ("file:/path/on/machine/folder/file_2.txt", "data4"), + ("file:/path/on/machine/folder/file_2.txt", "data5"), + ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"), + ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7") +)
the path of the folder (or structure of folders) to read
the RDD of records where a record is a tuple containing the path +of the file the record comes from and the record itself.
Saves an RDD in exactly one file.
Saves an RDD in exactly one file.
Allows one to save an RDD in one file, while keeping the processing +distributed.
This variant of saveAsSingleTextFile
+performs the storage in a temporary folder instead of directly in the
+final output folder. This way the risk of having corrupted files in the
+real output folder due to cluster interruptions is minimized.
rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path", classOf[BZip2Codec])
the path of the produced file
the path where file manipulations will temporarily +happen.
the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))
Saves an RDD in exactly one file.
Saves an RDD in exactly one file.
Allows one to save an RDD in one file, while keeping the processing +distributed.
This variant of saveAsSingleTextFile
+performs the storage in a temporary folder instead of directly in the
+final output folder. This way the risk of having corrupted files in the
+real output folder due to cluster interruptions is minimized.
rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path")
the path of the produced file
the path where file manipulations will temporarily +happen.
Saves an RDD in exactly one file.
Saves an RDD in exactly one file.
Allows one to save an RDD in one file, while keeping the processing +distributed.
rdd.saveAsSingleTextFile("/my/file/path.txt", classOf[BZip2Codec])
the path of the produced file
the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))
Saves an RDD in exactly one file.
Saves an RDD in exactly one file.
Allows one to save an RDD in one file, while keeping the processing +distributed.
rdd.saveAsSingleTextFile("/my/file/path.txt")
the path of the produced file
Saves as text file, and decreases the nbr of output partitions.
Saves as text file, and decreases the nbr of output partitions.
Same as rdd.saveAsTextFile()
+, but decreases the nbr of partitions in the output folder before doing
+so.
The result is equivalent to rdd.coalesce(x).saveAsTextFile()
+, but if x
+is very low, coalesce
+would make the processing time explode, whereas this method keeps the
+processing distributed, saves as text file and only then merges the
+result in a lower nbr of partitions.
rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30, classOf[BZip2Codec])
the folder where will finally be stored the RDD but spread +on only 30 files (where 30 is the value of the finalCoalesceLevel +parameter).
the nbr of files within the folder at the end +of this method.
the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))
Saves as text file, but by decreasing the nbr of partitions of the output.
Saves as text file, but by decreasing the nbr of partitions of the output.
Same as rdd.saveAsTextFile()
+, but decreases the nbr of partitions in the output folder before doing
+so.
The result is equivalent to rdd.coalesce(x).saveAsTextFile()
+, but if x
+is very low, coalesce
+would make the processing time explode, whereas this method keeps the
+processing distributed, saves as text file and only then merges the
+result in a lower nbr of partitions.
rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30)
the folder where will finally be stored the RDD but spread +on only 30 files (where 30 is the value of the finalCoalesceLevel +parameter).
the nbr of files within the folder at the end +of this method.
A facility to deal with RDD/file manipulations based on the Spark API.
The goal is to remove the maximum of highly used low-level code from your spark job and replace it with methods fully tested whose name is -self-explanatory/readable.
A few exemples:
// Same as SparkContext.saveAsTextFile, but the result is a single file: -SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt") -// Same as SparkContext.textFile, but instead of reading one record per -// line, it reads records spread over several lines. -// This way, xml, json, yml or any multi-line record file format can be used -// with Spark: -SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n") -// Same as SparkContext.textFile, but instead of returning an RDD of -// records, it returns an RDD of tuples containing both the record and the -// path of the file it comes from: -SparkHelper.textFileWithFileName("folder", sparkContext)
Source import com.spark_helper.SparkHelper._
+
+// Same as rdd.saveAsTextFile("path"), but the result is a single file (while
+// keeping the processing distributed):
+rdd.saveAsSingleTextFile("/my/output/file/path.txt")
+rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec])
+
+// Same as sc.textFile("path"), but instead of reading one record per line (by
+// splitting the input with \n), it splits the file in records based on a custom
+// delimiter. This way, xml, json, yml or any multi-line record file format can
+// be used with Spark:
+sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance
+
+// Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options:
+rdd.flatten
+
+// Equivalent to sc.textFile(), but each line is tupled with its file path:
+sc.textFileWithFileName("/my/input/folder/path")
+// which produces:
+// RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"),
+// ("folder/file_2.txt", "record1fromfile2"), ...)
+
+// In the given folder, this generates one file per key in the given key/value
+// RDD. Within each file (named from the key) are all values for this key:
+rdd.saveAsTextFileByKey("/my/output/folder/path")
+
+// Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)):
+rdd.partialMap { case a if a % 2 == 0 => 2 * a }
+
+// For when input files contain commas and textFile can't handle it:
+sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) Source SparkHelper
-
2017-02
2017-02
sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T])
Decreases the nbr of partitions of a folder.
Decreases the nbr of partitions of a folder.
This is often handy when the last step of your job needs to run on -thousands of files, but you want to store your final output on let's say -only 300 files.
It's like a FileUtil.copyMerge, but the merging produces more than one -file.
Be aware that this methods deletes the provided input folder.
SparkHelper.decreaseCoalescence( - "/folder/path/with/2000/files", - "/produced/folder/path/with/only/300/files", - 300, - sparkContext, - classOf[BZip2Codec])
the folder which contains 10000 files
the folder which will contain the same -data as highCoalescenceLevelFolder but spread on only 300 files (where 300 -is the finalCoalescenceLevel parameter).
the nbr of files within the folder at the end -of this method.
the SparkContext
the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))
Decreases the nbr of partitions of a folder.
Decreases the nbr of partitions of a folder.
This is often handy when the last step of your job needs to run on -thousands of files, but you want to store your final output on let's say -only 300 files.
It's like a FileUtil.copyMerge, but the merging produces more than one -file.
Be aware that this methods deletes the provided input folder.
SparkHelper.decreaseCoalescence( - "/folder/path/with/2000/files", - "/produced/folder/path/with/only/300/files", - 300, - sparkContext)
the folder which contains 10000 files
the folder which will contain the same -data as highCoalescenceLevelFolder but spread on only 300 files (where 300 -is the finalCoalescenceLevel parameter).
the nbr of files within the folder at the end -of this method.
the SparkContext
Saves an RDD in exactly one file.
Saves an RDD in exactly one file.
Allows one to save an RDD in one file, while keeping the processing -parallelized.
This variant of saveAsSingleTextFile performs the storage in a temporary -folder instead of directly in the final output folder. This way the risks -of having corrupted files in the real output folder due to cluster -interruptions is minimized.
SparkHelper.saveAsSingleTextFile( - myRddToStore, - "/my/file/path.txt", - "/my/working/folder/path", - classOf[BZip2Codec])
the RDD of strings to store in one file
the path of the produced file
the path where file manipulations will temporarily -happen.
the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))
Saves an RDD in exactly one file.
Saves an RDD in exactly one file.
Allows one to save an RDD in one file, while keeping the processing -parallelized.
This variant of saveAsSingleTextFile performs the storage in a temporary -folder instead of directly in the final output folder. This way the -risks of having corrupted files in the real output folder due to cluster -interruptions is minimized.
SparkHelper.saveAsSingleTextFile( - myRddToStore, "/my/file/path.txt", "/my/working/folder/path")
the RDD of strings to store in one file
the path of the produced file
the path where file manipulations will temporarily -happen.
Saves an RDD in exactly one file.
Saves an RDD in exactly one file.
Allows one to save an RDD in one file, while keeping the processing -parallelized.
SparkHelper.saveAsSingleTextFile(
- myRddToStore, "/my/file/path.txt", classOf[BZip2Codec])
the RDD of strings to store in one file
the path of the produced file
the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))
Saves an RDD in exactly one file.
Saves an RDD in exactly one file.
Allows one to save an RDD in one file, while keeping the processing -parallelized.
SparkHelper.saveAsSingleTextFile(myRddToStore, "/my/file/path.txt")
the RDD of strings to store in one file
the path of the produced file
Saves as text file, but by decreasing the nbr of partitions of the output.
Saves as text file, but by decreasing the nbr of partitions of the output.
Same as decreaseCoalescence, but the storage of the RDD in an intermediate -folder is included.
This still makes the processing parallelized, but the output is coalesced.
SparkHelper.saveAsTextFileAndCoalesce( - myRddToStore, - "/produced/folder/path/with/only/300/files", - 300, - classOf[BZip2Codec])
the RDD to store, processed for instance on 10000 tasks -(which would thus be stored as 10000 files).
the folder where will finally be stored the RDD but -spread on only 300 files (where 300 is the value of the -finalCoalescenceLevel parameter).
the nbr of files within the folder at the end -of this method.
the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))
Saves as text file, but by decreasing the nbr of partitions of the output.
Saves as text file, but by decreasing the nbr of partitions of the output.
Same as decreaseCoalescence, but the storage of the RDD in an intermediate -folder is included.
This still makes the processing parallelized, but the output is coalesced.
SparkHelper.saveAsTextFileAndCoalesce( - myRddToStore, "/produced/folder/path/with/only/300/files", 300)
the RDD to store, processed for instance on 10000 tasks -(which would thus be stored as 10000 files).
the folder where will finally be stored the RDD but -spread on only 300 files (where 300 is the value of the -finalCoalescenceLevel parameter).
the nbr of files within the folder at the end -of this method.
Saves and repartitions a key/value RDD on files whose name is the key.
Saves and repartitions a key/value RDD on files whose name is the key.
Within the provided outputFolder, will be one file per key in your -keyValueRDD. And within a file for a given key are only values for this -key.
You need to know the nbr of keys beforehand (in general you use this to -split your dataset in subsets, or to output one file per client, so you -know how many keys you have). So you need to put as keyNbr the exact nbr -of keys you'll have.
This is not scalable. This shouldn't be considered for any data flow with -normal or big volumes.
SparkHelper.saveAsTextFileByKey( - myKeyValueRddToStore, "/my/output/folder/path", 12, classOf[BZip2Codec])
the key/value RDD
the foldder where will be storrred key files
the nbr of expected keys (which is the nbr of outputed files)
the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))
Saves and repartitions a key/value RDD on files whose name is the key.
Saves and repartitions a key/value RDD on files whose name is the key.
Within the provided outputFolder, will be one file per key in your -keyValueRDD. And within a file for a given key are only values for this -key.
You need to know the nbr of keys beforehand (in general you use this to -split your dataset in subsets, or to output one file per client, so you -know how many keys you have). So you need to put as keyNbr the exact nbr -of keys you'll have.
This is not scalable. This shouldn't be considered for any data flow with -normal or big volumes.
SparkHelper.saveAsTextFileByKey( - myKeyValueRddToStore, "/my/output/folder/path", 12)
the key/value RDD
the foldder where will be storrred key files
the nbr of expected keys (which is the nbr of outputed files)
Equivalent to sparkContext.textFile(), but for a specific record delimiter.
Equivalent to sparkContext.textFile(), but for a specific record delimiter.
By default, sparkContext.textFile() will provide one record per line. But -what if the format you want to read considers that one record (one entity) -is stored in more than one line (yml, xml, ...)?
For instance in order to read a yml file, which is a format for which a -record (a single entity) is spread other several lines, you can modify the -record delimiter with "---\n" instead of "\n". Same goes when reading an -xml file where a record might be spread over several lines or worse the -whole xml file is one line.
// Let's say data we want to use with Spark looks like this (one record is -// a customer, but it's spread over several lines): -<Customers>\n -<Customer>\n -<Address>34 thingy street, someplace, sometown</Address>\n -</Customer>\n -<Customer>\n -<Address>12 thingy street, someplace, sometown</Address>\n -</Customer>\n -</Customers> -//Then you can use it this way: -val computedRecords = SparkHelper.textFileWithDelimiter( - "my/path/to/customers.xml", sparkContext, <Customer>\n -).collect() -val expectedRecords = Array( - <Customers>\n, - ( - <Address>34 thingy street, someplace, sometown</Address>\n + - </Customer>\n - ), - ( - <Address>12 thingy street, someplace, sometown</Address>\n + - </Customer>\n + - </Customers> - ) -) -assert(computedRecords == expectedRecords)
the path of the file to read (folder or file, '*' works as -well).
the SparkContext
the specific record delimiter which replaces "\n"
the max length (not sure which unit) of a record -before considering the record too long to fit into memory.
the RDD of records
Equivalent to sparkContext.textFile(), but for each line is associated -with its file path.
Equivalent to sparkContext.textFile(), but for each line is associated -with its file path.
Produces a RDD[(file_name, line)] which provides a way to know from which -file a given line comes from.
// Considering this folder: -// folder/file_1.txt whose content is data1\ndata2\ndata3 -// folder/file_2.txt whose content is data4\ndata4 -// folder/folder_1/file_3.txt whose content is data6\ndata7 -// then: -SparkHelper.textFileWithFileName("folder", sparkContext) -// will return: -RDD( - ("file:/path/on/machine/folder/file_1.txt", "data1"), - ("file:/path/on/machine/folder/file_1.txt", "data2"), - ("file:/path/on/machine/folder/file_1.txt", "data3"), - ("file:/path/on/machine/folder/file_2.txt", "data4"), - ("file:/path/on/machine/folder/file_2.txt", "data5"), - ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"), - ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7") -)
the path of the folder (or structure of folders) to read
the SparkContext
the RDD of records where a record is a tuple containing the path -of the file the record comes from and the record itself.
A class which represents a KPI to validate.
This is intended to be used as parameter of Monitor.updateByKpiValidation -and Monitor.updateByKpisValidation methods.
Some exemples of Test objects:
Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT) +and Monitor.updateByKpisValidation methods.Some examples of Test objects:
Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT) Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT) Test("someNbr", 1235d, EQUAL_TO, 1235d, NBR)
the name/description of the KPI which will appear on the validation report.
the value for this KPI
the type of threshold (SUPERIOR_THAN, INFERIOR_THAN or @@ -110,7 +110,7 @@
Creates a Test object.
Creates a Test object. -Some exemples of Test objects:
Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT) +Some examples of Test objects:Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT) Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT) Test("someNbr", 1235d, EQUAL_TO, 1235d, NBR)
the name/description of the KPI which will appear on the validation report.
the value for this KPI
the type of threshold (SUPERIOR_THAN, INFERIOR_THAN or diff --git a/docs/com/spark_helper/monitoring/ThresholdType.html b/docs/com/spark_helper/monitoring/ThresholdType.html index b0c6eef..61447b9 100644 --- a/docs/com/spark_helper/monitoring/ThresholdType.html +++ b/docs/com/spark_helper/monitoring/ThresholdType.html @@ -49,7 +49,7 @@
An enumeration which represents the type of threshol to use (EQUAL_TO, +
An enumeration which represents the type of threshold to use (EQUAL_TO, SUPERIOR_THAN or INFERIOR_THAN)
A class which represents a KPI to validate.
A class which represents a KPI to validate.
This is intended to be used as parameter of Monitor.updateByKpiValidation -and Monitor.updateByKpisValidation methods.
Some exemples of Test objects:
Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT) +and Monitor.updateByKpisValidation methods.Some examples of Test objects:
Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT) Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT) Test("someNbr", 1235d, EQUAL_TO, 1235d, NBR)
the name/description of the KPI which will appear on the validation report.
the value for this KPI
the type of threshold (SUPERIOR_THAN, INFERIOR_THAN or @@ -121,7 +121,7 @@
An enumeration which represents the type of threshol to use (EQUAL_TO, +
An enumeration which represents the type of threshold to use (EQUAL_TO, SUPERIOR_THAN or INFERIOR_THAN)
A facility which deals with usual date needs (wrapper around joda-time).
The goal is to remove the maximum of highly used low-level code from your spark job and replace it with methods fully tested whose name is -self-explanatory/readable.
A few exemples:
assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101")) -assert(DateHelper.today() == "20170310") // If today's "20170310" -assert(DateHelper.yesterday() == "20170309") // If today's "20170310" -assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327") -assert(DateHelper.now("HH:mm") == "10:24") -assert(DateHelper.currentTimestamp() == "1493105229736") -assert(DateHelper.nDaysBefore(3) == "20170307") // If today's "20170310" -assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310")
Source import com.spark_helper.DateHelper
+
+DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101")
+DateHelper.today // "20170310"
+DateHelper.yesterday // "20170309"
+DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327"
+DateHelper.now("HH:mm") // "10:24"
+DateHelper.currentTimestamp // "1493105229736"
+DateHelper.nDaysBefore(3) // "20170307"
+DateHelper.nDaysAfterDate(3, "20170307") // "20170310"
+DateHelper.nextDay("20170310") // "20170311"
+DateHelper.nbrOfDaysSince("20170302") // 8
+DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5
+DateHelper.dayOfWeek("20160614") // 2
+
+import com.spark_helper.DateHelper._
+
+2.daysAgo // "20170308"
+"20161230" to "20170101" // List("20161230", "20161231", "20170101")
+3.daysBefore("20170310") // "20170307"
+5.daysAfter // "20170315"
+4.daysAfter("20170310") // "20170314"
+"20170302".isCompliantWith("yyyyMMdd")
+"20170310".nextDay // "20170311"
+"20170310".previousDay // "20170309" Source DateHelper
2017-02
For instance, one don't want to remove a file from hdfs using 3 lines of code and thus could instead just use -HdfsHelper.deleteFile("my/hdfs/file/path.csv").
A few exemples:
import com.spark_helper.HdfsHelper +HdfsHelper.deleteFile("my/hdfs/file/path.csv").A few examples:
import com.spark_helper.HdfsHelper // A bunch of methods wrapping the FileSystem API, such as: -HdfsHelper.fileExists("my/hdfs/file/path.txt") -assert(HdfsHelper.listFileNamesInFolder("my/folder/path") == List("file_name_1.txt", "file_name_2.csv")) -assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306") -assert(HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") == 3) -HdfsHelper.deleteFile("my/hdfs/file/path.csv") -HdfsHelper.moveFolder("my/hdfs/folder") +HdfsHelper.fileExists("my/hdfs/file/path.txt") // HdfsHelper.folderExists("my/hdfs/folder") +HdfsHelper.listFileNamesInFolder("my/folder/path") // List("file_name_1.txt", "file_name_2.csv") +HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") // "20170306" +HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") // 3 +HdfsHelper.deleteFile("my/hdfs/file/path.csv") // HdfsHelper.deleteFolder("my/hdfs/folder") +HdfsHelper.moveFolder("old/path", "new/path") // HdfsHelper.moveFile("old/path.txt", "new/path.txt") +HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token") // HdfsHelper.createFolder("my/hdfs/folder") + +// File content helpers: HdfsHelper.compressFile("hdfs/path/to/uncompressed_file.txt", classOf[GzipCodec]) HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1") // Some Xml/Typesafe helpers for hadoop as well: -HdfsHelper.isHdfsXmlCompliantWithXsd( - "my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd")) +HdfsHelper.isHdfsXmlCompliantWithXsd("my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd")) HdfsHelper.loadXmlFileFromHdfs("my/hdfs/file/path.xml") -// Very handy to load a config (typesafe format) stored on hdfs at the -// begining of a spark job: +// Very handy to load a config (typesafe format) stored on hdfs at the beginning of a spark job: HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config -// In order to write small amount of data in a file on hdfs without the -// whole spark stack: 
-HdfsHelper.writeToHdfsFile( - Array("some", "relatively small", "text"), - "/some/hdfs/file/path.txt") +// In order to write small amount of data in a file on hdfs without the whole spark stack: +HdfsHelper.writeToHdfsFile(Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt") +// or: +import com.spark_helper.HdfsHelper._ +Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt") +"hello world".writeToHdfs("/some/hdfs/file/path.txt") -// Deletes all files/folders in "hdfs/path/to/folder" for which the -// timestamp is older than 10 days: +// Deletes all files/folders in "hdfs/path/to/folder" for which the timestamp is older than 10 days: HdfsHelper.purgeFolder("hdfs/path/to/folder", 10)Source HdfsHelper -
2017-02
2017-02
Create a touch method
A logger dedicated to Spak jobs.
A logger dedicated to Spak jobs.
It's a simple logger/report which contains a report that one can update from +
A logger dedicated to Spark jobs.
A logger dedicated to Spark jobs.
It's a simple logger/report which contains a report that one can update from the driver and a success state. The idea is to persist job executions logs -and errors (and forget about grepping unreadable yarn logs).
It's designed for perdiodic spark jobs (handles storage and purge of logs) +and errors (and forget about grepping unreadable yarn logs).
It's designed for periodic spark jobs (handles storage and purge of logs) and provides a way to handle kpis validation.
Logs are stored on the go which means one can have a direct real time access -of the job logs/status and it's current state (which can overwise be a pain +of the job logs/status and it's current state (which can otherwise be a pain if it means going through yarn logs, or even for certain production environments going through additional layers of software logs to get to yarn logs).
One of the issues this logger aims at tackling is the handling of exceptions @@ -183,9 +201,9 @@
This is a "driver-only" logger and is not intended at logging concurrent -actions from executors.
Produced reports can easily be inserted in a notification email whenerver +actions from executors.
Produced reports can easily be inserted in a notification email whenever the job fails, which saves a lot of time to maintainers operating on heavy -production environements.
The produced persisted report is also a way for downstream jobs to know the +production environments.
The produced persisted report is also a way for downstream jobs to know the status of their input data.
Let's go through a simple Spark job example monitored with this Monitor facility:
Monitor.setTitle("My job title") Monitor.addDescription( @@ -204,7 +222,7 @@Test("Nbr of output records", processedData.count(), SUPERIOR_THAN, 10e6d, NBR), Test("Some pct of invalid output", your_complex_kpi, INFERIOR_THAN, 3, PCT) ), - "My pipeline descirption" + "My pipeline description" ) if (outputIsValid) @@ -212,9 +230,9 @@
} catch { case iie: InvalidInputException => - Monitor.error(iie, "My pipeline descirption", diagnostic = "No input data!") + Monitor.error(iie, "My pipeline description", diagnostic = "No input data!") case e: Throwable => - Monitor.error(e, "My pipeline descirption") // whatever unexpected error + Monitor.error(e, "My pipeline description") // whatever unexpected error } if (Monitor.isSuccess()) { @@ -226,7 +244,7 @@
// HDFS (this saves the logs in the folder set with Monitor.setLogFolder): Monitor.store() -// At the end of the job, if the job isn't successfull, you might want to +// At the end of the job, if the job isn't successful, you might want to // crash it (for instance to get a notification from your scheduler): if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ...
At any time during the job, logs can be accessed from file path/to/log/folder/current.ongoing
If we were to read the stored report after this simple pipeline, here are @@ -234,8 +252,8 @@
Another scenario, successfull spark pipeline and KPIs are valid; all good!:
My job title +[10:36] Duration: 00:13:47
Another scenario, successful spark pipeline and KPIs are valid; all good!:
My job title My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining -[10:23-10:41] My pipeline descirption: success +[10:23] Beginning +[10:23-10:41] My pipeline description: success KPI: Nbr of output records Value: 14669071.0 Must be superior than 10000000.0 @@ -268,7 +286,7 @@[10:41-10:42] My second pipeline description: success [10:42] Duration: 00:19:23
Source Monitor -
2017-02
2017-02
would a State monad be appropriate?
A facility to deal with RDD/file manipulations based on the Spark API.
A facility to deal with RDD/file manipulations based on the Spark API.
The goal is to remove the maximum of highly used low-level code from your spark job and replace it with methods fully tested whose name is -self-explanatory/readable.
A few exemples:
// Same as SparkContext.saveAsTextFile, but the result is a single file: -SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt") -// Same as SparkContext.textFile, but instead of reading one record per -// line, it reads records spread over several lines. -// This way, xml, json, yml or any multi-line record file format can be used -// with Spark: -SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n") -// Same as SparkContext.textFile, but instead of returning an RDD of -// records, it returns an RDD of tuples containing both the record and the -// path of the file it comes from: -SparkHelper.textFileWithFileName("folder", sparkContext)
Source import com.spark_helper.SparkHelper._
+
+// Same as rdd.saveAsTextFile("path"), but the result is a single file (while
+// keeping the processing distributed):
+rdd.saveAsSingleTextFile("/my/output/file/path.txt")
+rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec])
+
+// Same as sc.textFile("path"), but instead of reading one record per line (by
+// splitting the input with \n), it splits the file in records based on a custom
+// delimiter. This way, xml, json, yml or any multi-line record file format can
+// be used with Spark:
+sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance
+
+// Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options:
+rdd.flatten
+
+// Equivalent to sc.textFile(), but each line is tupled with its file path:
+sc.textFileWithFileName("/my/input/folder/path")
+// which produces:
+// RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"),
+// ("folder/file_2.txt", "record1fromfile2"), ...)
+
+// In the given folder, this generates one file per key in the given key/value
+// RDD. Within each file (named from the key) are all values for this key:
+rdd.saveAsTextFileByKey("/my/output/folder/path")
+
+// Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)):
+rdd.partialMap { case a if a % 2 == 0 => 2 * a }
+
+// For when input files contain commas and textFile can't handle it:
+sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) Source SparkHelper
-
2017-02
2017-02
sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T])
Configuration
+ * used by the underlying FileSystem
+ * in case it requires some specificities.
+ *
+ * If this setter is not used, the default Configuration is set with
+ * new Configuration()
.
+ *
+ * @param configuration the specific Configuration to use
+ */
+ def setConf(configuration: Configuration): Unit = {
+ conf = configuration
+ hdfs = FileSystem.get(configuration)
+ }
+
+ /** Sets a specific FileSystem
+ * in case it requires some specificities.
+ *
+ * If this setter is not used, the default FileSystem is set with
+ * FileSystem.get(new Configuration())
.
+ *
+ * @param fileSystem the specific FileSystem to use
+ */
+ def setFileSystem(fileSystem: FileSystem): Unit = hdfs = fileSystem
+
+ implicit class SeqExtensions[T <: Seq[String]: ClassTag](val seq: T) {
+
+ /** Saves list elements in a file on hdfs.
+ *
+ * Please only consider this way of storing data when the data set is small
+ * enough.
+ *
+ * Overwrites the file if it already exists.
+ *
+ * {{{
+ * Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
+ * List("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
+ * }}}
+ *
+ * @param filePath the path of the file in which to write the content of
+ * the List.
+ */
+ def writeToHdfs(filePath: String): Unit =
+ HdfsHelper.writeToHdfsFile(seq, filePath)
+ }
+
+ implicit class StringExtensions(val string: String) {
+
+ /** Saves the String in a file on hdfs.
+ *
+ * Overwrites the file if it already exists.
+ *
+ * {{{ "some\nrelatively small\ntext".writeToHdfs("/some/hdfs/file/path.txt") }}}
+ *
+ * @param filePath the path of the file in which to write the String
+ */
+ def writeToHdfs(filePath: String): Unit =
+ HdfsHelper.writeToHdfsFile(string, filePath)
+ }
+
/** Deletes a file on HDFS.
*
* Doesn't throw an exception if the file to delete doesn't exist.
@@ -85,17 +151,16 @@ object HdfsHelper extends Serializable {
*/
def deleteFile(hdfsPath: String): Unit = {
- val fileSystem = FileSystem.get(new Configuration())
-
val fileToDelete = new Path(hdfsPath)
- if (fileSystem.exists(fileToDelete)) {
+ if (hdfs.exists(fileToDelete)) {
require(
- fileSystem.isFile(fileToDelete),
- "to delete a folder, prefer using the deleteFolder() method.")
+ hdfs.isFile(fileToDelete),
+ "to delete a folder, prefer using the deleteFolder() method."
+ )
- fileSystem.delete(fileToDelete, true)
+ hdfs.delete(fileToDelete, true)
}
}
@@ -107,17 +172,16 @@ object HdfsHelper extends Serializable {
*/
def deleteFolder(hdfsPath: String): Unit = {
- val fileSystem = FileSystem.get(new Configuration())
-
val folderToDelete = new Path(hdfsPath)
- if (fileSystem.exists(folderToDelete)) {
+ if (hdfs.exists(folderToDelete)) {
require(
- !fileSystem.isFile(folderToDelete),
- "to delete a file, prefer using the deleteFile() method.")
+ !hdfs.isFile(folderToDelete),
+ "to delete a file, prefer using the deleteFile() method."
+ )
- fileSystem.delete(folderToDelete, true)
+ hdfs.delete(folderToDelete, true)
}
}
@@ -127,8 +191,7 @@ object HdfsHelper extends Serializable {
*
* @param hdfsPath the path of the folder to create
*/
- def createFolder(hdfsPath: String): Unit =
- FileSystem.get(new Configuration()).mkdirs(new Path(hdfsPath))
+ def createFolder(hdfsPath: String): Unit = hdfs.mkdirs(new Path(hdfsPath))
/** Checks if the file exists.
*
@@ -137,16 +200,15 @@ object HdfsHelper extends Serializable {
*/
def fileExists(hdfsPath: String): Boolean = {
- val fileSystem = FileSystem.get(new Configuration())
-
val fileToCheck = new Path(hdfsPath)
- if (fileSystem.exists(fileToCheck))
+ if (hdfs.exists(fileToCheck))
require(
- fileSystem.isFile(fileToCheck),
- "to check if a folder exists, prefer using the folderExists() method.")
+ hdfs.isFile(fileToCheck),
+ "to check if a folder exists, prefer using the folderExists() method."
+ )
- fileSystem.exists(fileToCheck)
+ hdfs.exists(fileToCheck)
}
/** Checks if the folder exists.
@@ -156,16 +218,15 @@ object HdfsHelper extends Serializable {
*/
def folderExists(hdfsPath: String): Boolean = {
- val fileSystem = FileSystem.get(new Configuration())
-
val folderToCheck = new Path(hdfsPath)
- if (fileSystem.exists(folderToCheck))
+ if (hdfs.exists(folderToCheck))
require(
- !fileSystem.isFile(folderToCheck),
- "to check if a file exists, prefer using the fileExists() method.")
+ !hdfs.isFile(folderToCheck),
+ "to check if a file exists, prefer using the fileExists() method."
+ )
- fileSystem.exists(folderToCheck)
+ hdfs.exists(folderToCheck)
}
/** Moves/renames a file.
@@ -184,30 +245,30 @@ object HdfsHelper extends Serializable {
overwrite: Boolean = false
): Unit = {
- val fileSystem = FileSystem.get(new Configuration())
-
val fileToRename = new Path(oldPath)
val renamedFile = new Path(newPath)
- if (fileSystem.exists(fileToRename))
+ if (hdfs.exists(fileToRename))
require(
- fileSystem.isFile(fileToRename),
- "to move a folder, prefer using the moveFolder() method.")
+ hdfs.isFile(fileToRename),
+ "to move a folder, prefer using the moveFolder() method."
+ )
if (overwrite)
- fileSystem.delete(renamedFile, true)
+ hdfs.delete(renamedFile, true)
else
require(
- !fileSystem.exists(renamedFile),
+ !hdfs.exists(renamedFile),
"overwrite option set to false, but a file already exists at target " +
- "location " + newPath)
+ "location " + newPath
+ )
// Before moving the file to its final destination, we check if the folder
// where to put the file exists, and if not we create it:
val targetContainerFolder = newPath.split("/").init.mkString("/")
createFolder(targetContainerFolder)
- fileSystem.rename(fileToRename, renamedFile)
+ hdfs.rename(fileToRename, renamedFile)
}
/** Moves/renames a folder.
@@ -226,36 +287,36 @@ object HdfsHelper extends Serializable {
overwrite: Boolean = false
): Unit = {
- val fileSystem = FileSystem.get(new Configuration())
-
val folderToRename = new Path(oldPath)
val renamedFolder = new Path(newPath)
- if (fileSystem.exists(folderToRename))
+ if (hdfs.exists(folderToRename))
require(
- !fileSystem.isFile(folderToRename),
- "to move a file, prefer using the moveFile() method.")
+ !hdfs.isFile(folderToRename),
+ "to move a file, prefer using the moveFile() method."
+ )
if (overwrite)
- fileSystem.delete(renamedFolder, true)
+ hdfs.delete(renamedFolder, true)
else
require(
- !fileSystem.exists(renamedFolder),
+ !hdfs.exists(renamedFolder),
"overwrite option set to false, but a folder already exists at target " +
- "location " + newPath)
+ "location " + newPath
+ )
// Before moving the folder to its final destination, we check if the folder
// where to put the folder exists, and if not we create it:
val targetContainerFolder = newPath.split("/").init.mkString("/")
createFolder(targetContainerFolder)
- fileSystem.rename(folderToRename, new Path(newPath))
+ hdfs.rename(folderToRename, new Path(newPath))
}
/** Creates an empty file on hdfs.
*
- * Might be usefull for token files. For instance a file which is only used
- * as a timestamp token of the last update of a processus, or a file which
+ * Might be useful for token files. For instance a file which is only used
+ * as a timestamp token of the last update of a process, or a file which
* blocks the execution of an other instance of the same job, ...
*
* Overwrites the file if it already exists.
@@ -272,7 +333,7 @@ object HdfsHelper extends Serializable {
* @param filePath the path of the empty file to create
*/
def createEmptyHdfsFile(filePath: String): Unit =
- FileSystem.get(new Configuration()).create(new Path(filePath)).close()
+ hdfs.create(new Path(filePath)).close()
/** Saves text in a file when content is too small to really require an RDD.
*
@@ -289,10 +350,7 @@ object HdfsHelper extends Serializable {
* @param filePath the path of the file in which to write the content
*/
def writeToHdfsFile(content: String, filePath: String): Unit = {
-
- val outputFile =
- FileSystem.get(new Configuration()).create(new Path(filePath))
-
+ val outputFile = hdfs.create(new Path(filePath))
outputFile.write(content.getBytes("UTF-8"))
outputFile.close()
}
@@ -311,7 +369,7 @@ object HdfsHelper extends Serializable {
* List("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
* }}}
*
- * @param content the array of strings to write in the file as one line per
+ * @param content the seq of strings to write in the file as one line per
* string (this takes care of joining strings with "\n"s).
* @param filePath the path of the file in which to write the content
*/
@@ -337,11 +395,9 @@ object HdfsHelper extends Serializable {
onlyName: Boolean = true
): List[String] = {
- FileSystem
- .get(new Configuration())
+ hdfs
.listStatus(new Path(hdfsPath))
- .flatMap(status => {
-
+ .flatMap { status =>
// If it's a file:
if (status.isFile) {
if (onlyName) List(status.getPath.getName)
@@ -351,12 +407,13 @@ object HdfsHelper extends Serializable {
else if (recursive)
listFileNamesInFolder(
hdfsPath + "/" + status.getPath.getName,
- true,
- onlyName)
+ recursive = true,
+ onlyName
+ )
// If it's a dir and we're not in a recursive option:
else
Nil
- })
+ }
.toList
.sorted
}
@@ -371,8 +428,7 @@ object HdfsHelper extends Serializable {
* @return the list of folder names in the specified folder
*/
def listFolderNamesInFolder(hdfsPath: String): List[String] =
- FileSystem
- .get(new Configuration())
+ hdfs
.listStatus(new Path(hdfsPath))
.filter(!_.isFile)
.map(_.getPath.getName)
@@ -386,13 +442,9 @@ object HdfsHelper extends Serializable {
* @return the joda DateTime of the last modification of the given file
*/
def fileModificationDateTime(hdfsPath: String): DateTime =
- new DateTime(
- FileSystem
- .get(new Configuration())
- .getFileStatus(new Path(hdfsPath))
- .getModificationTime())
+ new DateTime(hdfs.getFileStatus(new Path(hdfsPath)).getModificationTime)
- /** Returns the stringified date of the last modification of the given file.
+ /** Returns the formatted date of the last modification of the given file.
*
* {{{
* assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
@@ -402,7 +454,7 @@ object HdfsHelper extends Serializable {
* modification date.
* @param format (default = "yyyyMMdd") the format under which to get the
* modification date.
- * @return the stringified date of the last modification of the given file,
+ * @return the formatted date of the last modification of the given file,
* under the provided format.
*/
def fileModificationDate(
@@ -420,7 +472,7 @@ object HdfsHelper extends Serializable {
def folderModificationDateTime(hdfsPath: String): DateTime =
fileModificationDateTime(hdfsPath)
- /** Returns the stringified date of the last modification of the given folder.
+ /** Returns the formatted date of the last modification of the given folder.
*
* {{{
* assert(HdfsHelper.folderModificationDate("my/hdfs/folder") == "20170306")
@@ -430,7 +482,7 @@ object HdfsHelper extends Serializable {
* modification date.
* @param format (default = "yyyyMMdd") the format under which to get the
* modification date.
- * @return the stringified date of the last modification of the given folder,
+ * @return the formatted date of the last modification of the given folder,
* under the provided format.
*/
def folderModificationDate(
@@ -452,17 +504,17 @@ object HdfsHelper extends Serializable {
def nbrOfDaysSinceFileWasLastModified(hdfsPath: String): Int =
Days
.daysBetween(fileModificationDateTime(hdfsPath), new DateTime())
- .getDays()
+ .getDays
/** Appends a header and a footer to a file.
*
- * Usefull when creating an xml file with spark and you need to add top level
+ * Useful when creating an xml file with spark and you need to add top level
* tags.
*
* If the workingFolderPath parameter is provided, then the processing is
* done in a working/tmp folder and then only, the final file is moved to its
* final real location. This way, in case of cluster instability, i.e. in
- * case the Spark job is interupted, this avoids having a temporary or
+ * case the Spark job is interrupted, this avoids having a temporary or
* corrupted file in output.
*
* @param filePath the path of the file for which to add the header and the
@@ -485,13 +537,13 @@ object HdfsHelper extends Serializable {
/** Appends a header to a file.
*
- * Usefull when creating a csv file with spark and you need to add a header
+ * Useful when creating a csv file with spark and you need to add a header
* describing the different fields.
*
* If the workingFolderPath parameter is provided, then the processing is
* done in a working/tmp folder and then only, the final file is moved to its
* final real location. This way, in case of cluster instability, i.e. in
- * case the Spark job is interupted, this avoids having a temporary or
+ * case the Spark job is interrupted, this avoids having a temporary or
* corrupted file in output.
*
* @param filePath the path of the file for which to add the header
@@ -514,7 +566,7 @@ object HdfsHelper extends Serializable {
* If the workingFolderPath parameter is provided, then the processing is
* done in a working/tmp folder and then only, the final file is moved to its
* final real location. This way, in case of cluster instability, i.e. in
- * case the Spark job is interupted, this avoids having a temporary or
+ * case the Spark job is interrupted, this avoids having a temporary or
* corrupted file in output.
*
* @param filePath the path of the file for which to add the footer
@@ -546,7 +598,7 @@ object HdfsHelper extends Serializable {
validateHdfsXmlWithXsd(hdfsXmlPath, xsdFile)
true
} catch {
- case saxe: SAXException => false
+ case _: SAXException => false
}
/** Validates an XML file on hdfs in regard to the given XSD.
@@ -562,9 +614,7 @@ object HdfsHelper extends Serializable {
*/
def validateHdfsXmlWithXsd(hdfsXmlPath: String, xsdFile: URL): Unit = {
- val fileSystem = FileSystem.get(new Configuration())
-
- val xmlFile = new StreamSource(fileSystem.open(new Path(hdfsXmlPath)))
+ val xmlFile = new StreamSource(hdfs.open(new Path(hdfsXmlPath)))
val schemaFactory =
SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
@@ -574,7 +624,7 @@ object HdfsHelper extends Serializable {
validator.validate(xmlFile)
}
- /** Loads a typesafe config from Hdfs.
+ /** Loads a Typesafe config from Hdfs.
*
* The best way to load the configuration of your job from hdfs.
*
@@ -602,15 +652,12 @@ object HdfsHelper extends Serializable {
* }
* }}}
*
- * @param hdfsConfigPath the absolute path of the typesafe config file on
- * hdfs we want to load as a typesafe Config object.
+ * @param hdfsConfigPath the absolute path of the Typesafe config file on
+ * hdfs we want to load as a Typesafe Config object.
* @return the com.typesafe.config.Config object which contains usable data
*/
def loadTypesafeConfigFromHdfs(hdfsConfigPath: String): Config = {
-
- val reader = new InputStreamReader(
- FileSystem.get(new Configuration()).open(new Path(hdfsConfigPath)))
-
+ val reader = new InputStreamReader(hdfs.open(new Path(hdfsConfigPath)))
try { ConfigFactory.parseReader(reader) } finally { reader.close() }
}
@@ -623,10 +670,7 @@ object HdfsHelper extends Serializable {
* @return the scala.xml.Elem object
*/
def loadXmlFileFromHdfs(hdfsXmlPath: String): Elem = {
-
- val reader = new InputStreamReader(
- FileSystem.get(new Configuration()).open(new Path(hdfsXmlPath)))
-
+ val reader = new InputStreamReader(hdfs.open(new Path(hdfsXmlPath)))
try { XML.load(reader) } finally { reader.close() }
}
@@ -655,22 +699,19 @@ object HdfsHelper extends Serializable {
deleteInputFile: Boolean = true
): Unit = {
- val fileSystem = FileSystem.get(new Configuration())
-
val ClassOfGzip = classOf[GzipCodec]
val ClassOfBZip2 = classOf[BZip2Codec]
val outputPath = compressionCodec match {
- case ClassOfGzip => inputPath + ".gz"
- case ClassOfBZip2 => inputPath + ".bz2"
+ case ClassOfGzip => s"$inputPath.gz"
+ case ClassOfBZip2 => s"$inputPath.bz2"
}
- val inputStream = fileSystem.open(new Path(inputPath))
- val outputStream = fileSystem.create(new Path(outputPath))
+ val inputStream = hdfs.open(new Path(inputPath))
+ val outputStream = hdfs.create(new Path(outputPath))
// The compression code:
- val codec = new CompressionCodecFactory(new Configuration())
- .getCodec(new Path(outputPath))
+ val codec = new CompressionCodecFactory(conf).getCodec(new Path(outputPath))
// We include the compression codec to the output stream:
val compressedOutputStream = codec.createOutputStream(outputStream)
@@ -678,7 +719,7 @@ object HdfsHelper extends Serializable {
IOUtils.copyBytes(
inputStream,
compressedOutputStream,
- new Configuration(),
+ conf,
false
)
} finally {
@@ -707,16 +748,16 @@ object HdfsHelper extends Serializable {
require(
purgeAge >= 0,
- "the purgeAge provided \"" + purgeAge.toString + "\" must be superior to 0.")
+ "the purgeAge provided \"" + purgeAge.toString + "\" must be superior to 0."
+ )
- FileSystem
- .get(new Configuration())
+ hdfs
.listStatus(new Path(folderPath))
.filter(path => {
val fileAgeInDays = Days
- .daysBetween(new DateTime(path.getModificationTime()), new DateTime())
- .getDays()
+ .daysBetween(new DateTime(path.getModificationTime), new DateTime())
+ .getDays
fileAgeInDays >= purgeAge
@@ -744,22 +785,20 @@ object HdfsHelper extends Serializable {
workingFolderPath: String
): Unit = {
- val fileSystem = FileSystem.get(new Configuration())
-
val tmpOutputPath = workingFolderPath match {
- case "" => filePath + ".tmp"
- case _ => workingFolderPath + "/xml.tmp"
+ case "" => s"$filePath.tmp"
+ case _ => s"$workingFolderPath/xml.tmp"
}
deleteFile(tmpOutputPath)
- val inputFile = fileSystem.open(new Path(filePath))
- val tmpOutputFile = fileSystem.create(new Path(tmpOutputPath))
+ val inputFile = hdfs.open(new Path(filePath))
+ val tmpOutputFile = hdfs.create(new Path(tmpOutputPath))
// If there is an header, we add it to the file:
header.foreach(h => tmpOutputFile.write((h + "\n").getBytes("UTF-8")))
try {
- IOUtils.copyBytes(inputFile, tmpOutputFile, new Configuration(), false)
+ IOUtils.copyBytes(inputFile, tmpOutputFile, conf, false)
} finally {
inputFile.close()
}
diff --git a/src/main/scala/com/spark_helper/Monitor.scala b/src/main/scala/com/spark_helper/Monitor.scala
index 3866cbb..060787c 100644
--- a/src/main/scala/com/spark_helper/Monitor.scala
+++ b/src/main/scala/com/spark_helper/Monitor.scala
@@ -6,19 +6,17 @@ import java.util.Calendar
import org.apache.commons.lang3.time.DurationFormatUtils
-import java.lang.Throwable
-
-/** A logger dedicated to Spak jobs.
+/** A logger dedicated to Spark jobs.
*
* It's a simple logger/report which contains a report that one can update from
* the driver and a success state. The idea is to persist job executions logs
* and errors (and forget about grepping unreadable yarn logs).
*
- * It's designed for perdiodic spark jobs (handles storage and purge of logs)
+ * It's designed for periodic spark jobs (handles storage and purge of logs)
* and provides a way to handle kpis validation.
*
* Logs are stored on the go which means one can have a direct real time access
- * of the job logs/status and it's current state (which can overwise be a pain
+ * of the job logs/status and its current state (which can otherwise be a pain
* if it means going through yarn logs, or even for certain production
* environments going through additional layers of software logs to get to yarn
* logs).
@@ -33,9 +31,9 @@ import java.lang.Throwable
* This is a "driver-only" logger and is not intended at logging concurrent
* actions from executors.
*
- * Produced reports can easily be inserted in a notification email whenerver
+ * Produced reports can easily be inserted in a notification email whenever
* the job fails, which saves a lot of time to maintainers operating on heavy
- * production environements.
+ * production environments.
*
* The produced persisted report is also a way for downstream jobs to know the
* status of their input data.
@@ -61,7 +59,7 @@ import java.lang.Throwable
* Test("Nbr of output records", processedData.count(), SUPERIOR_THAN, 10e6d, NBR),
* Test("Some pct of invalid output", your_complex_kpi, INFERIOR_THAN, 3, PCT)
* ),
- * "My pipeline descirption"
+ * "My pipeline description"
* )
*
* if (outputIsValid)
@@ -69,9 +67,9 @@ import java.lang.Throwable
*
* } catch {
* case iie: InvalidInputException =>
- * Monitor.error(iie, "My pipeline descirption", diagnostic = "No input data!")
+ * Monitor.error(iie, "My pipeline description", diagnostic = "No input data!")
* case e: Throwable =>
- * Monitor.error(e, "My pipeline descirption") // whatever unexpected error
+ * Monitor.error(e, "My pipeline description") // whatever unexpected error
* }
*
* if (Monitor.isSuccess()) {
@@ -83,7 +81,7 @@ import java.lang.Throwable
* // HDFS (this saves the logs in the folder set with Monitor.setLogFolder):
* Monitor.store()
*
- * // At the end of the job, if the job isn't successfull, you might want to
+ * // At the end of the job, if the job isn't successful, you might want to
* // crash it (for instance to get a notification from your scheduler):
* if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ...
* }}}
@@ -100,8 +98,8 @@ import java.lang.Throwable
*
* My job description (whatever you want); for instance:
* Documentation: https://github.com/xavierguihot/spark_helper
- * [10:23] Begining
- * [10:23-10:23] My pipeline descirption: failed
+ * [10:23] Beginning
+ * [10:23-10:23] My pipeline description: failed
* Diagnostic: No input data!
* org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path
* at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285)
@@ -116,8 +114,8 @@ import java.lang.Throwable
*
* My job description (whatever you want); for instance:
* Documentation: https://github.com/xavierguihot/spark_helper
- * [10:23] Begining
- * [10:23-10:36] My pipeline descirption: failed
+ * [10:23] Beginning
+ * [10:23-10:36] My pipeline description: failed
* java.lang.NumberFormatException: For input string: "a"
* java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
* java.lang.Integer.parseInt(Integer.java:492)
@@ -125,14 +123,14 @@ import java.lang.Throwable
* [10:36] Duration: 00:13:47
* }}}
*
- * Another scenario, successfull spark pipeline and KPIs are valid; all good!:
+ * Another scenario, successful spark pipeline and KPIs are valid; all good!:
* {{{
* My job title
*
* My job description (whatever you want); for instance:
* Documentation: https://github.com/xavierguihot/spark_helper
- * [10:23] Begining
- * [10:23-10:41] My pipeline descirption: success
+ * [10:23] Beginning
+ * [10:23-10:41] My pipeline description: success
* KPI: Nbr of output records
* Value: 14669071.0
* Must be superior than 10000000.0
@@ -148,6 +146,7 @@ import java.lang.Throwable
* Source Monitor
*
+ * @todo would a State monad be appropriate?
* @author Xavier Guihot
* @since 2017-02
*/
@@ -159,13 +158,13 @@ object Monitor {
private var logDirectory: Option[String] = None
private var purgeWindow: Option[Int] = None
- private val jobStart = DateHelper.now("[HH:mm]") + " Begining"
+ private val jobStart = DateHelper.now("[HH:mm]") + " Beginning"
// Join of reportTitle, pointsOfContact, reportDescription, logDirectory and
// jobStart:
private var reportHeader = buildReportHeader()
- private val begining = Calendar.getInstance().getTimeInMillis()
+ private val beginning = Calendar.getInstance().getTimeInMillis
private var lastReportUpdate = DateHelper.now("HH:mm")
/** Sets the report's title.
@@ -175,7 +174,7 @@ object Monitor {
* {{{
* // Using:
* Monitor.setReportTitle("My Simple Job")
- * // Produces this at the begining of the report:
+ * // Produces this at the beginning of the report:
* " My Simple Job"
* ""
* }}}
@@ -196,7 +195,7 @@ object Monitor {
* // Using:
* Monitor.setReportTitle("My Simple Job")
* Monitor.addContacts(List("x.guihot@gmail.com", "smbdy@gmail.com"))
- * // Produces this at the begining of the report:
+ * // Produces this at the beginning of the report:
* " My Simple Job"
* ""
* "Point of contact: x.guihot@gmail.com, smbdy@gmail.com"
@@ -218,7 +217,7 @@ object Monitor {
* // Using:
* Monitor.setReportTitle("My Simple Job")
* Monitor.addDescription("Documentation: https://github.com/xavierguihot/spark_helper")
- * // Produces this at the begining of the report:
+ * // Produces this at the beginning of the report:
* " My Simple Job"
* ""
* "Documentation: https://github.com/xavierguihot/spark_helper"
@@ -269,7 +268,7 @@ object Monitor {
*
* @return if your spark job is successful.
*/
- def isSuccess(): Boolean = successful
+ def isSuccess: Boolean = successful
/** Returns the current state of the monitoring report.
*
@@ -286,7 +285,7 @@ object Monitor {
*
* @param text the text to append to the report
*/
- def log(text: String): Unit = log(text, true)
+ def log(text: String): Unit = log(text, withTimestamp = true)
/** Updates the report with some text and a success.
*
@@ -316,7 +315,7 @@ object Monitor {
* will result in this to be appended to the report:
* {{{ "[10:35-10:37] Some text: failure\n" }}}
*
- * Once the monitoring is a failure, then whatever following successfull
+ * Once the monitoring is a failure, then whatever following successful
* action won't change the failed status of the monitoring.
*
* @param taskDescription the text to append to the report
@@ -342,12 +341,12 @@ object Monitor {
* {{{
* monitor.error(
* invalidInputException,
- * "My pipeline descirption",
+ * "My pipeline description",
* diagnostic = "No input data!")
* }}}
* will result in this to be appended to the report:
* {{{
- * [10:23-10:24] My pipeline descirption: failed
+ * [10:23-10:24] My pipeline description: failed
* Diagnostic: No input data!
* org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path
* at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285)
@@ -370,7 +369,7 @@ object Monitor {
successful = false
val serializedException =
- "\t\t" + exception.toString() + "\n" +
+ "\t\t" + exception.toString + "\n" +
exception.getStackTrace.map(line => s"\t\t$line").mkString("\n")
val update = List(
@@ -433,14 +432,13 @@ object Monitor {
if (!testsAreValid)
successful = false
- val seriralizedTests = tests.mkString("\n")
+ val serializedTests = tests.mkString("\n")
val update = testSuitName match {
- case "" => seriralizedTests
- case _ => {
+ case "" => serializedTests
+ case _ =>
val status = if (testsAreValid) "success" else "failed"
- s"$testSuitName: $status\n$seriralizedTests"
- }
+ s"$testSuitName: $status\n$serializedTests"
}
log(update)
@@ -506,11 +504,10 @@ object Monitor {
logDirectory match {
- case Some(logFolder) => {
-
+ case Some(logFolder) =>
// We add the job duration to the report:
val jobDuration = DurationFormatUtils.formatDuration(
- Calendar.getInstance().getTimeInMillis() - begining,
+ Calendar.getInstance().getTimeInMillis - beginning,
"HH:mm:ss")
var now = DateHelper.now("[HH:mm]")
@@ -532,13 +529,13 @@ object Monitor {
.writeToHdfsFile(finalReport, s"$logFolder/current.$reportExtension")
purgeWindow.foreach(window => purgeOutdatedLogs(logFolder, window))
- }
case None =>
require(
logDirectory.nonEmpty,
"to save the report, please specify the log folder using " +
- "Monitor.setLogFolder(\"hdfs/path/to/log/folder\")")
+ "Monitor.setLogFolder(\"hdfs/path/to/log/folder\")"
+ )
}
}
@@ -583,20 +580,17 @@ object Monitor {
/** Updates the current stored version of logs in file
* logFolder/current.ongoing */
private def storeCurrent(): Unit =
- logDirectory.foreach {
- case logFolder => {
-
- val warning =
- "WARNING: If this file exists it does not necessarily mean that " +
- "your job is still running. This file might persist if your job " +
- "has been killed and thus couldn't reach your call to the " +
- "Monitor.store()."
+ logDirectory.foreach { logFolder =>
+ val warning =
+ "WARNING: If this file exists it does not necessarily mean that " +
+ "your job is still running. This file might persist if your job " +
+ "has been killed and thus couldn't reach your call to the " +
+ "Monitor.store()."
- val ongoingReport =
- s"$reportHeader\n$report\n$warning"
+ val ongoingReport =
+ s"$reportHeader\n$report\n$warning"
- HdfsHelper.writeToHdfsFile(ongoingReport, s"$logFolder/current.ongoing")
- }
+ HdfsHelper.writeToHdfsFile(ongoingReport, s"$logFolder/current.ongoing")
}
private def purgeOutdatedLogs(logFolder: String, window: Int): Unit = {
diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala
index 3f612eb..9d05d96 100644
--- a/src/main/scala/com/spark_helper/SparkHelper.scala
+++ b/src/main/scala/com/spark_helper/SparkHelper.scala
@@ -1,5 +1,6 @@
package com.spark_helper
+import org.apache.spark.TextFileOverwrite
import org.apache.spark.{HashPartitioner, SparkContext}
import org.apache.spark.rdd.{RDD, HadoopRDD}
import org.apache.hadoop.conf.Configuration
@@ -10,6 +11,8 @@ import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.hadoop.mapred.{FileSplit, TextInputFormat => TextInputFormat2}
+import scala.reflect.ClassTag
+
import scala.util.Random
/** A facility to deal with RDD/file manipulations based on the Spark API.
@@ -18,611 +21,768 @@ import scala.util.Random
* spark job and replace it with methods fully tested whose name is
* self-explanatory/readable.
*
- * A few exemples:
+ * A few examples:
*
* {{{
- * // Same as SparkContext.saveAsTextFile, but the result is a single file:
- * SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt")
- * // Same as SparkContext.textFile, but instead of reading one record per
- * // line, it reads records spread over several lines.
- * // This way, xml, json, yml or any multi-line record file format can be used
- * // with Spark:
- * SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n")
- * // Same as SparkContext.textFile, but instead of returning an RDD of
- * // records, it returns an RDD of tuples containing both the record and the
- * // path of the file it comes from:
- * SparkHelper.textFileWithFileName("folder", sparkContext)
+ * import com.spark_helper.SparkHelper._
+ *
+ * // Same as rdd.saveAsTextFile("path"), but the result is a single file (while
+ * // keeping the processing distributed):
+ * rdd.saveAsSingleTextFile("/my/output/file/path.txt")
+ * rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec])
+ *
+ * // Same as sc.textFile("path"), but instead of reading one record per line (by
+ * // splitting the input with \n), it splits the file in records based on a custom
+ * // delimiter. This way, xml, json, yml or any multi-line record file format can
+ * // be used with Spark:
+ * sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance
+ *
+ * // Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options:
+ * rdd.flatten
+ *
+ * // Equivalent to sc.textFile(), but each line is tupled with its file path:
+ * sc.textFileWithFileName("/my/input/folder/path")
+ * // which produces:
+ * // RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"),
+ * // ("folder/file_2.txt", "record1fromfile2"), ...)
+ *
+ * // In the given folder, this generates one file per key in the given key/value
+ * // RDD. Within each file (named from the key) are all values for this key:
+ * rdd.saveAsTextFileByKey("/my/output/folder/path")
+ *
+ * // Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)):
+ * rdd.partialMap { case a if a % 2 == 0 => 2 * a }
+ *
+ * // For when input files contain commas and textFile can't handle it:
+ * sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))
* }}}
*
* Source SparkHelper
*
+ * @todo sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T])
* @author Xavier Guihot
* @since 2017-02
*/
object SparkHelper extends Serializable {
- /** Saves an RDD in exactly one file.
- *
- * Allows one to save an RDD in one file, while keeping the processing
- * parallelized.
- *
- * {{{ SparkHelper.saveAsSingleTextFile(myRddToStore, "/my/file/path.txt") }}}
- *
- * @param outputRDD the RDD of strings to store in one file
- * @param outputFile the path of the produced file
- */
- def saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String): Unit =
- saveAsSingleTextFileInternal(outputRDD, outputFile, None)
-
- /** Saves an RDD in exactly one file.
- *
- * Allows one to save an RDD in one file, while keeping the processing
- * parallelized.
- *
- * {{{
- * SparkHelper.saveAsSingleTextFile(
- * myRddToStore, "/my/file/path.txt", classOf[BZip2Codec])
- * }}}
- *
- * @param outputRDD the RDD of strings to store in one file
- * @param outputFile the path of the produced file
- * @param compressionCodec the type of compression to use (for instance
- * classOf[BZip2Codec] or classOf[GzipCodec]))
- */
- def saveAsSingleTextFile(
- outputRDD: RDD[String],
- outputFile: String,
- compressionCodec: Class[_ <: CompressionCodec]
- ): Unit =
- saveAsSingleTextFileInternal(outputRDD, outputFile, Some(compressionCodec))
-
- /** Saves an RDD in exactly one file.
- *
- * Allows one to save an RDD in one file, while keeping the processing
- * parallelized.
- *
- * This variant of saveAsSingleTextFile performs the storage in a temporary
- * folder instead of directly in the final output folder. This way the
- * risks of having corrupted files in the real output folder due to cluster
- * interruptions is minimized.
- *
- * {{{
- * SparkHelper.saveAsSingleTextFile(
- * myRddToStore, "/my/file/path.txt", "/my/working/folder/path")
- * }}}
- *
- * @param outputRDD the RDD of strings to store in one file
- * @param outputFile the path of the produced file
- * @param workingFolder the path where file manipulations will temporarily
- * happen.
- */
- def saveAsSingleTextFile(
- outputRDD: RDD[String],
- outputFile: String,
- workingFolder: String
- ): Unit =
- saveAsSingleTextFileWithWorkingFolderInternal(
- outputRDD,
- outputFile,
- workingFolder,
- None)
-
- /** Saves an RDD in exactly one file.
- *
- * Allows one to save an RDD in one file, while keeping the processing
- * parallelized.
- *
- * This variant of saveAsSingleTextFile performs the storage in a temporary
- * folder instead of directly in the final output folder. This way the risks
- * of having corrupted files in the real output folder due to cluster
- * interruptions is minimized.
- *
- * {{{
- * SparkHelper.saveAsSingleTextFile(
- * myRddToStore,
- * "/my/file/path.txt",
- * "/my/working/folder/path",
- * classOf[BZip2Codec])
- * }}}
- *
- * @param outputRDD the RDD of strings to store in one file
- * @param outputFile the path of the produced file
- * @param workingFolder the path where file manipulations will temporarily
- * happen.
- * @param compressionCodec the type of compression to use (for instance
- * classOf[BZip2Codec] or classOf[GzipCodec]))
- */
- def saveAsSingleTextFile(
- outputRDD: RDD[String],
- outputFile: String,
- workingFolder: String,
- compressionCodec: Class[_ <: CompressionCodec]
- ): Unit =
- saveAsSingleTextFileWithWorkingFolderInternal(
- outputRDD,
- outputFile,
- workingFolder,
- Some(compressionCodec))
-
- /** Equivalent to sparkContext.textFile(), but for a specific record delimiter.
- *
- * By default, sparkContext.textFile() will provide one record per line. But
- * what if the format you want to read considers that one record (one entity)
- * is stored in more than one line (yml, xml, ...)?
- *
- * For instance in order to read a yml file, which is a format for which a
- * record (a single entity) is spread other several lines, you can modify the
- * record delimiter with "---\n" instead of "\n". Same goes when reading an
- * xml file where a record might be spread over several lines or worse the
- * whole xml file is one line.
- *
- * {{{
- * // Let's say data we want to use with Spark looks like this (one record is
- * // a customer, but it's spread over several lines):
- * case x => x
.
+ *
+ * Similar idea to .collect,
+ * but instead of skipping non-matching items, it keeps them as-is.
+ *
+ * {{{
+ * sc.parallelize(Array(1, 3, 2, 7, 8)).partialMap { case a if a % 2 == 0 => 2 * a }
+ * // is equivalent to:
+ * sc.parallelize(Array(1, 3, 2, 7, 8)).map {
+ * case a if a % 2 == 0 => 2 * a
+ * case a => a
+ * }
+ * // in order to map to:
+ * sc.parallelize(Array(1, 3, 4, 7, 16))
+ * }}}
+ *
+ * @param pf the partial function to apply
+ * @return an rdd of the same type, for which each element is either the
+ * application of the partial function where defined or the identity.
+ */
+ def partialMap(pf: PartialFunction[T, T]): RDD[T] =
+ rdd.map {
+ case x if pf.isDefinedAt(x) => pf(x)
+ case x => x
+ }
}
- /** Saves and repartitions a key/value RDD on files whose name is the key.
- *
- * Within the provided outputFolder, will be one file per key in your
- * keyValueRDD. And within a file for a given key are only values for this
- * key.
- *
- * You need to know the nbr of keys beforehand (in general you use this to
- * split your dataset in subsets, or to output one file per client, so you
- * know how many keys you have). So you need to put as keyNbr the exact nbr
- * of keys you'll have.
- *
- * This is not scalable. This shouldn't be considered for any data flow with
- * normal or big volumes.
- *
- * {{{
- * SparkHelper.saveAsTextFileByKey(
- * myKeyValueRddToStore, "/my/output/folder/path", 12)
- * }}}
- *
- * @param keyValueRDD the key/value RDD
- * @param outputFolder the foldder where will be storrred key files
- * @param keyNbr the nbr of expected keys (which is the nbr of outputed files)
- */
- def saveAsTextFileByKey(
- keyValueRDD: RDD[(String, String)],
- outputFolder: String,
- keyNbr: Int
- ): Unit = {
-
- HdfsHelper.deleteFolder(outputFolder)
-
- keyValueRDD
- .partitionBy(new HashPartitioner(keyNbr))
- .saveAsHadoopFile(
- outputFolder,
- classOf[String],
- classOf[String],
- classOf[KeyBasedOutput]
+ implicit class StringRDDExtensions(val rdd: RDD[String]) extends AnyVal {
+
+ /** Saves an RDD in exactly one file.
+ *
+ * Allows one to save an RDD in one file, while keeping the processing
+ * distributed.
+ *
+ * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt") }}}
+ *
+ * @param path the path of the produced file
+ */
+ def saveAsSingleTextFile(path: String): Unit =
+ SparkHelper.saveAsSingleTextFileInternal(rdd, path, None)
+
+ /** Saves an RDD in exactly one file.
+ *
+ * Allows one to save an RDD in one file, while keeping the processing
+ * distributed.
+ *
+ * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", classOf[BZip2Codec]) }}}
+ *
+ * @param path the path of the produced file
+ * @param codec the type of compression to use (for instance
+ * classOf[BZip2Codec] or classOf[GzipCodec])
+ */
+ def saveAsSingleTextFile(
+ path: String,
+ codec: Class[_ <: CompressionCodec]
+ ): Unit =
+ SparkHelper.saveAsSingleTextFileInternal(rdd, path, Some(codec))
+
+ /** Saves an RDD in exactly one file.
+ *
+ * Allows one to save an RDD in one file, while keeping the processing
+ * distributed.
+ *
+ * This variant of saveAsSingleTextFile
+ * performs the storage in a temporary folder instead of directly in the
+ * final output folder. This way the risk of having corrupted files in the
+ * real output folder due to cluster interruptions is minimized.
+ *
+ * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path") }}}
+ *
+ * @param path the path of the produced file
+ * @param workingFolder the path where file manipulations will temporarily
+ * happen.
+ */
+ def saveAsSingleTextFile(path: String, workingFolder: String): Unit =
+ SparkHelper.saveAsSingleTextFileWithWorkingFolderInternal(
+ rdd,
+ path,
+ workingFolder,
+ None
)
- }
- /** Saves and repartitions a key/value RDD on files whose name is the key.
- *
- * Within the provided outputFolder, will be one file per key in your
- * keyValueRDD. And within a file for a given key are only values for this
- * key.
- *
- * You need to know the nbr of keys beforehand (in general you use this to
- * split your dataset in subsets, or to output one file per client, so you
- * know how many keys you have). So you need to put as keyNbr the exact nbr
- * of keys you'll have.
- *
- * This is not scalable. This shouldn't be considered for any data flow with
- * normal or big volumes.
- *
- * {{{
- * SparkHelper.saveAsTextFileByKey(
- * myKeyValueRddToStore, "/my/output/folder/path", 12, classOf[BZip2Codec])
- * }}}
- *
- * @param keyValueRDD the key/value RDD
- * @param outputFolder the foldder where will be storrred key files
- * @param keyNbr the nbr of expected keys (which is the nbr of outputed files)
- * @param compressionCodec the type of compression to use (for instance
- * classOf[BZip2Codec] or classOf[GzipCodec]))
- */
- def saveAsTextFileByKey(
- keyValueRDD: RDD[(String, String)],
- outputFolder: String,
- keyNbr: Int,
- compressionCodec: Class[_ <: CompressionCodec]
- ): Unit = {
+ /** Saves an RDD in exactly one file.
+ *
+ * Allows one to save an RDD in one file, while keeping the processing
+ * distributed.
+ *
+ * This variant of saveAsSingleTextFile
+ * performs the storage in a temporary folder instead of directly in the
+ * final output folder. This way the risk of having corrupted files in the
+ * real output folder due to cluster interruptions is minimized.
+ *
+ * {{{
+ * rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path", classOf[BZip2Codec])
+ * }}}
+ *
+ * @param path the path of the produced file
+ * @param workingFolder the path where file manipulations will temporarily
+ * happen.
+ * @param codec the type of compression to use (for instance
+ * classOf[BZip2Codec] or classOf[GzipCodec])
+ */
+ def saveAsSingleTextFile(
+ path: String,
+ workingFolder: String,
+ codec: Class[_ <: CompressionCodec]
+ ): Unit =
+ SparkHelper.saveAsSingleTextFileWithWorkingFolderInternal(
+ rdd,
+ path,
+ workingFolder,
+ Some(codec)
+ )
- HdfsHelper.deleteFolder(outputFolder)
+ /** Saves as text file, but by decreasing the nbr of partitions of the output.
+ *
+ * Same as rdd.saveAsTextFile()
+ * , but decreases the nbr of partitions in the output folder before doing
+ * so.
+ *
+ * The result is equivalent to rdd.coalesce(x).saveAsTextFile()
+ * , but if x
+ * is very low, coalesce
+ * would make the processing time explode, whereas this method keeps the
+ * processing distributed, saves as text file and only then merges the
+ * result in a lower nbr of partitions.
+ *
+ * {{{ rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30) }}}
+ *
+ * @param path the folder where will finally be stored the RDD but spread
+ * on only 30 files (where 30 is the value of the finalCoalesceLevel
+ * parameter).
+ * @param finalCoalesceLevel the nbr of files within the folder at the end
+ * of this method.
+ */
+ def saveAsTextFileAndCoalesce(
+ path: String,
+ finalCoalesceLevel: Int
+ ): Unit = {
+
+ // We remove folders where to store data in case they already exist:
+ HdfsHelper.deleteFolder(s"${path}_tmp")
+ HdfsHelper.deleteFolder(path)
+
+ // We first save the rdd with the level of coalescence used during the
+ // processing. This way the processing is done with the right level of
+ // tasks:
+ rdd.saveAsTextFile(s"${path}_tmp")
+
+ // Then we read back this tmp folder, apply the coalesce and store it back:
+ SparkHelper.decreaseCoalescenceInternal(
+ s"${path}_tmp",
+ path,
+ finalCoalesceLevel,
+ rdd.context,
+ None
+ )
+ }
- keyValueRDD
- .partitionBy(new HashPartitioner(keyNbr))
- .saveAsHadoopFile(
- outputFolder,
- classOf[String],
- classOf[String],
- classOf[KeyBasedOutput],
- compressionCodec
+ /** Saves as text file, and decreases the nbr of output partitions.
+ *
+ * Same as rdd.saveAsTextFile()
+ * , but decreases the nbr of partitions in the output folder before doing
+ * so.
+ *
+ * The result is equivalent to rdd.coalesce(x).saveAsTextFile()
+ * , but if x
+ * is very low, coalesce
+ * would make the processing time explode, whereas this method keeps the
+ * processing distributed, saves as text file and only then merges the
+ * result in a lower nbr of partitions.
+ *
+ * {{{ rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30, classOf[BZip2Codec]) }}}
+ *
+ * @param path the folder where will finally be stored the RDD but spread
+ * on only 30 files (where 30 is the value of the finalCoalesceLevel
+ * parameter).
+ * @param finalCoalesceLevel the nbr of files within the folder at the end
+ * of this method.
+ * @param codec the type of compression to use (for instance
+ * classOf[BZip2Codec] or classOf[GzipCodec])
+ */
+ def saveAsTextFileAndCoalesce(
+ path: String,
+ finalCoalesceLevel: Int,
+ codec: Class[_ <: CompressionCodec]
+ ): Unit = {
+
+ // We remove folders where to store data in case they already exist:
+ HdfsHelper.deleteFolder(s"${path}_tmp")
+ HdfsHelper.deleteFolder(path)
+
+ // We first save the rdd with the level of coalescence used during the
+ // processing. This way the processing is done with the right level of
+ // tasks:
+ rdd.saveAsTextFile(s"${path}_tmp")
+
+ // Then we read back this tmp folder, apply the coalesce and store it back:
+ decreaseCoalescenceInternal(
+ s"${path}_tmp",
+ path,
+ finalCoalesceLevel,
+ rdd.context,
+ Some(codec)
)
+ }
}
- /** Decreases the nbr of partitions of a folder.
- *
- * This is often handy when the last step of your job needs to run on
- * thousands of files, but you want to store your final output on let's say
- * only 300 files.
- *
- * It's like a FileUtil.copyMerge, but the merging produces more than one
- * file.
- *
- * Be aware that this methods deletes the provided input folder.
- *
- * {{{
- * SparkHelper.decreaseCoalescence(
- * "/folder/path/with/2000/files",
- * "/produced/folder/path/with/only/300/files",
- * 300,
- * sparkContext)
- * }}}
- *
- * @param highCoalescenceLevelFolder the folder which contains 10000 files
- * @param lowerCoalescenceLevelFolder the folder which will contain the same
- * data as highCoalescenceLevelFolder but spread on only 300 files (where 300
- * is the finalCoalescenceLevel parameter).
- * @param finalCoalescenceLevel the nbr of files within the folder at the end
- * of this method.
- * @param sparkContext the SparkContext
- */
- def decreaseCoalescence(
- highCoalescenceLevelFolder: String,
- lowerCoalescenceLevelFolder: String,
- finalCoalescenceLevel: Int,
- sparkContext: SparkContext
- ): Unit =
- decreaseCoalescenceInternal(
- highCoalescenceLevelFolder,
- lowerCoalescenceLevelFolder,
- finalCoalescenceLevel,
- sparkContext,
- None)
-
- /** Decreases the nbr of partitions of a folder.
- *
- * This is often handy when the last step of your job needs to run on
- * thousands of files, but you want to store your final output on let's say
- * only 300 files.
- *
- * It's like a FileUtil.copyMerge, but the merging produces more than one
- * file.
- *
- * Be aware that this methods deletes the provided input folder.
- *
- * {{{
- * SparkHelper.decreaseCoalescence(
- * "/folder/path/with/2000/files",
- * "/produced/folder/path/with/only/300/files",
- * 300,
- * sparkContext,
- * classOf[BZip2Codec])
- * }}}
- *
- * @param highCoalescenceLevelFolder the folder which contains 10000 files
- * @param lowerCoalescenceLevelFolder the folder which will contain the same
- * data as highCoalescenceLevelFolder but spread on only 300 files (where 300
- * is the finalCoalescenceLevel parameter).
- * @param finalCoalescenceLevel the nbr of files within the folder at the end
- * of this method.
- * @param sparkContext the SparkContext
- * @param compressionCodec the type of compression to use (for instance
- * classOf[BZip2Codec] or classOf[GzipCodec]))
- */
- def decreaseCoalescence(
- highCoalescenceLevelFolder: String,
- lowerCoalescenceLevelFolder: String,
- finalCoalescenceLevel: Int,
- sparkContext: SparkContext,
- compressionCodec: Class[_ <: CompressionCodec]
- ): Unit =
- decreaseCoalescenceInternal(
- highCoalescenceLevelFolder,
- lowerCoalescenceLevelFolder,
- finalCoalescenceLevel,
- sparkContext,
- Some(compressionCodec))
-
- /** Saves as text file, but by decreasing the nbr of partitions of the output.
- *
- * Same as decreaseCoalescence, but the storage of the RDD in an intermediate
- * folder is included.
- *
- * This still makes the processing parallelized, but the output is coalesced.
- *
- * {{{
- * SparkHelper.saveAsTextFileAndCoalesce(
- * myRddToStore, "/produced/folder/path/with/only/300/files", 300)
- * }}}
- *
- * @param outputRDD the RDD to store, processed for instance on 10000 tasks
- * (which would thus be stored as 10000 files).
- * @param outputFolder the folder where will finally be stored the RDD but
- * spread on only 300 files (where 300 is the value of the
- * finalCoalescenceLevel parameter).
- * @param finalCoalescenceLevel the nbr of files within the folder at the end
- * of this method.
- */
- def saveAsTextFileAndCoalesce(
- outputRDD: RDD[String],
- outputFolder: String,
- finalCoalescenceLevel: Int
- ): Unit = {
-
- val sparkContext = outputRDD.context
-
- // We remove folders where to store data in case they already exist:
- HdfsHelper.deleteFolder(outputFolder + "_tmp")
- HdfsHelper.deleteFolder(outputFolder)
-
- // We first save the rdd with the level of coalescence used during the
- // processing. This way the processing is done with the right level of
- // tasks:
- outputRDD.saveAsTextFile(outputFolder + "_tmp")
-
- // Then we read back this tmp folder, apply the coalesce and store it back:
- decreaseCoalescenceInternal(
- outputFolder + "_tmp",
- outputFolder,
- finalCoalescenceLevel,
- sparkContext,
- None)
+ implicit class SeqRDDExtensions[T: ClassTag](val rdd: RDD[Seq[T]]) {
+
+ /** Flattens an RDD of `Seq[T]`
+ * to `RDD[T]`.
+ *
+ * {{{ sc.parallelize(Array(Seq(1, 2, 3), Nil, Seq(4))).flatten == sc.parallelize(Array(1, 2, 3, 4)) }}}
+ *
+ * @return the flat RDD as RDD.flatMap(identity)
+ * or List.flatten
+ * would have.
+ */
+ def flatten: RDD[T] = rdd.flatMap(identity)
}
- /** Saves as text file, but by decreasing the nbr of partitions of the output.
- *
- * Same as decreaseCoalescence, but the storage of the RDD in an intermediate
- * folder is included.
- *
- * This still makes the processing parallelized, but the output is coalesced.
- *
- * {{{
- * SparkHelper.saveAsTextFileAndCoalesce(
- * myRddToStore,
- * "/produced/folder/path/with/only/300/files",
- * 300,
- * classOf[BZip2Codec])
- * }}}
- *
- * @param outputRDD the RDD to store, processed for instance on 10000 tasks
- * (which would thus be stored as 10000 files).
- * @param outputFolder the folder where will finally be stored the RDD but
- * spread on only 300 files (where 300 is the value of the
- * finalCoalescenceLevel parameter).
- * @param finalCoalescenceLevel the nbr of files within the folder at the end
- * of this method.
- * @param compressionCodec the type of compression to use (for instance
- * classOf[BZip2Codec] or classOf[GzipCodec]))
- */
- def saveAsTextFileAndCoalesce(
- outputRDD: RDD[String],
- outputFolder: String,
- finalCoalescenceLevel: Int,
- compressionCodec: Class[_ <: CompressionCodec]
- ): Unit = {
+ implicit class OptionRDDExtensions[T: ClassTag](val rdd: RDD[Option[T]]) {
+
+ /** Flattens an RDD of `Option[T]`
+ * to `RDD[T]`.
+ *
+ * {{{ sc.parallelize(Array(Some(1), None, Some(2))).flatten == sc.parallelize(Array(1, 2)) }}}
+ *
+ * @return the flat RDD as RDD.flatMap(x => x)
+ * or List.flatten
+ * would have.
+ */
+ def flatten: RDD[T] = rdd.flatMap(o => o)
+ }
- val sparkContext = outputRDD.context
+ implicit class PairRDDExtensions(val rdd: RDD[(String, String)])
+ extends AnyVal {
+
+ /** Saves and repartitions a key/value RDD on files whose name is the key.
+ *
+ * Within the provided path, there will be one file per key in the given
+ * keyValueRDD. And within a file for a given key are only stored values
+ * for this key.
+ *
+ * As this internally needs to know the nbr of keys, this will have to
+ * compute it. If this nbr of keys is known beforehand, it would spare
+ * resources to use saveAsTextFileByKey(path: String, keyNbr: Int)
+ * instead.
+ *
+ * This is not scalable. This shouldn't be considered for any data flow
+ * with normal or big volumes.
+ *
+ * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path") }}}
+ *
+ * @param path the folder where will be stored key files
+ */
+ def saveAsTextFileByKey(path: String): Unit =
+ SparkHelper.saveAsTextFileByKeyInternal(rdd, path, None, None)
+
+ /** Saves and repartitions a key/value RDD on files whose name is the key.
+ *
+ * Within the provided path, there will be one file per key in the given
+ * keyValueRDD. And within a file for a given key are only stored values
+ * for this key.
+ *
+ * This is not scalable. This shouldn't be considered for any data flow
+ * with normal or big volumes.
+ *
+ * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", 12) }}}
+ *
+ * @param path the folder where will be stored key files
+ * @param keyNbr the nbr of expected keys (which is the nbr of output
+ * files)
+ */
+ def saveAsTextFileByKey(path: String, keyNbr: Int): Unit =
+ SparkHelper.saveAsTextFileByKeyInternal(rdd, path, Some(keyNbr), None)
+
+ /** Saves and repartitions a key/value RDD on files whose name is the key.
+ *
+ * Within the provided path, there will be one file per key in the given
+ * keyValueRDD. And within a file for a given key are only stored values
+ * for this key.
+ *
+ * As this internally needs to know the nbr of keys, this will have to
+ * compute it. If this nbr of keys is known beforehand, it would spare
+ * resources to use
+ * saveAsTextFileByKey(path: String, keyNbr: Int, codec: Class[_ <: CompressionCodec])
+ * instead.
+ *
+ * This is not scalable. This shouldn't be considered for any data flow
+ * with normal or big volumes.
+ *
+ * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", classOf[BZip2Codec]) }}}
+ *
+ * @param path the folder where will be stored key files
+ * @param codec the type of compression to use (for instance
+ * classOf[BZip2Codec] or classOf[GzipCodec])
+ */
+ def saveAsTextFileByKey(
+ path: String,
+ codec: Class[_ <: CompressionCodec]
+ ): Unit =
+ SparkHelper.saveAsTextFileByKeyInternal(rdd, path, None, Some(codec))
+
+ /** Saves and repartitions a key/value RDD on files whose name is the key.
+ *
+ * Within the provided path, there will be one file per key in the given
+ * keyValueRDD. And within a file for a given key are only stored values
+ * for this key.
+ *
+ * This is not scalable. This shouldn't be considered for any data flow
+ * with normal or big volumes.
+ *
+ * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", 12, classOf[BZip2Codec]) }}}
+ *
+ * @param path the folder where will be stored key files
+ * @param keyNbr the nbr of expected keys (which is the nbr of output
+ * files)
+ * @param codec the type of compression to use (for instance
+ * classOf[BZip2Codec] or classOf[GzipCodec])
+ */
+ def saveAsTextFileByKey(
+ path: String,
+ keyNbr: Int,
+ codec: Class[_ <: CompressionCodec]
+ ): Unit =
+ SparkHelper
+ .saveAsTextFileByKeyInternal(rdd, path, Some(keyNbr), Some(codec))
+ }
- // We remove folders where to store data in case they already exist:
- HdfsHelper.deleteFolder(outputFolder + "_tmp")
- HdfsHelper.deleteFolder(outputFolder)
+ implicit class SparkContextExtensions(val sc: SparkContext) extends AnyVal {
+
+ /** Equivalent to sparkContext.textFile()
+ * , but for a specific record delimiter.
+ *
+ * By default, `sparkContext.textFile()`
+ * will provide one record per line (per `'\n'`).
+ * But what if the format to read considers that one record is stored in
+ * more than one line (yml, custom format, ...)?
+ *
+ * For instance in order to read a yml file, which is a format for which a
+ * record (a single entity) is spread over several lines, you can modify
+ * the record delimiter with `"---\n"`
+ * instead of `"\n"`.
+ * Same goes when reading an xml file where a record might be spread over
+ * several lines or worse the whole xml file is one line.
+ *
+ * {{{
+ * // Let's say data we want to use with Spark looks like this (one record
+ * // is a customer, but it's spread over several lines):
+ * sparkContext.textFile()
+ * , but each record is associated with the file path it comes from.
+ *
+ * Produces an RDD[(file_name, line)]
+ * which provides a way to know from which file a given line comes from.
+ *
+ * {{{
+ * // Considering this folder:
+ * // folder/file_1.txt whose content is data1\ndata2\ndata3
+ * // folder/file_2.txt whose content is data4\ndata5
+ * // folder/folder_1/file_3.txt whose content is data6\ndata7
+ * // then:
+ * sc.textFileWithFileName("folder")
+ * // will return:
+ * RDD(
+ * ("file:/path/on/machine/folder/file_1.txt", "data1"),
+ * ("file:/path/on/machine/folder/file_1.txt", "data2"),
+ * ("file:/path/on/machine/folder/file_1.txt", "data3"),
+ * ("file:/path/on/machine/folder/file_2.txt", "data4"),
+ * ("file:/path/on/machine/folder/file_2.txt", "data5"),
+ * ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"),
+ * ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7")
+ * )
+ * }}}
+ *
+ * @param path the path of the folder (or structure of folders) to read
+ * @return the RDD of records where a record is a tuple containing the path
+ * of the file the record comes from and the record itself.
+ */
+ def textFileWithFileName(path: String): RDD[(String, String)] = {
+
+ // In order to go through the folder structure recursively:
+ sc.hadoopConfiguration
+ .set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
+
+ sc.hadoopFile(
+ path,
+ classOf[TextInputFormat2],
+ classOf[LongWritable],
+ classOf[Text],
+ sc.defaultMinPartitions
+ )
+ .asInstanceOf[HadoopRDD[LongWritable, Text]]
+ .mapPartitionsWithInputSplit {
+ case (inputSplit, iterator) =>
+ val file = inputSplit.asInstanceOf[FileSplit]
+ iterator.map(tpl => (file.getPath.toString, tpl._2.toString))
+ }
+
+ // Another way of doing this would be:
+ //
+ // import org.apache.spark.sql.functions.input_file_name
+ // import spark.implicits._
+ //
+ // spark.read
+ // .text(testFolder)
+ // .select(input_file_name, $"value")
+ // .as[(String, String)]
+ // .rdd
+ }
- // Then we read back this tmp folder, apply the coalesce and store it back:
- decreaseCoalescenceInternal(
- outputFolder + "_tmp",
- outputFolder,
- finalCoalescenceLevel,
- sparkContext,
- Some(compressionCodec))
- }
+ /** A replacement for sc.textFile()
+ * when files contain commas in their names.
+ *
+ * As `sc.textFile()`
+ * accepts several files at once only as a single string of paths joined
+ * with `,`,
+ * it can't be given files containing commas in their names.
+ *
+ * This method aims at bypassing this limitation by passing paths as a
+ * sequence of strings.
+ *
+ * {{{ sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) }}}
+ *
+ * @param paths the paths of the file(s)/folder(s) to read
+ */
+ def textFile(paths: Seq[String]): RDD[String] =
+ TextFileOverwrite.textFile(paths, sc.defaultMinPartitions, sc)
+
+ /** A replacement for sc.textFile()
+ * when files contain commas in their names.
+ *
+ * As `sc.textFile()`
+ * accepts several files at once only as a single string of paths joined
+ * with `,`,
+ * it can't be given files containing commas in their names.
+ *
+ * This method aims at bypassing this limitation by passing paths as a
+ * sequence of strings.
+ *
+ * {{{ sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) }}}
+ *
+ * @param paths the paths of the file(s)/folder(s) to read
+ * @param minPartitions the nbr of partitions in which to split the input
+ */
+ def textFile(paths: Seq[String], minPartitions: Int): RDD[String] =
+ TextFileOverwrite.textFile(paths, minPartitions, sc)
+
+ /** Decreases the nbr of partitions of a folder.
+ *
+ * This comes in handy when the last step of your job needs to run on
+ * thousands of files, but you want to store your final output on let's say
+ * only 30 files.
+ *
+ * It's like a FileUtil.copyMerge()
+ * , but the merging produces more than one file.
+ *
+ * Be aware that this method deletes the provided input folder.
+ *
+ * {{{
+ * sc.decreaseCoalescence(
+ * "/folder/path/with/2000/files",
+ * "/produced/folder/path/with/only/30/files",
+ * 30
+ * )
+ * }}}
+ *
+ * @param highCoalescenceLevelFolder the folder which contains 10000 files
+ * @param lowerCoalescenceLevelFolder the folder which will contain the same
+ * data as highCoalescenceLevelFolder but spread on only 30 files (where 30
+ * is the finalCoalesceLevel parameter).
+ * @param finalCoalesceLevel the nbr of files within the folder at the end
+ * of this method.
+ */
+ def decreaseCoalescence(
+ highCoalescenceLevelFolder: String,
+ lowerCoalescenceLevelFolder: String,
+ finalCoalesceLevel: Int
+ ): Unit =
+ SparkHelper.decreaseCoalescenceInternal(
+ highCoalescenceLevelFolder,
+ lowerCoalescenceLevelFolder,
+ finalCoalesceLevel,
+ sc,
+ None
+ )
- /** Equivalent to sparkContext.textFile(), but for each line is associated
- * with its file path.
- *
- * Produces a RDD[(file_name, line)] which provides a way to know from which
- * file a given line comes from.
- *
- * {{{
- * // Considering this folder:
- * // folder/file_1.txt whose content is data1\ndata2\ndata3
- * // folder/file_2.txt whose content is data4\ndata4
- * // folder/folder_1/file_3.txt whose content is data6\ndata7
- * // then:
- * SparkHelper.textFileWithFileName("folder", sparkContext)
- * // will return:
- * RDD(
- * ("file:/path/on/machine/folder/file_1.txt", "data1"),
- * ("file:/path/on/machine/folder/file_1.txt", "data2"),
- * ("file:/path/on/machine/folder/file_1.txt", "data3"),
- * ("file:/path/on/machine/folder/file_2.txt", "data4"),
- * ("file:/path/on/machine/folder/file_2.txt", "data5"),
- * ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"),
- * ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7")
- * )
- * }}}
- *
- * @param hdfsPath the path of the folder (or structure of folders) to read
- * @param sparkContext the SparkContext
- * @return the RDD of records where a record is a tuple containing the path
- * of the file the record comes from and the record itself.
- */
- def textFileWithFileName(
- hdfsPath: String,
- sparkContext: SparkContext
- ): RDD[(String, String)] = {
-
- // In order to go through the folder structure recursively:
- sparkContext.hadoopConfiguration
- .set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
-
- sparkContext
- .hadoopFile(
- hdfsPath,
- classOf[TextInputFormat2],
- classOf[LongWritable],
- classOf[Text],
- sparkContext.defaultMinPartitions
+ /** Decreases the nbr of partitions of a folder.
+ *
+ * This comes in handy when the last step of your job needs to run on
+ * thousands of files, but you want to store your final output on let's say
+ * only 30 files.
+ *
+ * It's like a FileUtil.copyMerge()
+ * , but the merging produces more than one file.
+ *
+ * Be aware that this method deletes the provided input folder.
+ *
+ * {{{
+ * sc.decreaseCoalescence(
+ * "/folder/path/with/2000/files",
+ * "/produced/folder/path/with/only/30/files",
+ * 30,
+ * classOf[BZip2Codec]
+ * )
+ * }}}
+ *
+ * @param highCoalescenceLevelFolder the folder which contains 10000 files
+ * @param lowerCoalescenceLevelFolder the folder which will contain the same
+ * data as highCoalescenceLevelFolder but spread on only 30 files (where 30
+ * is the finalCoalesceLevel parameter).
+ * @param finalCoalesceLevel the nbr of files within the folder at the end
+ * of this method.
+ * @param codec the type of compression to use (for instance
+ * classOf[BZip2Codec] or classOf[GzipCodec])
+ */
+ def decreaseCoalescence(
+ highCoalescenceLevelFolder: String,
+ lowerCoalescenceLevelFolder: String,
+ finalCoalesceLevel: Int,
+ codec: Class[_ <: CompressionCodec]
+ ): Unit =
+ SparkHelper.decreaseCoalescenceInternal(
+ highCoalescenceLevelFolder,
+ lowerCoalescenceLevelFolder,
+ finalCoalesceLevel,
+ sc,
+ Some(codec)
)
- .asInstanceOf[HadoopRDD[LongWritable, Text]]
- .mapPartitionsWithInputSplit {
- case (inputSplit, iterator) =>
- val file = inputSplit.asInstanceOf[FileSplit]
- iterator.map(tpl => (file.getPath.toString, tpl._2.toString))
- }
}
// Internal core:
private def saveAsSingleTextFileWithWorkingFolderInternal(
outputRDD: RDD[String],
- outputFile: String,
+ path: String,
workingFolder: String,
- compressionCodec: Option[Class[_ <: CompressionCodec]]
+ codec: Option[Class[_ <: CompressionCodec]]
): Unit = {
// We chose a random name for the temporary file:
val temporaryName = Random.alphanumeric.take(10).mkString("")
- val temporaryFile = workingFolder + "/" + temporaryName
+ val temporaryFile = s"$workingFolder/$temporaryName"
// We perform the merge into a temporary single text file:
- saveAsSingleTextFileInternal(outputRDD, temporaryFile, compressionCodec)
+ saveAsSingleTextFileInternal(outputRDD, temporaryFile, codec)
// And then only we put the resulting file in its final real location:
- HdfsHelper.moveFile(temporaryFile, outputFile, overwrite = true)
+ HdfsHelper.moveFile(temporaryFile, path, overwrite = true)
}
/** Saves RDD in exactly one file.
*
* Allows one to save an RDD as one text file, but at the same time to keep
- * the processing parallelized.
+ * the processing distributed.
*
* @param outputRDD the RDD of strings to save as text file
- * @param outputFile the path where to save the file
- * @param compression the compression codec to use (can be left to None)
+ * @param path the path where to save the file
+ * @param codec the compression codec to use (can be left to None)
*/
private def saveAsSingleTextFileInternal(
outputRDD: RDD[String],
- outputFile: String,
- compressionCodec: Option[Class[_ <: CompressionCodec]]
+ path: String,
+ codec: Option[Class[_ <: CompressionCodec]]
): Unit = {
- val fileSystem = FileSystem.get(new Configuration())
+ val hadoopConfiguration = outputRDD.sparkContext.hadoopConfiguration
+ val fileSystem = FileSystem.get(hadoopConfiguration)
// Classic saveAsTextFile in a temporary folder:
- HdfsHelper.deleteFolder(outputFile + ".tmp")
- compressionCodec match {
- case Some(compressionCodec) =>
- outputRDD.saveAsTextFile(outputFile + ".tmp", compressionCodec)
+ HdfsHelper.deleteFolder(s"$path.tmp")
+ codec match {
+ case Some(compression) =>
+ outputRDD.saveAsTextFile(s"$path.tmp", compression)
case None =>
- outputRDD.saveAsTextFile(outputFile + ".tmp")
+ outputRDD.saveAsTextFile(s"$path.tmp")
}
// Merge the folder into a single file:
- HdfsHelper.deleteFile(outputFile)
+ HdfsHelper.deleteFile(path)
FileUtil.copyMerge(
fileSystem,
- new Path(outputFile + ".tmp"),
+ new Path(s"$path.tmp"),
fileSystem,
- new Path(outputFile),
+ new Path(path),
true,
- new Configuration(),
+ hadoopConfiguration,
null)
- HdfsHelper.deleteFolder(outputFile + ".tmp")
+ HdfsHelper.deleteFolder(s"$path.tmp")
+ }
+
+ private def saveAsTextFileByKeyInternal(
+ rdd: RDD[(String, String)],
+ path: String,
+ optKeyNbr: Option[Int],
+ codec: Option[Class[_ <: CompressionCodec]]
+ ): Unit = {
+
+ HdfsHelper.deleteFolder(path)
+
+ // Whether the rdd was already cached or not (used to unpersist it if we
+ // have to get the nbr of keys):
+ val isCached = rdd.getStorageLevel.useMemory
+
+ // If the nbr of keys isn't provided, we have to get it ourselves:
+ val keyNbr = optKeyNbr.getOrElse {
+ if (!isCached)
+ rdd.cache()
+ rdd.keys.distinct.count.toInt
+ }
+
+ val prdd = rdd.partitionBy(new HashPartitioner(keyNbr))
+
+ codec match {
+ case Some(compression) =>
+ prdd.saveAsHadoopFile(
+ path,
+ classOf[String],
+ classOf[String],
+ classOf[KeyBasedOutput],
+ compression
+ )
+ case None =>
+ prdd.saveAsHadoopFile(
+ path,
+ classOf[String],
+ classOf[String],
+ classOf[KeyBasedOutput]
+ )
+ }
+
+ if (optKeyNbr.isEmpty && !isCached)
+ rdd.unpersist()
}
private def decreaseCoalescenceInternal(
highCoalescenceLevelFolder: String,
lowerCoalescenceLevelFolder: String,
- finalCoalescenceLevel: Int,
- sparkContext: SparkContext,
- compressionCodec: Option[Class[_ <: CompressionCodec]]
+ finalCoalesceLevel: Int,
+ sc: SparkContext,
+ codec: Option[Class[_ <: CompressionCodec]]
): Unit = {
- val intermediateRDD = sparkContext
+ val intermediateRDD = sc
.textFile(highCoalescenceLevelFolder)
- .coalesce(finalCoalescenceLevel)
+ .coalesce(finalCoalesceLevel)
- compressionCodec match {
- case Some(compressionCodec) =>
- intermediateRDD
- .saveAsTextFile(lowerCoalescenceLevelFolder, compressionCodec)
+ codec match {
+ case Some(compression) =>
+ intermediateRDD.saveAsTextFile(lowerCoalescenceLevelFolder, compression)
case None =>
intermediateRDD.saveAsTextFile(lowerCoalescenceLevelFolder)
}
diff --git a/src/main/scala/com/spark_helper/monitoring/Test.scala b/src/main/scala/com/spark_helper/monitoring/Test.scala
index 97942e2..80b3ad7 100644
--- a/src/main/scala/com/spark_helper/monitoring/Test.scala
+++ b/src/main/scala/com/spark_helper/monitoring/Test.scala
@@ -7,7 +7,7 @@ import java.lang.Math.abs
* This is intended to be used as parameter of Monitor.updateByKpiValidation
* and Monitor.updateByKpisValidation methods.
*
- * Some exemples of Test objects:
+ * Some examples of Test objects:
* {{{
* Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
* Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT)
@@ -19,7 +19,7 @@ import java.lang.Math.abs
*
* @constructor Creates a Test object.
*
- * Some exemples of Test objects:
+ * Some examples of Test objects:
* {{{
* Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
* Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT)
@@ -42,22 +42,22 @@ final case class Test(
kpiType: KpiType
) {
- private[spark_helper] def isSuccess(): Boolean = thresholdType match {
+ private[spark_helper] def isSuccess: Boolean = thresholdType match {
case EQUAL_TO => kpiValue == appliedThreshold
case SUPERIOR_THAN => abs(kpiValue) >= appliedThreshold
case INFERIOR_THAN => abs(kpiValue) <= appliedThreshold
}
- override def toString(): String =
+ override def toString: String =
List(
"\tKPI: " + description,
"\t\tValue: " + kpiValue.toString + kpiType.name,
"\t\tMust be " + thresholdType.name + " " + appliedThreshold.toString + kpiType.name,
- "\t\tValidated: " + isSuccess().toString
+ "\t\tValidated: " + isSuccess.toString
).mkString("\n")
}
-/** An enumeration which represents the type of threshol to use (EQUAL_TO,
+/** An enumeration which represents the type of threshold to use (EQUAL_TO,
* SUPERIOR_THAN or INFERIOR_THAN) */
sealed trait ThresholdType { def name: String }
diff --git a/src/main/scala/org/apache/spark/TextFileOverwrite.scala b/src/main/scala/org/apache/spark/TextFileOverwrite.scala
new file mode 100644
index 0000000..28935ea
--- /dev/null
+++ b/src/main/scala/org/apache/spark/TextFileOverwrite.scala
@@ -0,0 +1,54 @@
+package org.apache.spark
+
+import org.apache.spark.rdd.{RDD, HadoopRDD}
+import org.apache.spark.util.SerializableConfiguration
+import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
+import org.apache.hadoop.io.{LongWritable, Text}
+import org.apache.hadoop.fs.Path
+
+object TextFileOverwrite {
+
+ def textFile(
+ paths: Seq[String],
+ minPartitions: Int,
+ sc: SparkContext
+ ): RDD[String] = {
+
+ /* Private notes:
+ *
+ * * Compared to sc.textFile(), the only difference in the implementation is
+ * the call to FileInputFormat.setInputPaths, which takes Paths as input
+ * instead of a comma-separated String.
+ *
+ * * I use the package org.apache.spark to store this function, because
+ * SerializableConfiguration has the visibility private[spark] in spark's
+ * code base.
+ *
+ * * I would have preferred taking Seq[Path] instead of Seq[String] as the
+ * input of this method, but Path is not yet Serializable in the current
+ * version of hadoop-common used by Spark (it becomes Serializable
+ * starting with version 3 of hadoop-common).
+ *
+ * * I don't use String* (instead of Seq[String]) because, when called with
+ * a single String, the compiler could not tell which sc.textFile to use
+ * (the default one or this one).
+ */
+
+ val confBroadcast =
+ sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))
+
+ val setInputPathsFunc =
+ (jobConf: JobConf) =>
+ FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*)
+
+ new HadoopRDD(
+ sc,
+ confBroadcast,
+ Some(setInputPathsFunc),
+ classOf[TextInputFormat],
+ classOf[LongWritable],
+ classOf[Text],
+ minPartitions
+ ).map(pair => pair._2.toString)
+ }
+}
diff --git a/src/test/scala/com/spark_helper/DateHelperTest.scala b/src/test/scala/com/spark_helper/DateHelperTest.scala
index 7154831..5c68404 100644
--- a/src/test/scala/com/spark_helper/DateHelperTest.scala
+++ b/src/test/scala/com/spark_helper/DateHelperTest.scala
@@ -1,7 +1,11 @@
package com.spark_helper
+import com.spark_helper.DateHelper._
+
import org.scalatest.FunSuite
+import com.spark_helper.{DateHelper => DH}
+
/** Testing facility for date helpers.
*
* @author Xavier Guihot
@@ -23,7 +27,11 @@ class DateHelperTest extends FunSuite {
)
assert(dates === expectedDates)
- // 2: With a custom formatter:
+ // 2: Same as 1, but using the pimped String:
+ dates = "20161229" to "20170103"
+ assert(dates === expectedDates)
+
+ // 3: With a custom formatter:
dates = DateHelper.daysBetween("29Dec16", "03Jan17", "ddMMMyy")
expectedDates = List(
"29Dec16",
@@ -37,25 +45,20 @@ class DateHelperTest extends FunSuite {
}
test("Reformat date") {
- assert(
- DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") === "170327")
- assert(
- DateHelper.reformatDate("20170327", "yyyyMMdd", "MMddyy") === "032717")
+ assert(DH.reformatDate("20170327", "yyyyMMdd", "yyMMdd") === "170327")
+ assert(DH.reformatDate("20170327", "yyyyMMdd", "MMddyy") === "032717")
}
test("Next day") {
- assert(DateHelper.nextDay("20170310") === "20170311")
- assert(DateHelper.nextDay("170310", "yyMMdd") === "170311")
- assert(
- DateHelper.nextDay("20170310_0000", "yyyyMMdd_HHmm") === "20170311_0000")
+ assert(DH.nextDay("20170310") === "20170311")
+ assert(DH.nextDay("170310", "yyMMdd") === "170311")
+ assert(DH.nextDay("20170310_0000", "yyyyMMdd_HHmm") === "20170311_0000")
}
test("Previous day") {
- assert(DateHelper.previousDay("20170310") === "20170309")
- assert(DateHelper.previousDay("170310", "yyMMdd") === "170309")
- assert(
- DateHelper
- .previousDay("20170310_0000", "yyyyMMdd_HHmm") === "20170309_0000")
+ assert(DH.previousDay("20170310") === "20170309")
+ assert(DH.previousDay("170310", "yyMMdd") === "170309")
+ assert(DH.previousDay("20170310_0000", "yyyyMMdd_HHmm") === "20170309_0000")
}
test("Nbr of days between two dates") {
@@ -78,7 +81,7 @@ class DateHelperTest extends FunSuite {
assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") === "170305")
}
- test("Date it will be N days affter date") {
+ test("Date it will be N days after date") {
assert(DateHelper.nDaysAfterDate(3, "20170307") === "20170310")
assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") === "170310")
}
@@ -88,6 +91,7 @@ class DateHelperTest extends FunSuite {
}
test("Date versus provided format") {
+
assert(DateHelper.isDateCompliantWithFormat("20170302", "yyyyMMdd"))
assert(!DateHelper.isDateCompliantWithFormat("20170333", "yyyyMMdd"))
assert(DateHelper.isDateCompliantWithFormat("20170228", "yyyyMMdd"))
@@ -96,5 +100,14 @@ class DateHelperTest extends FunSuite {
assert(!DateHelper.isDateCompliantWithFormat("", "yyyyMMdd"))
assert(!DateHelper.isDateCompliantWithFormat("a", "yyyyMMdd"))
assert(!DateHelper.isDateCompliantWithFormat("24JAN17", "yyyyMMdd"))
+
+ assert("20170302".isCompliantWith("yyyyMMdd"))
+ assert(!"20170333".isCompliantWith("yyyyMMdd"))
+ assert("20170228".isCompliantWith("yyyyMMdd"))
+ assert(!"20170229".isCompliantWith("yyyyMMdd"))
+ assert(!"170228".isCompliantWith("yyyyMMdd"))
+ assert(!"".isCompliantWith("yyyyMMdd"))
+ assert(!"a".isCompliantWith("yyyyMMdd"))
+ assert(!"24JAN17".isCompliantWith("yyyyMMdd"))
}
}
diff --git a/src/test/scala/com/spark_helper/HdfsHelperTest.scala b/src/test/scala/com/spark_helper/HdfsHelperTest.scala
index 1935e89..b64306d 100644
--- a/src/test/scala/com/spark_helper/HdfsHelperTest.scala
+++ b/src/test/scala/com/spark_helper/HdfsHelperTest.scala
@@ -1,5 +1,9 @@
package com.spark_helper
+import com.spark_helper.HdfsHelper._
+
+import org.apache.hadoop.io.compress.GzipCodec
+
import com.holdenkarau.spark.testing.SharedSparkContext
import org.scalatest.FunSuite
@@ -11,60 +15,67 @@ import org.scalatest.FunSuite
*/
class HdfsHelperTest extends FunSuite with SharedSparkContext {
+ val resourceFolder = "src/test/resources"
+ val testFolder = s"$resourceFolder/folder"
+
test("Delete file/folder") {
+ val filePath = s"$testFolder/file.txt"
+
// Let's try to delete a file:
- HdfsHelper.writeToHdfsFile("", "src/test/resources/file_to_delete.txt")
+ HdfsHelper.createEmptyHdfsFile(filePath)
// 1: Let's try to delete it with the deleteFolder method:
var messageThrown = intercept[IllegalArgumentException] {
- HdfsHelper.deleteFolder("src/test/resources/file_to_delete.txt")
+ HdfsHelper.deleteFolder(filePath)
}
var expectedMessage =
"requirement failed: to delete a file, prefer using the " +
"deleteFile() method."
assert(messageThrown.getMessage === expectedMessage)
- assert(HdfsHelper.fileExists("src/test/resources/file_to_delete.txt"))
+ assert(HdfsHelper.fileExists(filePath))
// 2: Let's delete it with the deleteFile method:
- HdfsHelper.deleteFile("src/test/resources/file_to_delete.txt")
- assert(!HdfsHelper.fileExists("src/test/resources/file_to_delete.txt"))
+ HdfsHelper.deleteFile(filePath)
+ assert(!HdfsHelper.fileExists(filePath))
// Let's try to delete a folder:
- HdfsHelper
- .writeToHdfsFile("", "src/test/resources/folder_to_delete/file.txt")
+ HdfsHelper.createEmptyHdfsFile(s"$testFolder/file.txt")
// 3: Let's try to delete it with the deleteFile method:
messageThrown = intercept[IllegalArgumentException] {
- HdfsHelper.deleteFile("src/test/resources/folder_to_delete")
+ HdfsHelper.deleteFile(testFolder)
}
expectedMessage =
"requirement failed: to delete a folder, prefer using the " +
"deleteFolder() method."
assert(messageThrown.getMessage === expectedMessage)
- assert(HdfsHelper.folderExists("src/test/resources/folder_to_delete"))
+ assert(HdfsHelper.folderExists(testFolder))
// 4: Let's delete it with the deleteFolder method:
- HdfsHelper.deleteFolder("src/test/resources/folder_to_delete")
- assert(!HdfsHelper.folderExists("src/test/resources/folder_to_delete"))
+ HdfsHelper.deleteFolder(testFolder)
+ assert(!HdfsHelper.folderExists(testFolder))
}
test("File/folder exists") {
- HdfsHelper.deleteFile("src/test/resources/file_to_check.txt")
- HdfsHelper.deleteFolder("src/test/resources/folder_to_check")
+ val folderPath = s"$resourceFolder/folder"
+ val filePath = s"$folderPath/file.txt"
+
+ HdfsHelper.deleteFile(filePath)
+ HdfsHelper.deleteFolder(folderPath)
// Let's try to check if a file exists:
- assert(!HdfsHelper.fileExists("src/test/resources/file_to_check.txt"))
+ assert(!HdfsHelper.fileExists(filePath))
- HdfsHelper.writeToHdfsFile("", "src/test/resources/file_to_check.txt")
+ HdfsHelper.createEmptyHdfsFile(filePath)
// 1: Let's try to check it exists with the folderExists method:
var messageThrown = intercept[IllegalArgumentException] {
- HdfsHelper.folderExists("src/test/resources/file_to_check.txt")
+ HdfsHelper.folderExists(filePath)
}
var expectedMessage =
"requirement failed: to check if a file exists, prefer using the " +
@@ -72,18 +83,18 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext {
assert(messageThrown.getMessage === expectedMessage)
// 2: Let's try to check it exists with the fileExists method:
- assert(HdfsHelper.fileExists("src/test/resources/file_to_check.txt"))
+ assert(HdfsHelper.fileExists(filePath))
// Let's try to check if a folder exists:
- assert(!HdfsHelper.folderExists("src/test/resources/folder_to_check"))
+ HdfsHelper.deleteFolder(folderPath)
+ assert(!HdfsHelper.folderExists(folderPath))
- HdfsHelper
- .writeToHdfsFile("", "src/test/resources/folder_to_check/file.txt")
+ HdfsHelper.createEmptyHdfsFile(filePath)
// 3: Let's try to check it exists with the fileExists method:
messageThrown = intercept[IllegalArgumentException] {
- HdfsHelper.fileExists("src/test/resources/folder_to_check")
+ HdfsHelper.fileExists(folderPath)
}
expectedMessage =
"requirement failed: to check if a folder exists, prefer using " +
@@ -91,377 +102,325 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext {
assert(messageThrown.getMessage === expectedMessage)
// 2: Let's try to check it exists with the folderExists method:
- assert(HdfsHelper.folderExists("src/test/resources/folder_to_check"))
+ assert(HdfsHelper.folderExists(folderPath))
- HdfsHelper.deleteFile("src/test/resources/file_to_check.txt")
- HdfsHelper.deleteFolder("src/test/resources/folder_to_check")
+ HdfsHelper.deleteFile(filePath)
+ HdfsHelper.deleteFolder(folderPath)
}
test("Create an empty file on hdfs") {
- HdfsHelper.deleteFile("src/test/resources/empty_file.token")
+ val filePath = s"$testFolder/empty_file.token"
- HdfsHelper.createEmptyHdfsFile("src/test/resources/empty_file.token")
+ HdfsHelper.deleteFile(filePath)
- assert(HdfsHelper.fileExists("src/test/resources/empty_file.token"))
+ HdfsHelper.createEmptyHdfsFile(filePath)
- val tokenContent = sc
- .textFile("src/test/resources/empty_file.token")
- .collect()
- .sorted
- .mkString("\n")
+ assert(HdfsHelper.fileExists(filePath))
+ val tokenContent = sc.textFile(filePath).collect().sorted.mkString("\n")
assert(tokenContent === "")
- HdfsHelper.deleteFile("src/test/resources/empty_file.token")
+ HdfsHelper.deleteFile(filePath)
}
test(
"Save text in HDFS file with the fileSystem API instead of the Spark API") {
- // 1: Stores using a "\n"-joined string:
+ val filePath = s"$testFolder/small_file.txt"
- HdfsHelper.deleteFile("src/test/resources/folder/small_file.txt")
+ HdfsHelper.deleteFolder(testFolder)
- val contentToStore = "Hello World\nWhatever"
+ // 1: Stores using a "\n"-joined string:
- HdfsHelper.writeToHdfsFile(
- contentToStore,
- "src/test/resources/folder/small_file.txt")
+ val contentToStore = "Hello World\nWhatever"
- assert(HdfsHelper.fileExists("src/test/resources/folder/small_file.txt"))
+ HdfsHelper.writeToHdfsFile(contentToStore, filePath)
- var storedContent = sc
- .textFile("src/test/resources/folder/small_file.txt")
- .collect()
- .sorted
- .mkString("\n")
+ assert(HdfsHelper.fileExists(filePath))
+ var storedContent = sc.textFile(filePath).collect().sorted.mkString("\n")
assert(storedContent === contentToStore)
- HdfsHelper.deleteFolder("src/test/resources/folder")
+ HdfsHelper.deleteFolder(testFolder)
// 2: Stores using a list of strings to be "\n"-joined:
- HdfsHelper.deleteFile("src/test/resources/folder/small_file.txt")
-
val listToStore = List("Hello World", "Whatever")
+ HdfsHelper.writeToHdfsFile(listToStore, filePath)
- HdfsHelper
- .writeToHdfsFile(listToStore, "src/test/resources/folder/small_file.txt")
+ assert(HdfsHelper.fileExists(filePath))
- assert(HdfsHelper.fileExists("src/test/resources/folder/small_file.txt"))
+ storedContent = sc.textFile(filePath).collect().sorted.mkString("\n")
+ assert(storedContent === listToStore.mkString("\n"))
- storedContent = sc
- .textFile("src/test/resources/folder/small_file.txt")
- .collect()
- .sorted
- .mkString("\n")
+ HdfsHelper.deleteFolder(testFolder)
- assert(storedContent === listToStore.mkString("\n"))
+ // 3: Using the pimped Seq/String:
- HdfsHelper.deleteFolder("src/test/resources/folder")
+ val seqToStore = Seq("Hello World", "Whatever")
+ seqToStore.writeToHdfs(filePath)
+ assert(HdfsHelper.fileExists(filePath))
+ storedContent = sc.textFile(filePath).collect().sorted.mkString("\n")
+ assert(storedContent === contentToStore)
+ HdfsHelper.deleteFolder(testFolder)
+
+ listToStore.writeToHdfs(filePath)
+ assert(HdfsHelper.fileExists(filePath))
+ storedContent = sc.textFile(filePath).collect().sorted.mkString("\n")
+ assert(storedContent === contentToStore)
+ HdfsHelper.deleteFolder(testFolder)
+
+ contentToStore.writeToHdfs(filePath)
+ assert(HdfsHelper.fileExists(filePath))
+ storedContent = sc.textFile(filePath).collect().sorted.mkString("\n")
+ assert(storedContent === contentToStore)
+ HdfsHelper.deleteFolder(testFolder)
}
test("List file names in Hdfs folder") {
- HdfsHelper.writeToHdfsFile("", "src/test/resources/folder_1/file_1.txt")
- HdfsHelper.writeToHdfsFile("", "src/test/resources/folder_1/file_2.csv")
- HdfsHelper
- .writeToHdfsFile("", "src/test/resources/folder_1/folder_2/file_3.txt")
+ val folder1 = s"$resourceFolder/folder_1"
+
+ HdfsHelper.createEmptyHdfsFile(s"$folder1/file_1.txt")
+ HdfsHelper.createEmptyHdfsFile(s"$folder1/file_2.csv")
+ HdfsHelper.createEmptyHdfsFile(s"$folder1/folder_2/file_3.txt")
// 1: Not recursive, names only:
- var fileNames =
- HdfsHelper.listFileNamesInFolder("src/test/resources/folder_1")
+ var fileNames = HdfsHelper.listFileNamesInFolder(folder1)
var expectedFileNames = List("file_1.txt", "file_2.csv")
assert(fileNames === expectedFileNames)
// 2: Not recursive, full paths:
- fileNames = HdfsHelper
- .listFileNamesInFolder("src/test/resources/folder_1", onlyName = false)
- expectedFileNames = List(
- "src/test/resources/folder_1/file_1.txt",
- "src/test/resources/folder_1/file_2.csv"
- )
+ fileNames = HdfsHelper.listFileNamesInFolder(folder1, onlyName = false)
+ expectedFileNames = List(s"$folder1/file_1.txt", s"$folder1/file_2.csv")
assert(fileNames === expectedFileNames)
// 3: Recursive, names only:
- fileNames = HdfsHelper
- .listFileNamesInFolder("src/test/resources/folder_1", recursive = true)
+ fileNames = HdfsHelper.listFileNamesInFolder(folder1, recursive = true)
expectedFileNames = List("file_1.txt", "file_2.csv", "file_3.txt")
assert(fileNames === expectedFileNames)
// 4: Recursive, full paths:
- fileNames = HdfsHelper.listFileNamesInFolder(
- "src/test/resources/folder_1",
- recursive = true,
- onlyName = false)
+ fileNames = HdfsHelper
+ .listFileNamesInFolder(folder1, recursive = true, onlyName = false)
expectedFileNames = List(
- "src/test/resources/folder_1/file_1.txt",
- "src/test/resources/folder_1/file_2.csv",
- "src/test/resources/folder_1/folder_2/file_3.txt"
+ s"$folder1/file_1.txt",
+ s"$folder1/file_2.csv",
+ s"$folder1/folder_2/file_3.txt"
)
assert(fileNames === expectedFileNames)
- HdfsHelper.deleteFolder("src/test/resources/folder_1")
+ HdfsHelper.deleteFolder(folder1)
}
test("List folder names in Hdfs folder") {
- HdfsHelper.writeToHdfsFile("", "src/test/resources/folder_1/file_1.txt")
- HdfsHelper
- .writeToHdfsFile("", "src/test/resources/folder_1/folder_2/file_2.txt")
- HdfsHelper
- .writeToHdfsFile("", "src/test/resources/folder_1/folder_3/file_3.txt")
+ val folder1 = s"$resourceFolder/folder_1"
- val folderNames = HdfsHelper.listFolderNamesInFolder(
- "src/test/resources/folder_1"
- )
+ HdfsHelper.createEmptyHdfsFile(s"$folder1/file_1.txt")
+ HdfsHelper.createEmptyHdfsFile(s"$folder1/folder_2/file_2.txt")
+ HdfsHelper.createEmptyHdfsFile(s"$folder1/folder_3/file_3.txt")
+
+ val folderNames = HdfsHelper.listFolderNamesInFolder(folder1)
val expectedFolderNames = List("folder_2", "folder_3")
assert(folderNames === expectedFolderNames)
- HdfsHelper.deleteFolder("src/test/resources/folder_1")
+ HdfsHelper.deleteFolder(folder1)
}
test("Move file") {
+ val filePath = s"$testFolder/some_file.txt"
+ val renamedPath = s"$testFolder/renamed_file.txt"
+
// Let's remove possible previous stuff:
- HdfsHelper.deleteFile("src/test/resources/some_file.txt")
- HdfsHelper.deleteFile("src/test/resources/renamed_file.txt")
+ HdfsHelper.deleteFolder(testFolder)
// Let's create the file to rename:
- HdfsHelper.writeToHdfsFile("whatever", "src/test/resources/some_file.txt")
+ HdfsHelper.writeToHdfsFile("whatever", filePath)
// 1: Let's try to move the file on a file which already exists without
// the overwrite option:
- assert(HdfsHelper.fileExists("src/test/resources/some_file.txt"))
- assert(!HdfsHelper.fileExists("src/test/resources/renamed_file.txt"))
+ assert(HdfsHelper.fileExists(filePath))
+ assert(!HdfsHelper.fileExists(renamedPath))
// Let's create the existing file where we want to move our file:
- HdfsHelper.writeToHdfsFile("", "src/test/resources/renamed_file.txt")
+ HdfsHelper.createEmptyHdfsFile(renamedPath)
// Let's rename the file to the path where a file already exists:
val ioExceptionThrown = intercept[IllegalArgumentException] {
- HdfsHelper.moveFile(
- "src/test/resources/some_file.txt",
- "src/test/resources/renamed_file.txt")
+ HdfsHelper.moveFile(filePath, renamedPath)
}
var expectedMessage =
"requirement failed: overwrite option set to false, but a file " +
- "already exists at target location src/test/resources/renamed_file.txt"
+ "already exists at target location " +
+ "src/test/resources/folder/renamed_file.txt"
assert(ioExceptionThrown.getMessage === expectedMessage)
- assert(HdfsHelper.fileExists("src/test/resources/some_file.txt"))
- assert(HdfsHelper.fileExists("src/test/resources/renamed_file.txt"))
+ assert(HdfsHelper.fileExists(filePath))
+ assert(HdfsHelper.fileExists(renamedPath))
- HdfsHelper.deleteFile("src/test/resources/renamed_file.txt")
+ HdfsHelper.deleteFile(renamedPath)
// 2: Let's fail to move the file with the moveFolder() method:
- assert(HdfsHelper.fileExists("src/test/resources/some_file.txt"))
- assert(!HdfsHelper.fileExists("src/test/resources/renamed_file.txt"))
+ assert(HdfsHelper.fileExists(filePath))
+ assert(!HdfsHelper.fileExists(renamedPath))
// Let's rename the file:
val illegalArgExceptionThrown = intercept[IllegalArgumentException] {
- HdfsHelper.moveFolder(
- "src/test/resources/some_file.txt",
- "src/test/resources/renamed_file.txt")
+ HdfsHelper.moveFolder(filePath, renamedPath)
}
expectedMessage =
"requirement failed: to move a file, prefer using the " +
"moveFile() method."
assert(illegalArgExceptionThrown.getMessage === expectedMessage)
- assert(HdfsHelper.fileExists("src/test/resources/some_file.txt"))
- assert(!HdfsHelper.fileExists("src/test/resources/renamed_file.txt"))
+ assert(HdfsHelper.fileExists(filePath))
+ assert(!HdfsHelper.fileExists(renamedPath))
- // 3: Let's successfuly move the file with the moveFile() method:
+ // 3: Let's successfully move the file with the moveFile() method:
// Let's rename the file:
- HdfsHelper.moveFile(
- "src/test/resources/some_file.txt",
- "src/test/resources/renamed_file.txt")
+ HdfsHelper.moveFile(filePath, renamedPath)
- assert(!HdfsHelper.fileExists("src/test/resources/some_file.txt"))
- assert(HdfsHelper.fileExists("src/test/resources/renamed_file.txt"))
-
- val newContent = sc.textFile("src/test/resources/renamed_file.txt").collect
+ assert(!HdfsHelper.fileExists(filePath))
+ assert(HdfsHelper.fileExists(renamedPath))
+ val newContent = sc.textFile(renamedPath).collect
assert(Array("whatever") === newContent)
- HdfsHelper.deleteFile("src/test/resources/renamed_file.txt")
+ HdfsHelper.deleteFolder(testFolder)
}
test("Move folder") {
+ val folderToMove = s"$testFolder/folder_to_move"
+ val renamedFolder = s"$testFolder/renamed_folder"
+
// Let's remove possible previous stuff:
- HdfsHelper.deleteFolder("src/test/resources/some_folder_to_move")
- HdfsHelper.deleteFolder("src/test/resources/renamed_folder")
+ HdfsHelper.deleteFolder(testFolder)
// Let's create the folder to rename:
- HdfsHelper.writeToHdfsFile(
- "whatever",
- "src/test/resources/some_folder_to_move/file_1.txt")
- HdfsHelper.writeToHdfsFile(
- "something",
- "src/test/resources/some_folder_to_move/file_2.txt")
+ HdfsHelper.writeToHdfsFile("whatever", s"$folderToMove/file_1.txt")
+ HdfsHelper.writeToHdfsFile("something", s"$folderToMove/file_2.txt")
// 1: Let's fail to move the folder with the moveFile() method:
- assert(
- HdfsHelper.fileExists(
- "src/test/resources/some_folder_to_move/file_1.txt"))
- assert(
- HdfsHelper.fileExists(
- "src/test/resources/some_folder_to_move/file_2.txt"))
- assert(!HdfsHelper.folderExists("src/test/resources/renamed_folder"))
+ assert(HdfsHelper.fileExists(s"$folderToMove/file_1.txt"))
+ assert(HdfsHelper.fileExists(s"$folderToMove/file_2.txt"))
+ assert(!HdfsHelper.folderExists(renamedFolder))
// Let's rename the folder:
val messageThrown = intercept[IllegalArgumentException] {
- HdfsHelper.moveFile(
- "src/test/resources/some_folder_to_move",
- "src/test/resources/renamed_folder")
+ HdfsHelper.moveFile(folderToMove, renamedFolder)
}
val expectedMessage =
"requirement failed: to move a folder, prefer using the " +
"moveFolder() method."
assert(messageThrown.getMessage === expectedMessage)
- assert(
- HdfsHelper.fileExists(
- "src/test/resources/some_folder_to_move/file_1.txt"))
- assert(
- HdfsHelper.fileExists(
- "src/test/resources/some_folder_to_move/file_2.txt"))
- assert(!HdfsHelper.folderExists("src/test/resources/renamed_folder"))
+ assert(HdfsHelper.fileExists(s"$folderToMove/file_1.txt"))
+ assert(HdfsHelper.fileExists(s"$folderToMove/file_2.txt"))
+ assert(!HdfsHelper.folderExists(renamedFolder))
- // 2: Let's successfuly move the folder with the moveFolder() method:
+ // 2: Let's successfully move the folder with the moveFolder() method:
// Let's rename the folder:
- HdfsHelper.moveFolder(
- "src/test/resources/some_folder_to_move",
- "src/test/resources/renamed_folder")
+ HdfsHelper.moveFolder(folderToMove, renamedFolder)
- assert(!HdfsHelper.folderExists("src/test/resources/some_folder_to_move"))
- assert(
- HdfsHelper.fileExists("src/test/resources/renamed_folder/file_1.txt"))
- assert(
- HdfsHelper.fileExists("src/test/resources/renamed_folder/file_2.txt"))
-
- val newContent =
- sc.textFile("src/test/resources/renamed_folder").collect().sorted
+ assert(!HdfsHelper.folderExists(folderToMove))
+ assert(HdfsHelper.fileExists(s"$renamedFolder/file_1.txt"))
+ assert(HdfsHelper.fileExists(s"$renamedFolder/file_2.txt"))
+ val newContent = sc.textFile(renamedFolder).collect().sorted
assert(newContent === Array("something", "whatever"))
- HdfsHelper.deleteFolder("src/test/resources/renamed_folder")
+ HdfsHelper.deleteFolder(testFolder)
}
test("Append header and footer to file") {
+ val filePath = s"$testFolder/header_footer_file.txt"
+ val tmpFolder = s"$testFolder/header_footer_tmp"
+
// 1: Without the tmp/working folder:
- HdfsHelper.deleteFile("src/test/resources/header_footer_file.txt")
+ HdfsHelper.deleteFolder(testFolder)
// Let's create the file for which to add header and footer:
- HdfsHelper.writeToHdfsFile(
- "whatever\nsomething else\n",
- "src/test/resources/header_footer_file.txt")
+ HdfsHelper.writeToHdfsFile("whatever\nsomething else\n", filePath)
- HdfsHelper.appendHeaderAndFooter(
- "src/test/resources/header_footer_file.txt",
- "my_header",
- "my_footer")
+ HdfsHelper.appendHeaderAndFooter(filePath, "my_header", "my_footer")
- var newContent = sc
- .textFile("src/test/resources/header_footer_file.txt")
- .collect
- .mkString("\n")
+ var newContent = sc.textFile(filePath).collect.mkString("\n")
- var expectedNewContent = (
+ var expectedNewContent =
"my_header\n" +
"whatever\n" +
"something else\n" +
"my_footer"
- )
assert(newContent === expectedNewContent)
- HdfsHelper.deleteFile("src/test/resources/header_footer_file.txt")
+ HdfsHelper.deleteFile(filePath)
// 2: With the tmp/working folder:
// Let's create the file for which to add header and footer:
- HdfsHelper.writeToHdfsFile(
- "whatever\nsomething else\n",
- "src/test/resources/header_footer_file.txt")
+ HdfsHelper.writeToHdfsFile("whatever\nsomething else\n", filePath)
- HdfsHelper.appendHeaderAndFooter(
- "src/test/resources/header_footer_file.txt",
- "my_header",
- "my_footer",
- workingFolderPath = "src/test/resources/header_footer_tmp")
+ HdfsHelper
+ .appendHeaderAndFooter(filePath, "my_header", "my_footer", tmpFolder)
- assert(HdfsHelper.folderExists("src/test/resources/header_footer_tmp"))
- assert(
- !HdfsHelper.fileExists("src/test/resources/header_footer_tmp/xml.tmp"))
+ assert(HdfsHelper.folderExists(tmpFolder))
+ assert(!HdfsHelper.fileExists(s"$tmpFolder/xml.tmp"))
- newContent = sc
- .textFile("src/test/resources/header_footer_file.txt")
- .collect
- .mkString("\n")
+ newContent = sc.textFile(filePath).collect.mkString("\n")
- expectedNewContent = (
+ expectedNewContent =
"my_header\n" +
"whatever\n" +
"something else\n" +
"my_footer"
- )
assert(newContent === expectedNewContent)
- HdfsHelper.deleteFile("src/test/resources/header_footer_file.txt")
- HdfsHelper.deleteFolder("src/test/resources/header_footer_tmp")
+ HdfsHelper.deleteFolder(testFolder)
}
test("Validate Xml Hdfs file with Xsd") {
+ val xmlPath = s"$testFolder/file.xml"
+
// 1: Valid xml:
- HdfsHelper.deleteFile("src/test/resources/xml_file.txt")
+ HdfsHelper.deleteFolder(testFolder)
HdfsHelper.writeToHdfsFile(
"