From c6c3f8cd2b0b5b0b03a3e046671450809ea013af Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Thu, 24 May 2018 20:08:28 +0100 Subject: [PATCH 01/25] Create implicit class to attach saveAsSingleTextFile to RDDs --- .../scala/com/spark_helper/SparkHelper.scala | 79 +++++++++++++++++++ .../com/spark_helper/SparkHelperTest.scala | 77 +++++++++++------- 2 files changed, 126 insertions(+), 30 deletions(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 3f612eb..dcc7a8e 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -42,6 +42,85 @@ import scala.util.Random */ object SparkHelper extends Serializable { + implicit class RDDExtensions(val rdd: RDD[String]) extends AnyVal { + + /** Saves an RDD in exactly one file. + * + * Allows one to save an RDD in one file, while keeping the processing + * parallelized. + * + * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt") }}} + * + * @param outputFile the path of the produced file + */ + def saveAsSingleTextFile(outputFile: String): Unit = + SparkHelper.saveAsSingleTextFile(rdd, outputFile) + + /** Saves an RDD in exactly one file. + * + * Allows one to save an RDD in one file, while keeping the processing + * parallelized. + * + * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", classOf[BZip2Codec]) }}} + * + * @param outputFile the path of the produced file + * @param compressionCodec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def saveAsSingleTextFile( + outputFile: String, + compressionCodec: Class[_ <: CompressionCodec] + ): Unit = + SparkHelper.saveAsSingleTextFile(rdd, outputFile, compressionCodec) + + /** Saves an RDD in exactly one file. + * + * Allows one to save an RDD in one file, while keeping the processing + * parallelized. + * + * This variant of saveAsSingleTextFile performs the storage in a temporary + * folder instead of directly in the final output folder. This way the + * risks of having corrupted files in the real output folder due to cluster + * interruptions is minimized. + * + * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path") }}} + * + * @param outputFile the path of the produced file + * @param workingFolder the path where file manipulations will temporarily + * happen. + */ + def saveAsSingleTextFile(outputFile: String, workingFolder: String): Unit = + SparkHelper.saveAsSingleTextFile(rdd, outputFile, workingFolder) + + /** Saves an RDD in exactly one file. + * + * Allows one to save an RDD in one file, while keeping the processing + * parallelized. + * + * This variant of saveAsSingleTextFile performs the storage in a temporary + * folder instead of directly in the final output folder. This way the risks + * of having corrupted files in the real output folder due to cluster + * interruptions is minimized. + * + * {{{ + * rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path", classOf[BZip2Codec]) + * }}} + * + * @param outputFile the path of the produced file + * @param workingFolder the path where file manipulations will temporarily + * happen. 
+ * @param compressionCodec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def saveAsSingleTextFile( + outputFile: String, + workingFolder: String, + compressionCodec: Class[_ <: CompressionCodec] + ): Unit = + SparkHelper + .saveAsSingleTextFile(rdd, outputFile, workingFolder, compressionCodec) + } + /** Saves an RDD in exactly one file. * * Allows one to save an RDD in one file, while keeping the processing diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index b3578df..3e60a7a 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -1,5 +1,9 @@ package com.spark_helper +import com.spark_helper.SparkHelper.RDDExtensions + +import org.apache.hadoop.io.compress.GzipCodec + import com.holdenkarau.spark.testing.{SharedSparkContext, RDDComparisons} import org.scalatest.FunSuite @@ -14,54 +18,67 @@ class SparkHelperTest with SharedSparkContext with RDDComparisons { + val resourceFolder = "src/test/resources" + test("Save as single text file") { + val testFolder = s"$resourceFolder/folder" + val singleTextFilePath = s"$testFolder/single_text_file.txt" + val tmpFolder = s"$resourceFolder/tmp" + + HdfsHelper.deleteFolder(testFolder) + HdfsHelper.deleteFolder(tmpFolder) + + val rddToStore = + sc.parallelize(Array("data_a", "data_b", "data_c")).repartition(3) + // 1: Without an intermediate working dir: - var repartitionedDataToStore = sc - .parallelize(Array("data_a", "data_b", "data_c")) - .repartition(3) + SparkHelper.saveAsSingleTextFile(rddToStore, singleTextFilePath) - HdfsHelper.deleteFile("src/test/resources/single_text_file.txt") - SparkHelper.saveAsSingleTextFile( - repartitionedDataToStore, - "src/test/resources/single_text_file.txt") + var singleFileStoredData = sc.textFile(singleTextFilePath).collect().sorted - var singleFileStoredData = sc - .textFile("src/test/resources/single_text_file.txt") - .collect() - .sorted + assert(singleFileStoredData === Array("data_a", "data_b", "data_c")) + + HdfsHelper.deleteFolder(testFolder) + + // 1-bis: same, but using the implicit RDD extension: + + rddToStore.saveAsSingleTextFile(singleTextFilePath) + + singleFileStoredData = sc.textFile(singleTextFilePath).collect().sorted assert(singleFileStoredData === Array("data_a", "data_b", "data_c")) - HdfsHelper.deleteFile("src/test/resources/single_text_file.txt") + HdfsHelper.deleteFolder(testFolder) // 2: With an intermediate working dir: // Notice as well that we test by moving the single file in a folder // which doesn't exists. 
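// (A sketch of the mechanism exercised here, based on
// saveAsSingleTextFileWithWorkingFolderInternal shown later in this patch
// series: the RDD is first merged into a randomly named temporary file under
// the working folder, and only then moved to its final destination, roughly:
//
//   val tmpFile = s"$tmpFolder/${Random.alphanumeric.take(10).mkString("")}"
//   saveAsSingleTextFileInternal(rddToStore, tmpFile, None)
//   HdfsHelper.moveFile(tmpFile, singleTextFilePath, overwrite = true)
//
// which is why the destination folder doesn't need to exist beforehand.)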
- repartitionedDataToStore = sc - .parallelize(Array("data_a", "data_b", "data_c")) - .repartition(3) - - HdfsHelper.deleteFile("src/test/resources/folder/single_text_file.txt") - HdfsHelper.deleteFolder("src/test/resources/folder") SparkHelper.saveAsSingleTextFile( - repartitionedDataToStore, - "src/test/resources/folder/single_text_file.txt", - workingFolder = "src/test/resources/tmp") - assert( - HdfsHelper.fileExists("src/test/resources/folder/single_text_file.txt")) - - singleFileStoredData = sc - .textFile("src/test/resources/folder/single_text_file.txt") - .collect() - .sorted + rddToStore, + singleTextFilePath, + workingFolder = tmpFolder) + + singleFileStoredData = sc.textFile(singleTextFilePath).collect().sorted + + assert(singleFileStoredData === Array("data_a", "data_b", "data_c")) + + HdfsHelper.deleteFolder(testFolder) + HdfsHelper.deleteFolder(tmpFolder) + + // 3: With a compression codec: + + rddToStore + .saveAsSingleTextFile(s"$singleTextFilePath.gz", classOf[GzipCodec]) + + singleFileStoredData = + sc.textFile(s"$singleTextFilePath.gz").collect().sorted assert(singleFileStoredData === Array("data_a", "data_b", "data_c")) - HdfsHelper.deleteFolder("src/test/resources/folder") - HdfsHelper.deleteFolder("src/test/resources/tmp") + HdfsHelper.deleteFolder(testFolder) } test("Read text file with specific record delimiter") { From a23924667af312246d1b474d942aa730e4fe71be Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Thu, 24 May 2018 20:18:00 +0100 Subject: [PATCH 02/25] Remove previous saveAsSingleTextFile by the ones implicitely attached to the SparkContext --- .../scala/com/spark_helper/SparkHelper.scala | 123 +++--------------- .../com/spark_helper/SparkHelperTest.scala | 39 +++--- 2 files changed, 31 insertions(+), 131 deletions(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index dcc7a8e..03d2764 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -54,7 +54,7 @@ object SparkHelper extends Serializable { * @param outputFile the path of the produced file */ def saveAsSingleTextFile(outputFile: String): Unit = - SparkHelper.saveAsSingleTextFile(rdd, outputFile) + SparkHelper.saveAsSingleTextFileInternal(rdd, outputFile, None) /** Saves an RDD in exactly one file. * @@ -71,7 +71,8 @@ object SparkHelper extends Serializable { outputFile: String, compressionCodec: Class[_ <: CompressionCodec] ): Unit = - SparkHelper.saveAsSingleTextFile(rdd, outputFile, compressionCodec) + SparkHelper + .saveAsSingleTextFileInternal(rdd, outputFile, Some(compressionCodec)) /** Saves an RDD in exactly one file. * @@ -90,7 +91,12 @@ object SparkHelper extends Serializable { * happen. */ def saveAsSingleTextFile(outputFile: String, workingFolder: String): Unit = - SparkHelper.saveAsSingleTextFile(rdd, outputFile, workingFolder) + SparkHelper.saveAsSingleTextFileWithWorkingFolderInternal( + rdd, + outputFile, + workingFolder, + None + ) /** Saves an RDD in exactly one file. * @@ -117,113 +123,14 @@ object SparkHelper extends Serializable { workingFolder: String, compressionCodec: Class[_ <: CompressionCodec] ): Unit = - SparkHelper - .saveAsSingleTextFile(rdd, outputFile, workingFolder, compressionCodec) + SparkHelper.saveAsSingleTextFileWithWorkingFolderInternal( + rdd, + outputFile, + workingFolder, + Some(compressionCodec) + ) } - /** Saves an RDD in exactly one file. 
- * - * Allows one to save an RDD in one file, while keeping the processing - * parallelized. - * - * {{{ SparkHelper.saveAsSingleTextFile(myRddToStore, "/my/file/path.txt") }}} - * - * @param outputRDD the RDD of strings to store in one file - * @param outputFile the path of the produced file - */ - def saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String): Unit = - saveAsSingleTextFileInternal(outputRDD, outputFile, None) - - /** Saves an RDD in exactly one file. - * - * Allows one to save an RDD in one file, while keeping the processing - * parallelized. - * - * {{{ - * SparkHelper.saveAsSingleTextFile( - * myRddToStore, "/my/file/path.txt", classOf[BZip2Codec]) - * }}} - * - * @param outputRDD the RDD of strings to store in one file - * @param outputFile the path of the produced file - * @param compressionCodec the type of compression to use (for instance - * classOf[BZip2Codec] or classOf[GzipCodec])) - */ - def saveAsSingleTextFile( - outputRDD: RDD[String], - outputFile: String, - compressionCodec: Class[_ <: CompressionCodec] - ): Unit = - saveAsSingleTextFileInternal(outputRDD, outputFile, Some(compressionCodec)) - - /** Saves an RDD in exactly one file. - * - * Allows one to save an RDD in one file, while keeping the processing - * parallelized. - * - * This variant of saveAsSingleTextFile performs the storage in a temporary - * folder instead of directly in the final output folder. This way the - * risks of having corrupted files in the real output folder due to cluster - * interruptions is minimized. - * - * {{{ - * SparkHelper.saveAsSingleTextFile( - * myRddToStore, "/my/file/path.txt", "/my/working/folder/path") - * }}} - * - * @param outputRDD the RDD of strings to store in one file - * @param outputFile the path of the produced file - * @param workingFolder the path where file manipulations will temporarily - * happen. - */ - def saveAsSingleTextFile( - outputRDD: RDD[String], - outputFile: String, - workingFolder: String - ): Unit = - saveAsSingleTextFileWithWorkingFolderInternal( - outputRDD, - outputFile, - workingFolder, - None) - - /** Saves an RDD in exactly one file. - * - * Allows one to save an RDD in one file, while keeping the processing - * parallelized. - * - * This variant of saveAsSingleTextFile performs the storage in a temporary - * folder instead of directly in the final output folder. This way the risks - * of having corrupted files in the real output folder due to cluster - * interruptions is minimized. - * - * {{{ - * SparkHelper.saveAsSingleTextFile( - * myRddToStore, - * "/my/file/path.txt", - * "/my/working/folder/path", - * classOf[BZip2Codec]) - * }}} - * - * @param outputRDD the RDD of strings to store in one file - * @param outputFile the path of the produced file - * @param workingFolder the path where file manipulations will temporarily - * happen. - * @param compressionCodec the type of compression to use (for instance - * classOf[BZip2Codec] or classOf[GzipCodec])) - */ - def saveAsSingleTextFile( - outputRDD: RDD[String], - outputFile: String, - workingFolder: String, - compressionCodec: Class[_ <: CompressionCodec] - ): Unit = - saveAsSingleTextFileWithWorkingFolderInternal( - outputRDD, - outputFile, - workingFolder, - Some(compressionCodec)) - /** Equivalent to sparkContext.textFile(), but for a specific record delimiter. * * By default, sparkContext.textFile() will provide one record per line. 
But diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index 3e60a7a..4272a73 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -34,19 +34,9 @@ class SparkHelperTest // 1: Without an intermediate working dir: - SparkHelper.saveAsSingleTextFile(rddToStore, singleTextFilePath) - - var singleFileStoredData = sc.textFile(singleTextFilePath).collect().sorted - - assert(singleFileStoredData === Array("data_a", "data_b", "data_c")) - - HdfsHelper.deleteFolder(testFolder) - - // 1-bis: same, but using the implicit RDD extension: - rddToStore.saveAsSingleTextFile(singleTextFilePath) - singleFileStoredData = sc.textFile(singleTextFilePath).collect().sorted + var singleFileStoredData = sc.textFile(singleTextFilePath).collect().sorted assert(singleFileStoredData === Array("data_a", "data_b", "data_c")) @@ -56,10 +46,10 @@ class SparkHelperTest // Notice as well that we test by moving the single file in a folder // which doesn't exists. - SparkHelper.saveAsSingleTextFile( - rddToStore, + rddToStore.saveAsSingleTextFile( singleTextFilePath, - workingFolder = tmpFolder) + workingFolder = tmpFolder + ) singleFileStoredData = sc.textFile(singleTextFilePath).collect().sorted @@ -229,15 +219,18 @@ class SparkHelperTest HdfsHelper.deleteFolder("src/test/resources/re_coalescence_test_output") // Let's create the folder with high level of coalescence (3 files): - SparkHelper.saveAsSingleTextFile( - sc.parallelize[String](Array("data_1_a", "data_1_b", "data_1_c")), - "src/test/resources/re_coalescence_test_input/input_file_1") - SparkHelper.saveAsSingleTextFile( - sc.parallelize[String](Array("data_2_a", "data_2_b")), - "src/test/resources/re_coalescence_test_input/input_file_2") - SparkHelper.saveAsSingleTextFile( - sc.parallelize[String](Array("data_3_a", "data_3_b", "data_3_c")), - "src/test/resources/re_coalescence_test_input/input_file_3") + sc.parallelize[String](Array("data_1_a", "data_1_b", "data_1_c")) + .saveAsSingleTextFile( + "src/test/resources/re_coalescence_test_input/input_file_1" + ) + sc.parallelize[String](Array("data_2_a", "data_2_b")) + .saveAsSingleTextFile( + "src/test/resources/re_coalescence_test_input/input_file_2" + ) + sc.parallelize[String](Array("data_3_a", "data_3_b", "data_3_c")) + .saveAsSingleTextFile( + "src/test/resources/re_coalescence_test_input/input_file_3" + ) // Let's decrease the coalescence level in order to only have 2 files: SparkHelper.decreaseCoalescence( From 592564f69b084b20ce3f333b9188a0b7fd7ac6fb Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Thu, 24 May 2018 20:32:16 +0100 Subject: [PATCH 03/25] Clean-up saveAsSingleTextFile internals --- .../scala/com/spark_helper/SparkHelper.scala | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 03d2764..b30400e 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -545,7 +545,7 @@ object SparkHelper extends Serializable { // We chose a random name for the temporary file: val temporaryName = Random.alphanumeric.take(10).mkString("") - val temporaryFile = workingFolder + "/" + temporaryName + val temporaryFile = s"$workingFolder/$temporaryName" // We perform the merge into a temporary single text file: saveAsSingleTextFileInternal(outputRDD, temporaryFile, 
compressionCodec) @@ -569,28 +569,29 @@ object SparkHelper extends Serializable { compressionCodec: Option[Class[_ <: CompressionCodec]] ): Unit = { - val fileSystem = FileSystem.get(new Configuration()) + val hadoopConfiguration = outputRDD.sparkContext.hadoopConfiguration + val fileSystem = FileSystem.get(hadoopConfiguration) // Classic saveAsTextFile in a temporary folder: - HdfsHelper.deleteFolder(outputFile + ".tmp") + HdfsHelper.deleteFolder(s"$outputFile.tmp") compressionCodec match { case Some(compressionCodec) => - outputRDD.saveAsTextFile(outputFile + ".tmp", compressionCodec) + outputRDD.saveAsTextFile(s"$outputFile.tmp", compressionCodec) case None => - outputRDD.saveAsTextFile(outputFile + ".tmp") + outputRDD.saveAsTextFile(s"$outputFile.tmp") } // Merge the folder into a single file: HdfsHelper.deleteFile(outputFile) FileUtil.copyMerge( fileSystem, - new Path(outputFile + ".tmp"), + new Path(s"$outputFile.tmp"), fileSystem, new Path(outputFile), true, - new Configuration(), + hadoopConfiguration, null) - HdfsHelper.deleteFolder(outputFile + ".tmp") + HdfsHelper.deleteFolder(s"$outputFile.tmp") } private def decreaseCoalescenceInternal( From 19b88c35671037b45bc0bc48d7eada7351fd7358 Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Thu, 24 May 2018 20:56:50 +0100 Subject: [PATCH 04/25] Create implicit class to attach an additional textFile to SparkContext which splits record on a specific delimiter --- .../scala/com/spark_helper/SparkHelper.scala | 153 +++++++++--------- .../com/spark_helper/SparkHelperTest.scala | 39 ++--- 2 files changed, 90 insertions(+), 102 deletions(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index b30400e..12e0164 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -131,84 +131,83 @@ object SparkHelper extends Serializable { ) } - /** Equivalent to sparkContext.textFile(), but for a specific record delimiter. - * - * By default, sparkContext.textFile() will provide one record per line. But - * what if the format you want to read considers that one record (one entity) - * is stored in more than one line (yml, xml, ...)? - * - * For instance in order to read a yml file, which is a format for which a - * record (a single entity) is spread other several lines, you can modify the - * record delimiter with "---\n" instead of "\n". Same goes when reading an - * xml file where a record might be spread over several lines or worse the - * whole xml file is one line. - * - * {{{ - * // Let's say data we want to use with Spark looks like this (one record is - * // a customer, but it's spread over several lines): - * \n - * \n - *
<address> 34 thingy street, someplace, sometown </address>
\n - *
\n - * \n - *
<address> 12 thingy street, someplace, sometown </address>
\n - *
\n - *
- * //Then you can use it this way: - * val computedRecords = SparkHelper.textFileWithDelimiter( - * "my/path/to/customers.xml", sparkContext, \n - * ).collect() - * val expectedRecords = Array( - * \n, - * ( - *
<address> 34 thingy street, someplace, sometown </address>
\n + - *
\n - * ), - * ( - *
<address> 12 thingy street, someplace, sometown </address>
\n + - * \n + - * - * ) - * ) - * assert(computedRecords == expectedRecords) - * }}} - * - * @param hdfsPath the path of the file to read (folder or file, '*' works as - * well). - * @param sparkContext the SparkContext - * @param delimiter the specific record delimiter which replaces "\n" - * @param maxRecordLength the max length (not sure which unit) of a record - * before considering the record too long to fit into memory. - * @return the RDD of records - */ - def textFileWithDelimiter( - hdfsPath: String, - sparkContext: SparkContext, - delimiter: String, - maxRecordLength: String = "1000000" - ): RDD[String] = { - - val conf = new Configuration(sparkContext.hadoopConfiguration) - - // This configuration sets the record delimiter: - conf.set("textinputformat.record.delimiter", delimiter) - - // and this one limits the size of one record. This is necessary in order to - // avoid reading from a corrupted file from which a record could be too long - // to fit in memory. This way, when reading a corrupted file, this will - // throw an exception (java.io.IOException - thus catchable) rather than - // having a messy out of memory which will stop the sparkContext: - conf.set("mapreduce.input.linerecordreader.line.maxlength", maxRecordLength) + implicit class SparkContextExtensions(val sc: SparkContext) extends AnyVal { - sparkContext - .newAPIHadoopFile( - hdfsPath, - classOf[TextInputFormat], - classOf[LongWritable], - classOf[Text], - conf - ) - .map { case (_, text) => text.toString } + /** Equivalent to sparkContext.textFile(), but for a specific record delimiter. + * + * By default, sparkContext.textFile() will provide one record per line + * (per '\n'). But what if the format to read considers that one record + * is stored in more than one line (yml, custom format, ...)? + * + * For instance in order to read a yml file, which is a format for which a + * record (a single entity) is spread other several lines, you can modify + * the record delimiter with "---\n" instead of "\n". Same goes when + * reading an xml file where a record might be spread over several lines or + * worse the whole xml file is one line. + * + * {{{ + * // Let's say data we want to use with Spark looks like this (one record + * // is a customer, but it's spread over several lines): + * \n + * \n + *
<address> 34 thingy street, someplace, sometown </address>
\n + *
\n + * \n + *
<address> 12 thingy street, someplace, sometown </address>
\n + *
\n + *
+ * //Then you can use it this way: + * val computedRecords = sc.textFile("my/path/to/customers.xml", "\n") + * val expectedRecords = RDD( + * \n, + * ( + *
<address> 34 thingy street, someplace, sometown </address>
\n + + *
\n + * ), + * ( + *
<address> 12 thingy street, someplace, sometown </address>
\n + + * \n + + * + * ) + * ) + * assert(computedRecords == expectedRecords) + * }}} + * + * @param hdfsPath the path of the file to read (folder or file, '*' works + * as well). + * @param delimiter the specific record delimiter which replaces "\n" + * @param maxRecordLength the max length (not sure which unit) of a record + * before considering the record too long to fit into memory. + * @return the RDD of records + */ + def textFile( + hdfsPath: String, + delimiter: String, + maxRecordLength: String = "1000000" + ): RDD[String] = { + + val conf = new Configuration(sc.hadoopConfiguration) + + // This configuration sets the record delimiter: + conf.set("textinputformat.record.delimiter", delimiter) + + // and this one limits the size of one record. This is necessary in order + // to avoid reading from a corrupted file from which a record could be too + // long to fit in memory. This way, when reading a corrupted file, this + // will throw an exception (java.io.IOException - thus catchable) rather + // than having a messy out of memory which will stop the sparkContext: + conf + .set("mapreduce.input.linerecordreader.line.maxlength", maxRecordLength) + + sc.newAPIHadoopFile( + hdfsPath, + classOf[TextInputFormat], + classOf[LongWritable], + classOf[Text], + conf + ) + .map { case (_, text) => text.toString } + } } /** Saves and repartitions a key/value RDD on files whose name is the key. diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index 4272a73..c476928 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -1,6 +1,6 @@ package com.spark_helper -import com.spark_helper.SparkHelper.RDDExtensions +import com.spark_helper.SparkHelper.{RDDExtensions, SparkContextExtensions} import org.apache.hadoop.io.compress.GzipCodec @@ -73,10 +73,12 @@ class SparkHelperTest test("Read text file with specific record delimiter") { + val weirdFormatFilePath = s"$resourceFolder/some_weird_format.txt" + // 1: Let's read a file where a record begins with a line begining with // 3 and other lines begining by 4: - HdfsHelper.deleteFile("src/test/resources/some_weird_format.txt") + HdfsHelper.deleteFile(weirdFormatFilePath) val textContent = ( "3 first line of the first record\n" + @@ -87,16 +89,9 @@ class SparkHelperTest "4 another line for the third record" ) - HdfsHelper - .writeToHdfsFile(textContent, "src/test/resources/some_weird_format.txt") + HdfsHelper.writeToHdfsFile(textContent, weirdFormatFilePath) - var computedRecords = SparkHelper - .textFileWithDelimiter( - "src/test/resources/some_weird_format.txt", - sc, - "\n3" - ) - .collect() + var computedRecords = sc.textFile(weirdFormatFilePath, "\n3").collect() var expectedRecords = Array( ( @@ -113,13 +108,15 @@ class SparkHelperTest assert(computedRecords === expectedRecords) - HdfsHelper.deleteFile("src/test/resources/some_weird_format.txt") + HdfsHelper.deleteFile(weirdFormatFilePath) // 2: Let's read an xml file: - HdfsHelper.deleteFile("src/test/resources/some_basic_xml.xml") + val xmlFilePath = s"$resourceFolder/some_basic_xml.xml" + + HdfsHelper.deleteFile(xmlFilePath) - val xmlTextContent = ( + val xmlTextContent = "\n" + "\n" + "
<address> 34 thingy street, someplace, sometown </address>
\n" + @@ -128,18 +125,10 @@ class SparkHelperTest "
<address> 12 thingy street, someplace, sometown </address>
\n" + "
\n" + "
" - ) - HdfsHelper - .writeToHdfsFile(xmlTextContent, "src/test/resources/some_basic_xml.xml") + HdfsHelper.writeToHdfsFile(xmlTextContent, xmlFilePath) - computedRecords = SparkHelper - .textFileWithDelimiter( - "src/test/resources/some_basic_xml.xml", - sc, - "\n" - ) - .collect() + computedRecords = sc.textFile(xmlFilePath, "\n").collect() expectedRecords = Array( "\n", @@ -156,7 +145,7 @@ class SparkHelperTest assert(computedRecords === expectedRecords) - HdfsHelper.deleteFile("src/test/resources/some_basic_xml.xml") + HdfsHelper.deleteFile(xmlFilePath) } test("Save as text file by key") { From 04a089cdf8d5625281170d70866db4cad7ee9dd3 Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Thu, 24 May 2018 22:47:44 +0100 Subject: [PATCH 05/25] Refactor saveAsTextFileByKey and place it in an implicit class --- .../scala/com/spark_helper/SparkHelper.scala | 280 +++++++++++------- .../com/spark_helper/SparkHelperTest.scala | 74 +++-- 2 files changed, 218 insertions(+), 136 deletions(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 12e0164..5cecf78 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -64,15 +64,14 @@ object SparkHelper extends Serializable { * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", classOf[BZip2Codec]) }}} * * @param outputFile the path of the produced file - * @param compressionCodec the type of compression to use (for instance + * @param codec the type of compression to use (for instance * classOf[BZip2Codec] or classOf[GzipCodec])) */ def saveAsSingleTextFile( outputFile: String, - compressionCodec: Class[_ <: CompressionCodec] + codec: Class[_ <: CompressionCodec] ): Unit = - SparkHelper - .saveAsSingleTextFileInternal(rdd, outputFile, Some(compressionCodec)) + SparkHelper.saveAsSingleTextFileInternal(rdd, outputFile, Some(codec)) /** Saves an RDD in exactly one file. * @@ -115,20 +114,116 @@ object SparkHelper extends Serializable { * @param outputFile the path of the produced file * @param workingFolder the path where file manipulations will temporarily * happen. - * @param compressionCodec the type of compression to use (for instance + * @param codec the type of compression to use (for instance * classOf[BZip2Codec] or classOf[GzipCodec])) */ def saveAsSingleTextFile( outputFile: String, workingFolder: String, - compressionCodec: Class[_ <: CompressionCodec] + codec: Class[_ <: CompressionCodec] ): Unit = SparkHelper.saveAsSingleTextFileWithWorkingFolderInternal( rdd, outputFile, workingFolder, - Some(compressionCodec) + Some(codec) ) + + } + + implicit class PairRDDExtensions(val rdd: RDD[(String, String)]) + extends AnyVal { + + /** Saves and repartitions a key/value RDD on files whose name is the key. + * + * Within the provided path, there will be one file per key in the given + * keyValueRDD. And within a file for a given key are only stored values + * for this key. + * + * As this internally needs to know the nbr of keys, this will have to + * compute it. If this nbr of keys is known beforehand, it would spare + * resources to use saveAsTextFileByKey(path: String, keyNbr: Int) + * instead. + * + * This is not scalable. This shouldn't be considered for any data flow + * with normal or big volumes. 
+ * + * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path") }}} + * + * @param path the folder where will be storrred key files + */ + def saveAsTextFileByKey(path: String): Unit = + SparkHelper.saveAsTextFileByKeyInternal(rdd, path, None, None) + + /** Saves and repartitions a key/value RDD on files whose name is the key. + * + * Within the provided path, there will be one file per key in the given + * keyValueRDD. And within a file for a given key are only stored values + * for this key. + * + * This is not scalable. This shouldn't be considered for any data flow + * with normal or big volumes. + * + * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", 12) }}} + * + * @param path the folder where will be storrred key files + * @param keyNbr the nbr of expected keys (which is the nbr of outputed + * files) + */ + def saveAsTextFileByKey(path: String, keyNbr: Int): Unit = + SparkHelper.saveAsTextFileByKeyInternal(rdd, path, Some(keyNbr), None) + + /** Saves and repartitions a key/value RDD on files whose name is the key. + * + * Within the provided path, there will be one file per key in the given + * keyValueRDD. And within a file for a given key are only stored values + * for this key. + * + * As this internally needs to know the nbr of keys, this will have to + * compute it. If this nbr of keys is known beforehand, it would spare + * resources to use + * saveAsTextFileByKey(path: String, keyNbr: Int, codec: Class[_ <: CompressionCodec]) + * instead. + * + * This is not scalable. This shouldn't be considered for any data flow + * with normal or big volumes. + * + * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", classOf[BZip2Codec]) }}} + * + * @param path the folder where will be storrred key files + * @param codec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def saveAsTextFileByKey( + path: String, + codec: Class[_ <: CompressionCodec] + ): Unit = + SparkHelper.saveAsTextFileByKeyInternal(rdd, path, None, Some(codec)) + + /** Saves and repartitions a key/value RDD on files whose name is the key. + * + * Within the provided path, there will be one file per key in the given + * keyValueRDD. And within a file for a given key are only stored values + * for this key. + * + * This is not scalable. This shouldn't be considered for any data flow + * with normal or big volumes. + * + * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", 12, classOf[BZip2Codec]) }}} + * + * @param path the folder where will be storrred key files + * @param keyNbr the nbr of expected keys (which is the nbr of outputed + * files) + * @param codec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def saveAsTextFileByKey( + path: String, + keyNbr: Int, + codec: Class[_ <: CompressionCodec] + ): Unit = + SparkHelper + .saveAsTextFileByKeyInternal(rdd, path, Some(keyNbr), Some(codec)) } implicit class SparkContextExtensions(val sc: SparkContext) extends AnyVal { @@ -210,92 +305,6 @@ object SparkHelper extends Serializable { } } - /** Saves and repartitions a key/value RDD on files whose name is the key. - * - * Within the provided outputFolder, will be one file per key in your - * keyValueRDD. And within a file for a given key are only values for this - * key. - * - * You need to know the nbr of keys beforehand (in general you use this to - * split your dataset in subsets, or to output one file per client, so you - * know how many keys you have). 
So you need to put as keyNbr the exact nbr - * of keys you'll have. - * - * This is not scalable. This shouldn't be considered for any data flow with - * normal or big volumes. - * - * {{{ - * SparkHelper.saveAsTextFileByKey( - * myKeyValueRddToStore, "/my/output/folder/path", 12) - * }}} - * - * @param keyValueRDD the key/value RDD - * @param outputFolder the foldder where will be storrred key files - * @param keyNbr the nbr of expected keys (which is the nbr of outputed files) - */ - def saveAsTextFileByKey( - keyValueRDD: RDD[(String, String)], - outputFolder: String, - keyNbr: Int - ): Unit = { - - HdfsHelper.deleteFolder(outputFolder) - - keyValueRDD - .partitionBy(new HashPartitioner(keyNbr)) - .saveAsHadoopFile( - outputFolder, - classOf[String], - classOf[String], - classOf[KeyBasedOutput] - ) - } - - /** Saves and repartitions a key/value RDD on files whose name is the key. - * - * Within the provided outputFolder, will be one file per key in your - * keyValueRDD. And within a file for a given key are only values for this - * key. - * - * You need to know the nbr of keys beforehand (in general you use this to - * split your dataset in subsets, or to output one file per client, so you - * know how many keys you have). So you need to put as keyNbr the exact nbr - * of keys you'll have. - * - * This is not scalable. This shouldn't be considered for any data flow with - * normal or big volumes. - * - * {{{ - * SparkHelper.saveAsTextFileByKey( - * myKeyValueRddToStore, "/my/output/folder/path", 12, classOf[BZip2Codec]) - * }}} - * - * @param keyValueRDD the key/value RDD - * @param outputFolder the foldder where will be storrred key files - * @param keyNbr the nbr of expected keys (which is the nbr of outputed files) - * @param compressionCodec the type of compression to use (for instance - * classOf[BZip2Codec] or classOf[GzipCodec])) - */ - def saveAsTextFileByKey( - keyValueRDD: RDD[(String, String)], - outputFolder: String, - keyNbr: Int, - compressionCodec: Class[_ <: CompressionCodec] - ): Unit = { - - HdfsHelper.deleteFolder(outputFolder) - - keyValueRDD - .partitionBy(new HashPartitioner(keyNbr)) - .saveAsHadoopFile( - outputFolder, - classOf[String], - classOf[String], - classOf[KeyBasedOutput], - compressionCodec - ) - } - /** Decreases the nbr of partitions of a folder. * * This is often handy when the last step of your job needs to run on @@ -363,7 +372,7 @@ object SparkHelper extends Serializable { * @param finalCoalescenceLevel the nbr of files within the folder at the end * of this method. * @param sparkContext the SparkContext - * @param compressionCodec the type of compression to use (for instance + * @param codec the type of compression to use (for instance * classOf[BZip2Codec] or classOf[GzipCodec])) */ def decreaseCoalescence( @@ -371,14 +380,15 @@ object SparkHelper extends Serializable { lowerCoalescenceLevelFolder: String, finalCoalescenceLevel: Int, sparkContext: SparkContext, - compressionCodec: Class[_ <: CompressionCodec] + codec: Class[_ <: CompressionCodec] ): Unit = decreaseCoalescenceInternal( highCoalescenceLevelFolder, lowerCoalescenceLevelFolder, finalCoalescenceLevel, sparkContext, - Some(compressionCodec)) + Some(codec) + ) /** Saves as text file, but by decreasing the nbr of partitions of the output. * @@ -423,7 +433,8 @@ object SparkHelper extends Serializable { outputFolder, finalCoalescenceLevel, sparkContext, - None) + None + ) } /** Saves as text file, but by decreasing the nbr of partitions of the output. 
@@ -448,14 +459,14 @@ object SparkHelper extends Serializable { * finalCoalescenceLevel parameter). * @param finalCoalescenceLevel the nbr of files within the folder at the end * of this method. - * @param compressionCodec the type of compression to use (for instance + * @param codec the type of compression to use (for instance * classOf[BZip2Codec] or classOf[GzipCodec])) */ def saveAsTextFileAndCoalesce( outputRDD: RDD[String], outputFolder: String, finalCoalescenceLevel: Int, - compressionCodec: Class[_ <: CompressionCodec] + codec: Class[_ <: CompressionCodec] ): Unit = { val sparkContext = outputRDD.context @@ -475,7 +486,8 @@ object SparkHelper extends Serializable { outputFolder, finalCoalescenceLevel, sparkContext, - Some(compressionCodec)) + Some(codec) + ) } /** Equivalent to sparkContext.textFile(), but for each line is associated @@ -539,7 +551,7 @@ object SparkHelper extends Serializable { outputRDD: RDD[String], outputFile: String, workingFolder: String, - compressionCodec: Option[Class[_ <: CompressionCodec]] + codec: Option[Class[_ <: CompressionCodec]] ): Unit = { // We chose a random name for the temporary file: @@ -547,7 +559,7 @@ object SparkHelper extends Serializable { val temporaryFile = s"$workingFolder/$temporaryName" // We perform the merge into a temporary single text file: - saveAsSingleTextFileInternal(outputRDD, temporaryFile, compressionCodec) + saveAsSingleTextFileInternal(outputRDD, temporaryFile, codec) // And then only we put the resulting file in its final real location: HdfsHelper.moveFile(temporaryFile, outputFile, overwrite = true) @@ -565,7 +577,7 @@ object SparkHelper extends Serializable { private def saveAsSingleTextFileInternal( outputRDD: RDD[String], outputFile: String, - compressionCodec: Option[Class[_ <: CompressionCodec]] + codec: Option[Class[_ <: CompressionCodec]] ): Unit = { val hadoopConfiguration = outputRDD.sparkContext.hadoopConfiguration @@ -573,9 +585,9 @@ object SparkHelper extends Serializable { // Classic saveAsTextFile in a temporary folder: HdfsHelper.deleteFolder(s"$outputFile.tmp") - compressionCodec match { - case Some(compressionCodec) => - outputRDD.saveAsTextFile(s"$outputFile.tmp", compressionCodec) + codec match { + case Some(codec) => + outputRDD.saveAsTextFile(s"$outputFile.tmp", codec) case None => outputRDD.saveAsTextFile(s"$outputFile.tmp") } @@ -593,22 +605,68 @@ object SparkHelper extends Serializable { HdfsHelper.deleteFolder(s"$outputFile.tmp") } + private def saveAsTextFileByKeyInternal( + rdd: RDD[(String, String)], + path: String, + optKeyNbr: Option[Int], + codec: Option[Class[_ <: CompressionCodec]] + ): Unit = { + + HdfsHelper.deleteFolder(path) + + // Whether the rdd was already cached or not (used to unpersist it if we + // have to get the nbr of keys): + val isCached = rdd.getStorageLevel.useMemory + + // If the nbr of keys isn't provided, we have to get it ourselves: + val keyNbr = optKeyNbr match { + case Some(keyNbr) => + keyNbr + case None => + if (!isCached) + rdd.cache() + rdd.keys.distinct.count.toInt + } + + val prdd = rdd.partitionBy(new HashPartitioner(keyNbr)) + + codec match { + case Some(codec) => + prdd.saveAsHadoopFile( + path, + classOf[String], + classOf[String], + classOf[KeyBasedOutput], + codec + ) + case None => + prdd.saveAsHadoopFile( + path, + classOf[String], + classOf[String], + classOf[KeyBasedOutput] + ) + } + + if (optKeyNbr.isEmpty && !isCached) + rdd.unpersist() + } + private def decreaseCoalescenceInternal( highCoalescenceLevelFolder: String, 
lowerCoalescenceLevelFolder: String, finalCoalescenceLevel: Int, sparkContext: SparkContext, - compressionCodec: Option[Class[_ <: CompressionCodec]] + codec: Option[Class[_ <: CompressionCodec]] ): Unit = { val intermediateRDD = sparkContext .textFile(highCoalescenceLevelFolder) .coalesce(finalCoalescenceLevel) - compressionCodec match { - case Some(compressionCodec) => - intermediateRDD - .saveAsTextFile(lowerCoalescenceLevelFolder, compressionCodec) + codec match { + case Some(codec) => + intermediateRDD.saveAsTextFile(lowerCoalescenceLevelFolder, codec) case None => intermediateRDD.saveAsTextFile(lowerCoalescenceLevelFolder) } diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index c476928..2fbcdf0 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -1,6 +1,7 @@ package com.spark_helper -import com.spark_helper.SparkHelper.{RDDExtensions, SparkContextExtensions} +import com.spark_helper.SparkHelper.{RDDExtensions, PairRDDExtensions} +import com.spark_helper.SparkHelper.SparkContextExtensions import org.apache.hadoop.io.compress.GzipCodec @@ -150,7 +151,11 @@ class SparkHelperTest test("Save as text file by key") { - HdfsHelper.deleteFolder("src/test/resources/key_value_storage") + val keyValueFolder = s"$resourceFolder/key_value_storage" + + // 1: Let's strore key values per file: + + HdfsHelper.deleteFolder(keyValueFolder) val someKeyValueRdd = sc.parallelize[(String, String)]( Array( @@ -164,42 +169,61 @@ class SparkHelperTest ) ) - SparkHelper.saveAsTextFileByKey( - someKeyValueRdd, - "src/test/resources/key_value_storage", - 3) + someKeyValueRdd.saveAsTextFileByKey(keyValueFolder, 3) // The folder key_value_storage has been created: - assert(HdfsHelper.folderExists("src/test/resources/key_value_storage")) + assert(HdfsHelper.folderExists(keyValueFolder)) // And it contains one file per key: - val genratedKeyFiles = HdfsHelper - .listFileNamesInFolder("src/test/resources/key_value_storage") - val expectedKeyFiles = List("_SUCCESS", "key_1", "key_2", "key_3") + var genratedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder) + var expectedKeyFiles = List("_SUCCESS", "key_1", "key_2", "key_3") assert(genratedKeyFiles === expectedKeyFiles) - val valuesForKey1 = sc - .textFile("src/test/resources/key_value_storage/key_1") - .collect() - .sorted + var valuesForKey1 = sc.textFile(s"$keyValueFolder/key_1").collect().sorted + assert(valuesForKey1 === Array("value_a", "value_b")) + + val valuesForKey2 = sc.textFile(s"$keyValueFolder/key_2").collect().sorted + assert(valuesForKey2 === Array("value_b", "value_c", "value_d")) + + val valuesForKey3 = sc.textFile(s"$keyValueFolder/key_3").collect().sorted + assert(valuesForKey3 === Array("value_a", "value_b")) + + // 2: Let's strore key values per file; but without providing the nbr of + // keys: + + HdfsHelper.deleteFolder(keyValueFolder) + + someKeyValueRdd.saveAsTextFileByKey(keyValueFolder) + + // The folder key_value_storage has been created: + assert(HdfsHelper.folderExists(keyValueFolder)) + // And it contains one file per key: + genratedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder) + expectedKeyFiles = List("_SUCCESS", "key_1", "key_2", "key_3") + assert(genratedKeyFiles === expectedKeyFiles) + + valuesForKey1 = sc.textFile(s"$keyValueFolder/key_1").collect().sorted assert(valuesForKey1 === Array("value_a", "value_b")) - val valuesForKey2 = sc - 
.textFile("src/test/resources/key_value_storage/key_2") - .collect() - .sorted + // 3: Let's strore key values per file and compress these files: - assert(valuesForKey2 === Array("value_b", "value_c", "value_d")) + HdfsHelper.deleteFolder(keyValueFolder) - val valuesForKey3 = sc - .textFile("src/test/resources/key_value_storage/key_3") - .collect() - .sorted + someKeyValueRdd.saveAsTextFileByKey(keyValueFolder, 3, classOf[GzipCodec]) - assert(valuesForKey3 === Array("value_a", "value_b")) + // The folder key_value_storage has been created: + assert(HdfsHelper.folderExists(keyValueFolder)) + + // And it contains one file per key: + genratedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder) + expectedKeyFiles = List("_SUCCESS", "key_1.gz", "key_2.gz", "key_3.gz") + assert(genratedKeyFiles === expectedKeyFiles) + + valuesForKey1 = sc.textFile(s"$keyValueFolder/key_1.gz").collect().sorted + assert(valuesForKey1 === Array("value_a", "value_b")) - HdfsHelper.deleteFolder("src/test/resources/key_value_storage") + HdfsHelper.deleteFolder(keyValueFolder) } test("Decrease coalescence level") { From d756f0288a99f67efe081e529c9b49178f47fa62 Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Sat, 26 May 2018 11:02:26 +0100 Subject: [PATCH 06/25] Test saveAsTextFileAndCoalesce and place it in an implicit class --- .../scala/com/spark_helper/SparkHelper.scala | 270 +++++++++--------- .../com/spark_helper/SparkHelperTest.scala | 40 +++ 2 files changed, 171 insertions(+), 139 deletions(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 5cecf78..8c253d4 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -51,10 +51,10 @@ object SparkHelper extends Serializable { * * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt") }}} * - * @param outputFile the path of the produced file + * @param path the path of the produced file */ - def saveAsSingleTextFile(outputFile: String): Unit = - SparkHelper.saveAsSingleTextFileInternal(rdd, outputFile, None) + def saveAsSingleTextFile(path: String): Unit = + SparkHelper.saveAsSingleTextFileInternal(rdd, path, None) /** Saves an RDD in exactly one file. * @@ -63,15 +63,15 @@ object SparkHelper extends Serializable { * * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", classOf[BZip2Codec]) }}} * - * @param outputFile the path of the produced file + * @param path the path of the produced file * @param codec the type of compression to use (for instance * classOf[BZip2Codec] or classOf[GzipCodec])) */ def saveAsSingleTextFile( - outputFile: String, + path: String, codec: Class[_ <: CompressionCodec] ): Unit = - SparkHelper.saveAsSingleTextFileInternal(rdd, outputFile, Some(codec)) + SparkHelper.saveAsSingleTextFileInternal(rdd, path, Some(codec)) /** Saves an RDD in exactly one file. * @@ -85,14 +85,14 @@ object SparkHelper extends Serializable { * * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path") }}} * - * @param outputFile the path of the produced file + * @param path the path of the produced file * @param workingFolder the path where file manipulations will temporarily * happen. 
*/ - def saveAsSingleTextFile(outputFile: String, workingFolder: String): Unit = + def saveAsSingleTextFile(path: String, workingFolder: String): Unit = SparkHelper.saveAsSingleTextFileWithWorkingFolderInternal( rdd, - outputFile, + path, workingFolder, None ) @@ -111,24 +111,116 @@ object SparkHelper extends Serializable { * rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path", classOf[BZip2Codec]) * }}} * - * @param outputFile the path of the produced file + * @param path the path of the produced file * @param workingFolder the path where file manipulations will temporarily * happen. * @param codec the type of compression to use (for instance * classOf[BZip2Codec] or classOf[GzipCodec])) */ def saveAsSingleTextFile( - outputFile: String, + path: String, workingFolder: String, codec: Class[_ <: CompressionCodec] ): Unit = SparkHelper.saveAsSingleTextFileWithWorkingFolderInternal( rdd, - outputFile, + path, workingFolder, Some(codec) ) + /** Saves as text file, but by decreasing the nbr of partitions of the output. + * + * Same as rdd.saveAsTextFile() + * , but decreases the nbr of partitions in the output folder before doing + * so. + * + * The result is equivalent to rdd.coalesce(x).saveAsTextFile() + * , but if x + * is very low, coalesce + * would make the processing time explode, wherease this methods keeps the + * processing parallelized, save as text file and then only merges the + * result in a lower nbr of partitions. + * + * {{{ rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30) }}} + * + * @param path the folder where will finally be stored the RDD but spread + * on only 30 files (where 30 is the value of the finalCoalesceLevel + * parameter). + * @param finalCoalesceLevel the nbr of files within the folder at the end + * of this method. + */ + def saveAsTextFileAndCoalesce( + path: String, + finalCoalesceLevel: Int + ): Unit = { + + // We remove folders where to store data in case they already exist: + HdfsHelper.deleteFolder(s"${path}_tmp") + HdfsHelper.deleteFolder(path) + + // We first save the rdd with the level of coalescence used during the + // processing. This way the processing is done with the right level of + // tasks: + rdd.saveAsTextFile(s"${path}_tmp") + + // Then we read back this tmp folder, apply the coalesce and store it back: + SparkHelper.decreaseCoalescenceInternal( + s"${path}_tmp", + path, + finalCoalesceLevel, + rdd.context, + None + ) + } + + /** Saves as text file, but by decreasing the nbr of partitions of the output. + * + * Same as rdd.saveAsTextFile() + * , but decreases the nbr of partitions in the output folder before doing + * so. + * + * The result is equivalent to rdd.coalesce(x).saveAsTextFile() + * , but if x + * is very low, coalesce + * would make the processing time explode, wherease this methods keeps the + * processing parallelized, save as text file and then only merges the + * result in a lower nbr of partitions. + * + * {{{ rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30, classOf[BZip2Codec]) }}} + * + * @param path the folder where will finally be stored the RDD but spread + * on only 30 files (where 30 is the value of the finalCoalesceLevel + * parameter). + * @param finalCoalesceLevel the nbr of files within the folder at the end + * of this method. 
+ * @param codec the type of compression to use (for instance + * classOf[BZip2Codec] or classOf[GzipCodec])) + */ + def saveAsTextFileAndCoalesce( + path: String, + finalCoalesceLevel: Int, + codec: Class[_ <: CompressionCodec] + ): Unit = { + + // We remove folders where to store data in case they already exist: + HdfsHelper.deleteFolder(s"${path}_tmp") + HdfsHelper.deleteFolder(path) + + // We first save the rdd with the level of coalescence used during the + // processing. This way the processing is done with the right level of + // tasks: + rdd.saveAsTextFile(s"${path}_tmp") + + // Then we read back this tmp folder, apply the coalesce and store it back: + decreaseCoalescenceInternal( + s"${path}_tmp", + path, + finalCoalesceLevel, + rdd.context, + Some(codec) + ) + } } implicit class PairRDDExtensions(val rdd: RDD[(String, String)]) @@ -268,7 +360,7 @@ object SparkHelper extends Serializable { * assert(computedRecords == expectedRecords) * }}} * - * @param hdfsPath the path of the file to read (folder or file, '*' works + * @param path the path of the file to read (folder or file, '*' works * as well). * @param delimiter the specific record delimiter which replaces "\n" * @param maxRecordLength the max length (not sure which unit) of a record @@ -276,7 +368,7 @@ object SparkHelper extends Serializable { * @return the RDD of records */ def textFile( - hdfsPath: String, + path: String, delimiter: String, maxRecordLength: String = "1000000" ): RDD[String] = { @@ -295,7 +387,7 @@ object SparkHelper extends Serializable { .set("mapreduce.input.linerecordreader.line.maxlength", maxRecordLength) sc.newAPIHadoopFile( - hdfsPath, + path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], @@ -327,21 +419,21 @@ object SparkHelper extends Serializable { * @param highCoalescenceLevelFolder the folder which contains 10000 files * @param lowerCoalescenceLevelFolder the folder which will contain the same * data as highCoalescenceLevelFolder but spread on only 300 files (where 300 - * is the finalCoalescenceLevel parameter). - * @param finalCoalescenceLevel the nbr of files within the folder at the end + * is the finalCoalesceLevel parameter). + * @param finalCoalesceLevel the nbr of files within the folder at the end * of this method. * @param sparkContext the SparkContext */ def decreaseCoalescence( highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, - finalCoalescenceLevel: Int, + finalCoalesceLevel: Int, sparkContext: SparkContext ): Unit = decreaseCoalescenceInternal( highCoalescenceLevelFolder, lowerCoalescenceLevelFolder, - finalCoalescenceLevel, + finalCoalesceLevel, sparkContext, None) @@ -368,8 +460,8 @@ object SparkHelper extends Serializable { * @param highCoalescenceLevelFolder the folder which contains 10000 files * @param lowerCoalescenceLevelFolder the folder which will contain the same * data as highCoalescenceLevelFolder but spread on only 300 files (where 300 - * is the finalCoalescenceLevel parameter). - * @param finalCoalescenceLevel the nbr of files within the folder at the end + * is the finalCoalesceLevel parameter). + * @param finalCoalesceLevel the nbr of files within the folder at the end * of this method. 
* @param sparkContext the SparkContext * @param codec the type of compression to use (for instance @@ -378,117 +470,17 @@ object SparkHelper extends Serializable { def decreaseCoalescence( highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, - finalCoalescenceLevel: Int, + finalCoalesceLevel: Int, sparkContext: SparkContext, codec: Class[_ <: CompressionCodec] ): Unit = decreaseCoalescenceInternal( highCoalescenceLevelFolder, lowerCoalescenceLevelFolder, - finalCoalescenceLevel, - sparkContext, - Some(codec) - ) - - /** Saves as text file, but by decreasing the nbr of partitions of the output. - * - * Same as decreaseCoalescence, but the storage of the RDD in an intermediate - * folder is included. - * - * This still makes the processing parallelized, but the output is coalesced. - * - * {{{ - * SparkHelper.saveAsTextFileAndCoalesce( - * myRddToStore, "/produced/folder/path/with/only/300/files", 300) - * }}} - * - * @param outputRDD the RDD to store, processed for instance on 10000 tasks - * (which would thus be stored as 10000 files). - * @param outputFolder the folder where will finally be stored the RDD but - * spread on only 300 files (where 300 is the value of the - * finalCoalescenceLevel parameter). - * @param finalCoalescenceLevel the nbr of files within the folder at the end - * of this method. - */ - def saveAsTextFileAndCoalesce( - outputRDD: RDD[String], - outputFolder: String, - finalCoalescenceLevel: Int - ): Unit = { - - val sparkContext = outputRDD.context - - // We remove folders where to store data in case they already exist: - HdfsHelper.deleteFolder(outputFolder + "_tmp") - HdfsHelper.deleteFolder(outputFolder) - - // We first save the rdd with the level of coalescence used during the - // processing. This way the processing is done with the right level of - // tasks: - outputRDD.saveAsTextFile(outputFolder + "_tmp") - - // Then we read back this tmp folder, apply the coalesce and store it back: - decreaseCoalescenceInternal( - outputFolder + "_tmp", - outputFolder, - finalCoalescenceLevel, - sparkContext, - None - ) - } - - /** Saves as text file, but by decreasing the nbr of partitions of the output. - * - * Same as decreaseCoalescence, but the storage of the RDD in an intermediate - * folder is included. - * - * This still makes the processing parallelized, but the output is coalesced. - * - * {{{ - * SparkHelper.saveAsTextFileAndCoalesce( - * myRddToStore, - * "/produced/folder/path/with/only/300/files", - * 300, - * classOf[BZip2Codec]) - * }}} - * - * @param outputRDD the RDD to store, processed for instance on 10000 tasks - * (which would thus be stored as 10000 files). - * @param outputFolder the folder where will finally be stored the RDD but - * spread on only 300 files (where 300 is the value of the - * finalCoalescenceLevel parameter). - * @param finalCoalescenceLevel the nbr of files within the folder at the end - * of this method. - * @param codec the type of compression to use (for instance - * classOf[BZip2Codec] or classOf[GzipCodec])) - */ - def saveAsTextFileAndCoalesce( - outputRDD: RDD[String], - outputFolder: String, - finalCoalescenceLevel: Int, - codec: Class[_ <: CompressionCodec] - ): Unit = { - - val sparkContext = outputRDD.context - - // We remove folders where to store data in case they already exist: - HdfsHelper.deleteFolder(outputFolder + "_tmp") - HdfsHelper.deleteFolder(outputFolder) - - // We first save the rdd with the level of coalescence used during the - // processing. 
This way the processing is done with the right level of - // tasks: - outputRDD.saveAsTextFile(outputFolder + "_tmp") - - // Then we read back this tmp folder, apply the coalesce and store it back: - decreaseCoalescenceInternal( - outputFolder + "_tmp", - outputFolder, - finalCoalescenceLevel, + finalCoalesceLevel, sparkContext, Some(codec) ) - } /** Equivalent to sparkContext.textFile(), but for each line is associated * with its file path. @@ -515,13 +507,13 @@ object SparkHelper extends Serializable { * ) * }}} * - * @param hdfsPath the path of the folder (or structure of folders) to read + * @param path the path of the folder (or structure of folders) to read * @param sparkContext the SparkContext * @return the RDD of records where a record is a tuple containing the path * of the file the record comes from and the record itself. */ def textFileWithFileName( - hdfsPath: String, + path: String, sparkContext: SparkContext ): RDD[(String, String)] = { @@ -531,7 +523,7 @@ object SparkHelper extends Serializable { sparkContext .hadoopFile( - hdfsPath, + path, classOf[TextInputFormat2], classOf[LongWritable], classOf[Text], @@ -549,7 +541,7 @@ object SparkHelper extends Serializable { private def saveAsSingleTextFileWithWorkingFolderInternal( outputRDD: RDD[String], - outputFile: String, + path: String, workingFolder: String, codec: Option[Class[_ <: CompressionCodec]] ): Unit = { @@ -562,7 +554,7 @@ object SparkHelper extends Serializable { saveAsSingleTextFileInternal(outputRDD, temporaryFile, codec) // And then only we put the resulting file in its final real location: - HdfsHelper.moveFile(temporaryFile, outputFile, overwrite = true) + HdfsHelper.moveFile(temporaryFile, path, overwrite = true) } /** Saves RDD in exactly one file. @@ -571,12 +563,12 @@ object SparkHelper extends Serializable { * the processing parallelized. 
* * @param outputRDD the RDD of strings to save as text file - * @param outputFile the path where to save the file + * @param path the path where to save the file * @param compression the compression codec to use (can be left to None) */ private def saveAsSingleTextFileInternal( outputRDD: RDD[String], - outputFile: String, + path: String, codec: Option[Class[_ <: CompressionCodec]] ): Unit = { @@ -584,25 +576,25 @@ object SparkHelper extends Serializable { val fileSystem = FileSystem.get(hadoopConfiguration) // Classic saveAsTextFile in a temporary folder: - HdfsHelper.deleteFolder(s"$outputFile.tmp") + HdfsHelper.deleteFolder(s"$path.tmp") codec match { case Some(codec) => - outputRDD.saveAsTextFile(s"$outputFile.tmp", codec) + outputRDD.saveAsTextFile(s"$path.tmp", codec) case None => - outputRDD.saveAsTextFile(s"$outputFile.tmp") + outputRDD.saveAsTextFile(s"$path.tmp") } // Merge the folder into a single file: - HdfsHelper.deleteFile(outputFile) + HdfsHelper.deleteFile(path) FileUtil.copyMerge( fileSystem, - new Path(s"$outputFile.tmp"), + new Path(s"$path.tmp"), fileSystem, - new Path(outputFile), + new Path(path), true, hadoopConfiguration, null) - HdfsHelper.deleteFolder(s"$outputFile.tmp") + HdfsHelper.deleteFolder(s"$path.tmp") } private def saveAsTextFileByKeyInternal( @@ -655,14 +647,14 @@ object SparkHelper extends Serializable { private def decreaseCoalescenceInternal( highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, - finalCoalescenceLevel: Int, + finalCoalesceLevel: Int, sparkContext: SparkContext, codec: Option[Class[_ <: CompressionCodec]] ): Unit = { val intermediateRDD = sparkContext .textFile(highCoalescenceLevelFolder) - .coalesce(finalCoalescenceLevel) + .coalesce(finalCoalesceLevel) codec match { case Some(codec) => diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index 2fbcdf0..478aad5 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -226,6 +226,46 @@ class SparkHelperTest HdfsHelper.deleteFolder(keyValueFolder) } + test("Save as text file and reduce nbr of partitions") { + + val testFolder = s"$resourceFolder/folder" + + HdfsHelper.deleteFolder(testFolder) + + val rddToStore = + sc.parallelize(Array("data_a", "data_b", "data_c")).repartition(3) + + // 1: Without compressing: + + rddToStore.saveAsTextFileAndCoalesce(testFolder, 2) + + // Let's check the nbr of partitions: + var genratedKeyFiles = HdfsHelper.listFileNamesInFolder(testFolder) + var expectedKeyFiles = List("_SUCCESS", "part-00000", "part-00001") + assert(genratedKeyFiles === expectedKeyFiles) + + // And let's check the content: + var singleFileStoredData = sc.textFile(testFolder).collect().sorted + assert(singleFileStoredData === Array("data_a", "data_b", "data_c")) + + HdfsHelper.deleteFolder(testFolder) + + // 2: By compressing: + + rddToStore.saveAsTextFileAndCoalesce(testFolder, 2, classOf[GzipCodec]) + + // Let's check the nbr of partitions: + genratedKeyFiles = HdfsHelper.listFileNamesInFolder(testFolder) + expectedKeyFiles = List("_SUCCESS", "part-00000.gz", "part-00001.gz") + assert(genratedKeyFiles === expectedKeyFiles) + + // And let's check the content: + singleFileStoredData = sc.textFile(testFolder).collect().sorted + assert(singleFileStoredData === Array("data_a", "data_b", "data_c")) + + HdfsHelper.deleteFolder(testFolder) + } + test("Decrease coalescence level") { 
HdfsHelper.deleteFolder("src/test/resources/re_coalescence_test_input") From fdc854429b909f3e2180cec5947af996f4b8f135 Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Sat, 26 May 2018 12:12:15 +0100 Subject: [PATCH 07/25] Place decreaseCoalescence in the SparkContextExtensions implicit class --- .../scala/com/spark_helper/SparkHelper.scala | 167 +++++++++--------- .../com/spark_helper/SparkHelperTest.scala | 43 ++--- 2 files changed, 98 insertions(+), 112 deletions(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 8c253d4..c5658f5 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -395,92 +395,89 @@ object SparkHelper extends Serializable { ) .map { case (_, text) => text.toString } } - } - /** Decreases the nbr of partitions of a folder. - * - * This is often handy when the last step of your job needs to run on - * thousands of files, but you want to store your final output on let's say - * only 300 files. - * - * It's like a FileUtil.copyMerge, but the merging produces more than one - * file. - * - * Be aware that this methods deletes the provided input folder. - * - * {{{ - * SparkHelper.decreaseCoalescence( - * "/folder/path/with/2000/files", - * "/produced/folder/path/with/only/300/files", - * 300, - * sparkContext) - * }}} - * - * @param highCoalescenceLevelFolder the folder which contains 10000 files - * @param lowerCoalescenceLevelFolder the folder which will contain the same - * data as highCoalescenceLevelFolder but spread on only 300 files (where 300 - * is the finalCoalesceLevel parameter). - * @param finalCoalesceLevel the nbr of files within the folder at the end - * of this method. - * @param sparkContext the SparkContext - */ - def decreaseCoalescence( - highCoalescenceLevelFolder: String, - lowerCoalescenceLevelFolder: String, - finalCoalesceLevel: Int, - sparkContext: SparkContext - ): Unit = - decreaseCoalescenceInternal( - highCoalescenceLevelFolder, - lowerCoalescenceLevelFolder, - finalCoalesceLevel, - sparkContext, - None) - - /** Decreases the nbr of partitions of a folder. - * - * This is often handy when the last step of your job needs to run on - * thousands of files, but you want to store your final output on let's say - * only 300 files. - * - * It's like a FileUtil.copyMerge, but the merging produces more than one - * file. - * - * Be aware that this methods deletes the provided input folder. - * - * {{{ - * SparkHelper.decreaseCoalescence( - * "/folder/path/with/2000/files", - * "/produced/folder/path/with/only/300/files", - * 300, - * sparkContext, - * classOf[BZip2Codec]) - * }}} - * - * @param highCoalescenceLevelFolder the folder which contains 10000 files - * @param lowerCoalescenceLevelFolder the folder which will contain the same - * data as highCoalescenceLevelFolder but spread on only 300 files (where 300 - * is the finalCoalesceLevel parameter). - * @param finalCoalesceLevel the nbr of files within the folder at the end - * of this method. 
- * @param sparkContext the SparkContext
- * @param codec the type of compression to use (for instance
- * classOf[BZip2Codec] or classOf[GzipCodec]))
- */
- def decreaseCoalescence(
- highCoalescenceLevelFolder: String,
- lowerCoalescenceLevelFolder: String,
- finalCoalesceLevel: Int,
- sparkContext: SparkContext,
- codec: Class[_ <: CompressionCodec]
- ): Unit =
- decreaseCoalescenceInternal(
- highCoalescenceLevelFolder,
- lowerCoalescenceLevelFolder,
- finalCoalesceLevel,
- sparkContext,
- Some(codec)
- )
+ /** Decreases the nbr of partitions of a folder.
+ *
+ * This comes in handy when the last step of your job needs to run on
+ * thousands of files, but you want to store your final output on let's say
+ * only 30 files.
+ *
+ * It's like a FileUtil.copyMerge()
+ * , but the merging produces more than one file.
+ *
+ * Be aware that this method deletes the provided input folder.
+ *
+ * {{{
+ * sc.decreaseCoalescence(
+ * "/folder/path/with/2000/files",
+ * "/produced/folder/path/with/only/30/files",
+ * 30
+ * )
+ * }}}
+ *
+ * @param highCoalescenceLevelFolder the folder which contains 10000 files
+ * @param lowerCoalescenceLevelFolder the folder which will contain the same
+ * data as highCoalescenceLevelFolder but spread on only 30 files (where 30
+ * is the finalCoalesceLevel parameter).
+ * @param finalCoalesceLevel the nbr of files within the folder at the end
+ * of this method.
+ */
+ def decreaseCoalescence(
+ highCoalescenceLevelFolder: String,
+ lowerCoalescenceLevelFolder: String,
+ finalCoalesceLevel: Int
+ ): Unit =
+ SparkHelper.decreaseCoalescenceInternal(
+ highCoalescenceLevelFolder,
+ lowerCoalescenceLevelFolder,
+ finalCoalesceLevel,
+ sc,
+ None
+ )
+
+ /** Decreases the nbr of partitions of a folder.
+ *
+ * This comes in handy when the last step of your job needs to run on
+ * thousands of files, but you want to store your final output on let's say
+ * only 30 files.
+ *
+ * It's like a FileUtil.copyMerge()
+ * , but the merging produces more than one file.
+ *
+ * Be aware that this method deletes the provided input folder.
+ *
+ * {{{
+ * sc.decreaseCoalescence(
+ * "/folder/path/with/2000/files",
+ * "/produced/folder/path/with/only/30/files",
+ * 30,
+ * classOf[BZip2Codec]
+ * )
+ * }}}
+ *
+ * @param highCoalescenceLevelFolder the folder which contains 10000 files
+ * @param lowerCoalescenceLevelFolder the folder which will contain the same
+ * data as highCoalescenceLevelFolder but spread on only 30 files (where 30
+ * is the finalCoalesceLevel parameter).
+ * @param finalCoalesceLevel the nbr of files within the folder at the end
+ * of this method.
+ * @param codec the type of compression to use (for instance
+ * classOf[BZip2Codec] or classOf[GzipCodec]))
+ */
+ def decreaseCoalescence(
+ highCoalescenceLevelFolder: String,
+ lowerCoalescenceLevelFolder: String,
+ finalCoalesceLevel: Int,
+ codec: Class[_ <: CompressionCodec]
+ ): Unit =
+ SparkHelper.decreaseCoalescenceInternal(
+ highCoalescenceLevelFolder,
+ lowerCoalescenceLevelFolder,
+ finalCoalesceLevel,
+ sc,
+ Some(codec)
+ )
+ }

 /** Equivalent to sparkContext.textFile(), but for each line is associated
 * with its file path. 
diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index 478aad5..2af3b4d 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -268,42 +268,30 @@ class SparkHelperTest test("Decrease coalescence level") { - HdfsHelper.deleteFolder("src/test/resources/re_coalescence_test_input") - HdfsHelper.deleteFolder("src/test/resources/re_coalescence_test_output") + val inputTestFolder = s"$resourceFolder/re_coalescence_test_input" + val outputTestFolder = s"$resourceFolder/re_coalescence_test_output" + + HdfsHelper.deleteFolder(inputTestFolder) + HdfsHelper.deleteFolder(outputTestFolder) // Let's create the folder with high level of coalescence (3 files): - sc.parallelize[String](Array("data_1_a", "data_1_b", "data_1_c")) - .saveAsSingleTextFile( - "src/test/resources/re_coalescence_test_input/input_file_1" - ) - sc.parallelize[String](Array("data_2_a", "data_2_b")) - .saveAsSingleTextFile( - "src/test/resources/re_coalescence_test_input/input_file_2" - ) - sc.parallelize[String](Array("data_3_a", "data_3_b", "data_3_c")) - .saveAsSingleTextFile( - "src/test/resources/re_coalescence_test_input/input_file_3" - ) + sc.parallelize(Array("data_1_a", "data_1_b", "data_1_c")) + .saveAsSingleTextFile(s"$inputTestFolder/input_file_1") + sc.parallelize(Array("data_2_a", "data_2_b")) + .saveAsSingleTextFile(s"$inputTestFolder/input_file_2") + sc.parallelize(Array("data_3_a", "data_3_b", "data_3_c")) + .saveAsSingleTextFile(s"$inputTestFolder/input_file_3") // Let's decrease the coalescence level in order to only have 2 files: - SparkHelper.decreaseCoalescence( - "src/test/resources/re_coalescence_test_input", - "src/test/resources/re_coalescence_test_output", - 2, - sc) + sc.decreaseCoalescence(inputTestFolder, outputTestFolder, 2) // And we check we have two files in output: - val outputFileList = HdfsHelper - .listFileNamesInFolder("src/test/resources/re_coalescence_test_output") + val outputFileList = HdfsHelper.listFileNamesInFolder(outputTestFolder) val expectedFileList = List("_SUCCESS", "part-00000", "part-00001") assert(outputFileList === expectedFileList) // And that all input data is in the output: - val outputData = sc - .textFile("src/test/resources/re_coalescence_test_output") - .collect - .sorted - + val outputData = sc.textFile(outputTestFolder).collect.sorted val expectedOutputData = Array( "data_1_a", "data_1_b", @@ -316,7 +304,8 @@ class SparkHelperTest ) assert(outputData === expectedOutputData) - HdfsHelper.deleteFolder("src/test/resources/re_coalescence_test_output") + HdfsHelper.deleteFolder(inputTestFolder) + HdfsHelper.deleteFolder(outputTestFolder) } test( From 46318ef2a6b06b6e189549e369d958f586a05fee Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Sat, 26 May 2018 12:30:07 +0100 Subject: [PATCH 08/25] Place textFileWithFileName in the SparkContextExtensions implicit class --- .../scala/com/spark_helper/SparkHelper.scala | 105 +++++++++--------- .../com/spark_helper/SparkHelperTest.scala | 20 ++-- 2 files changed, 63 insertions(+), 62 deletions(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index c5658f5..2ed582e 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -396,6 +396,56 @@ object SparkHelper extends Serializable { .map { case (_, text) => text.toString } } + /** Equivalent to 
sparkContext.textFile() + * , but each record is associated with the file path it comes from. + * + * Produces an RDD[(file_name, line)] + * which provides a way to know from which file a given line comes from. + * + * {{{ + * // Considering this folder: + * // folder/file_1.txt whose content is data1\ndata2\ndata3 + * // folder/file_2.txt whose content is data4\ndata4 + * // folder/folder_1/file_3.txt whose content is data6\ndata7 + * // then: + * sc.textFileWithFileName("folder") + * // will return: + * RDD( + * ("file:/path/on/machine/folder/file_1.txt", "data1"), + * ("file:/path/on/machine/folder/file_1.txt", "data2"), + * ("file:/path/on/machine/folder/file_1.txt", "data3"), + * ("file:/path/on/machine/folder/file_2.txt", "data4"), + * ("file:/path/on/machine/folder/file_2.txt", "data5"), + * ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"), + * ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7") + * ) + * }}} + * + * @param path the path of the folder (or structure of folders) to read + * @return the RDD of records where a record is a tuple containing the path + * of the file the record comes from and the record itself. + */ + def textFileWithFileName(path: String): RDD[(String, String)] = { + + // In order to go through the folder structure recursively: + sc.hadoopConfiguration + .set("mapreduce.input.fileinputformat.input.dir.recursive", "true") + + sc.hadoopFile( + path, + classOf[TextInputFormat2], + classOf[LongWritable], + classOf[Text], + sc.defaultMinPartitions + ) + .asInstanceOf[HadoopRDD[LongWritable, Text]] + .mapPartitionsWithInputSplit { + case (inputSplit, iterator) => + val file = inputSplit.asInstanceOf[FileSplit] + iterator.map(tpl => (file.getPath.toString, tpl._2.toString)) + } + } + /** Decreases the nbr of partitions of a folder. * * This comes in handy when the last step of your job needs to run on @@ -479,61 +529,6 @@ object SparkHelper extends Serializable { ) } - /** Equivalent to sparkContext.textFile(), but for each line is associated - * with its file path. - * - * Produces a RDD[(file_name, line)] which provides a way to know from which - * file a given line comes from. - * - * {{{ - * // Considering this folder: - * // folder/file_1.txt whose content is data1\ndata2\ndata3 - * // folder/file_2.txt whose content is data4\ndata4 - * // folder/folder_1/file_3.txt whose content is data6\ndata7 - * // then: - * SparkHelper.textFileWithFileName("folder", sparkContext) - * // will return: - * RDD( - * ("file:/path/on/machine/folder/file_1.txt", "data1"), - * ("file:/path/on/machine/folder/file_1.txt", "data2"), - * ("file:/path/on/machine/folder/file_1.txt", "data3"), - * ("file:/path/on/machine/folder/file_2.txt", "data4"), - * ("file:/path/on/machine/folder/file_2.txt", "data5"), - * ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"), - * ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7") - * ) - * }}} - * - * @param path the path of the folder (or structure of folders) to read - * @param sparkContext the SparkContext - * @return the RDD of records where a record is a tuple containing the path - * of the file the record comes from and the record itself. 
- */ - def textFileWithFileName( - path: String, - sparkContext: SparkContext - ): RDD[(String, String)] = { - - // In order to go through the folder structure recursively: - sparkContext.hadoopConfiguration - .set("mapreduce.input.fileinputformat.input.dir.recursive", "true") - - sparkContext - .hadoopFile( - path, - classOf[TextInputFormat2], - classOf[LongWritable], - classOf[Text], - sparkContext.defaultMinPartitions - ) - .asInstanceOf[HadoopRDD[LongWritable, Text]] - .mapPartitionsWithInputSplit { - case (inputSplit, iterator) => - val file = inputSplit.asInstanceOf[FileSplit] - iterator.map(tpl => (file.getPath.toString, tpl._2.toString)) - } - } - // Internal core: private def saveAsSingleTextFileWithWorkingFolderInternal( diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index 2af3b4d..a692350 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -312,19 +312,25 @@ class SparkHelperTest "Extract lines of files to an RDD of tuple containing the line and file " + "the line comes from") { - HdfsHelper.deleteFolder("src/test/resources/with_file_name") + val testFolder = s"$resourceFolder/with_file_name" + + HdfsHelper.deleteFolder(testFolder) + HdfsHelper.writeToHdfsFile( "data_1_a\ndata_1_b\ndata_1_c", - "src/test/resources/with_file_name/file_1.txt") + s"$testFolder/file_1.txt" + ) HdfsHelper.writeToHdfsFile( "data_2_a\ndata_2_b", - "src/test/resources/with_file_name/file_2.txt") + s"$testFolder/file_2.txt" + ) HdfsHelper.writeToHdfsFile( "data_3_a\ndata_3_b\ndata_3_c\ndata_3_d", - "src/test/resources/with_file_name/folder_1/file_3.txt") + s"$testFolder/folder_1/file_3.txt" + ) - val computedRdd = SparkHelper - .textFileWithFileName("src/test/resources/with_file_name", sc) + val computedRdd = sc + .textFileWithFileName(testFolder) // We remove the part of the path which is specific to the local machine // on which the test run: .map { @@ -359,6 +365,6 @@ class SparkHelperTest assertRDDEquals(computedRdd, expectedRDD) - HdfsHelper.deleteFolder("src/test/resources/with_file_name") + HdfsHelper.deleteFolder(testFolder) } } From 9b04cbedf7309a8a869a25a75ba15457eedbf26e Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Sat, 26 May 2018 12:42:41 +0100 Subject: [PATCH 09/25] Add code-style to doc --- .../scala/com/spark_helper/SparkHelper.scala | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 2ed582e..1da4b10 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -21,8 +21,8 @@ import scala.util.Random * A few exemples: * * {{{ - * // Same as SparkContext.saveAsTextFile, but the result is a single file: - * SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt") + * // Same as sc.saveAsTextFile(path), but the result is a single file: + * rdd.saveAsSingleTextFile("/my/output/file/path.txt") * // Same as SparkContext.textFile, but instead of reading one record per * // line, it reads records spread over several lines. * // This way, xml, json, yml or any multi-line record file format can be used @@ -78,10 +78,10 @@ object SparkHelper extends Serializable { * Allows one to save an RDD in one file, while keeping the processing * parallelized. 
* - * This variant of saveAsSingleTextFile performs the storage in a temporary - * folder instead of directly in the final output folder. This way the - * risks of having corrupted files in the real output folder due to cluster - * interruptions is minimized. + * This variant of saveAsSingleTextFile + * performs the storage in a temporary folder instead of directly in the + * final output folder. This way the risks of having corrupted files in the + * real output folder due to cluster interruptions is minimized. * * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path") }}} * @@ -102,10 +102,10 @@ object SparkHelper extends Serializable { * Allows one to save an RDD in one file, while keeping the processing * parallelized. * - * This variant of saveAsSingleTextFile performs the storage in a temporary - * folder instead of directly in the final output folder. This way the risks - * of having corrupted files in the real output folder due to cluster - * interruptions is minimized. + * This variant of saveAsSingleTextFile + * performs the storage in a temporary folder instead of directly in the + * final output folder. This way the risks of having corrupted files in the + * real output folder due to cluster interruptions is minimized. * * {{{ * rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path", classOf[BZip2Codec]) @@ -174,7 +174,7 @@ object SparkHelper extends Serializable { ) } - /** Saves as text file, but by decreasing the nbr of partitions of the output. + /** Saves as text file, and decreases the nbr of output partitions. * * Same as rdd.saveAsTextFile() * , but decreases the nbr of partitions in the output folder before doing @@ -234,7 +234,7 @@ object SparkHelper extends Serializable { * * As this internally needs to know the nbr of keys, this will have to * compute it. If this nbr of keys is known beforehand, it would spare - * resources to use saveAsTextFileByKey(path: String, keyNbr: Int) + * resources to use saveAsTextFileByKey(path: String, keyNbr: Int) * instead. * * This is not scalable. This shouldn't be considered for any data flow @@ -274,7 +274,7 @@ object SparkHelper extends Serializable { * As this internally needs to know the nbr of keys, this will have to * compute it. If this nbr of keys is known beforehand, it would spare * resources to use - * saveAsTextFileByKey(path: String, keyNbr: Int, codec: Class[_ <: CompressionCodec]) + * saveAsTextFileByKey(path: String, keyNbr: Int, codec: Class[_ <: CompressionCodec]) * instead. * * This is not scalable. This shouldn't be considered for any data flow @@ -320,17 +320,20 @@ object SparkHelper extends Serializable { implicit class SparkContextExtensions(val sc: SparkContext) extends AnyVal { - /** Equivalent to sparkContext.textFile(), but for a specific record delimiter. + /** Equivalent to sparkContext.textFile() + * , but for a specific record delimiter. * - * By default, sparkContext.textFile() will provide one record per line - * (per '\n'). But what if the format to read considers that one record - * is stored in more than one line (yml, custom format, ...)? + * By default, sparkContext.textFile() + * will provide one record per line (per '\n'). + * But what if the format to read considers that one record is stored in + * more than one line (yml, custom format, ...)? * * For instance in order to read a yml file, which is a format for which a * record (a single entity) is spread other several lines, you can modify - * the record delimiter with "---\n" instead of "\n". 
Same goes when
- * reading an xml file where a record might be spread over several lines or
- * worse the whole xml file is one line.
+ * the record delimiter with "---\n"
+ * instead of "\n".
+ * Same goes when reading an xml file where a record might be spread over
+ * several lines or worse the whole xml file is one line.
 *
 * {{{
 * // Let's say data we want to use with Spark looks like this (one record

From 22cdff8ebfc93bf9b9467f5a625029c25ad067a8 Mon Sep 17 00:00:00 2001
From: Xavier GUIHOT
Date: Sat, 26 May 2018 15:27:22 +0100
Subject: [PATCH 10/25] Add a sc.textFile which reads files containing commas in their name

---
 .../scala/com/spark_helper/SparkHelper.scala | 43 ++++++++++++++-
 .../org/apache/spark/TextFileOverwrite.scala | 54 +++++++++++++++++++
 .../com/spark_helper/SparkHelperTest.scala | 29 +++++++++-
 3 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 src/main/scala/org/apache/spark/TextFileOverwrite.scala

diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala
index 1da4b10..73b5ccc 100644
--- a/src/main/scala/com/spark_helper/SparkHelper.scala
+++ b/src/main/scala/com/spark_helper/SparkHelper.scala
@@ -1,5 +1,7 @@
 package com.spark_helper
 
+import org.apache.spark.TextFileOverwrite
+
 import org.apache.spark.{HashPartitioner, SparkContext}
 import org.apache.spark.rdd.{RDD, HadoopRDD}
 import org.apache.hadoop.conf.Configuration
@@ -449,6 +451,43 @@ object SparkHelper extends Serializable {
 }
 }
 
+ /** A replacement for sc.textFile()
+ * when files contain commas in their name.
+ *
+ * As sc.textFile()
+ * allows one to provide several files at once by giving them as a string
+ * which is a list of strings joined with ,,
+ * we can't give it files containing commas in their name.
+ *
+ * This method aims at bypassing this limitation by passing paths as a
+ * sequence of strings.
+ *
+ * {{{ sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) }}}
+ *
+ * @param paths the paths of the file(s)/folder(s) to read
+ */
+ def textFile(paths: Seq[String]): RDD[String] =
+ TextFileOverwrite.textFile(paths, sc.defaultMinPartitions, sc)
+
+ /** A replacement for sc.textFile()
+ * when files contain commas in their name.
+ *
+ * As sc.textFile()
+ * allows one to provide several files at once by giving them as a string
+ * which is a list of strings joined with ,,
+ * we can't give it files containing commas in their name.
+ *
+ * This method aims at bypassing this limitation by passing paths as a
+ * sequence of strings.
+ *
+ * {{{ sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) }}}
+ *
+ * @param paths the paths of the file(s)/folder(s) to read
+ * @param minPartitions the nbr of partitions in which to split the input
+ */
+ def textFile(paths: Seq[String], minPartitions: Int): RDD[String] =
+ TextFileOverwrite.textFile(paths, minPartitions, sc)
+
 /** Decreases the nbr of partitions of a folder. 
* * This comes in handy when the last step of your job needs to run on @@ -643,11 +682,11 @@ object SparkHelper extends Serializable { highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, finalCoalesceLevel: Int, - sparkContext: SparkContext, + sc: SparkContext, codec: Option[Class[_ <: CompressionCodec]] ): Unit = { - val intermediateRDD = sparkContext + val intermediateRDD = sc .textFile(highCoalescenceLevelFolder) .coalesce(finalCoalesceLevel) diff --git a/src/main/scala/org/apache/spark/TextFileOverwrite.scala b/src/main/scala/org/apache/spark/TextFileOverwrite.scala new file mode 100644 index 0000000..8d1cc1b --- /dev/null +++ b/src/main/scala/org/apache/spark/TextFileOverwrite.scala @@ -0,0 +1,54 @@ +package org.apache.spark + +import org.apache.spark.rdd.{RDD, HadoopRDD} +import org.apache.spark.util.SerializableConfiguration +import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat} +import org.apache.hadoop.io.{LongWritable, Text} +import org.apache.hadoop.fs.Path + +object TextFileOverwrite { + + def textFile( + paths: Seq[String], + minPartitions: Int, + sc: SparkContext + ): RDD[String] = { + + /* Private notes: + * + * * Compared to sc.textFile(), the only difference in the implementation is + * the call to FileInputFormat.setInputPaths which takes Paths in input + * instead of a comma-separated String. + * + * * I use the package org.apache.spark to store this function, because + * SerializableConfiguration has the visibility private[spark] in spark's + * code base. + * + * * I would have preferred giving Seq[Path] instead of Seq[String] as an + * input of this method, but Path is not yet Serializable in the current + * version of hadoop-common used by Spark (it will become Serializable + * starting version 3 of hadoop-common). + * + * * I don't String* (instead of Seq[String]) as for 1 String only it would + * confuse the compiler as to which sc.textFile to use (the default one or + * this one). 
+ */ + + val confBroadcast = + sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration)) + + val setInputPathsFunc = + (jobConf: JobConf) => + FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*) + + new HadoopRDD( + sc, + confBroadcast, + Some(setInputPathsFunc), + classOf[TextInputFormat], + classOf[LongWritable], + classOf[Text], + minPartitions + ).map(pair => pair._2.toString) + } +} diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index a692350..cbcea8d 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -342,7 +342,7 @@ class SparkHelperTest (nonLocalPath, line) } - val expectedRDD = sc.parallelize( + val expectedRdd = sc.parallelize( Array( ("file:/.../src/test/resources/with_file_name/file_1.txt", "data_1_a"), ("file:/.../src/test/resources/with_file_name/file_1.txt", "data_1_b"), @@ -363,7 +363,32 @@ class SparkHelperTest ("file:/.../src/test/resources/with_file_name/file_2.txt", "data_2_b") )) - assertRDDEquals(computedRdd, expectedRDD) + assertRDDEquals(computedRdd, expectedRdd) + + HdfsHelper.deleteFolder(testFolder) + } + + test("textFile with files containing commas in their path") { + + val testFolder = s"$resourceFolder/files_containing_commas" + + HdfsHelper.deleteFolder(testFolder) + + HdfsHelper.writeToHdfsFile( + "data_1_a\ndata_1_b", + s"$testFolder/file,1.txt" + ) + HdfsHelper.writeToHdfsFile( + "data_2_a\ndata_2_b", + s"$testFolder/file_2.txt" + ) + + val computedRdd = + sc.textFile(List(s"$testFolder/file,1.txt", s"$testFolder/file_2.txt")) + val expectedRdd = + sc.parallelize("data_1_a\ndata_1_b\ndata_2_a\ndata_2_b".split("\n")) + + assertRDDEquals(computedRdd, expectedRdd) HdfsHelper.deleteFolder(testFolder) } From f4007de198726c853f66a73c65af2c69362aa4bc Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Sun, 27 May 2018 09:07:38 +0100 Subject: [PATCH 11/25] Add as comment an alternative way of getting RDD with records' file name --- src/main/scala/com/spark_helper/SparkHelper.scala | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 73b5ccc..c8c1c9c 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -449,6 +449,18 @@ object SparkHelper extends Serializable { val file = inputSplit.asInstanceOf[FileSplit] iterator.map(tpl => (file.getPath.toString, tpl._2.toString)) } + + /* An other way of doing would be: + * + * import org.apache.spark.sql.functions.input_file_name + * import spark.implicits._ + * + * spark.read + * .text(testFolder) + * .select(input_file_name, $"value") + * .as[(String, String)] + * .rdd + */ } /** A replacement for sc.textFile() From 815faaa849acc2f197ea0c195c7905141fae88da Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Sun, 10 Jun 2018 09:57:08 +0100 Subject: [PATCH 12/25] Add todos --- src/main/scala/com/spark_helper/SparkHelper.scala | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index c8c1c9c..e762697 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -1,7 +1,6 @@ package com.spark_helper import org.apache.spark.TextFileOverwrite - import 
org.apache.spark.{HashPartitioner, SparkContext}
 import org.apache.spark.rdd.{RDD, HadoopRDD}
 import org.apache.hadoop.conf.Configuration
@@ -36,6 +35,17 @@ import scala.util.Random
 * SparkHelper.textFileWithFileName("folder", sparkContext)
 * }}}
 *
+ * @todo some kind of partialMap:
+ *
+ * {{{
+ * RDD(1, 3, 2, 7, 8).partMap{ case a if a % 2 == 0 => 2 * a }
+ * res: RDD(1, 3, 4, 7, 16)
+ * in order to avoid:
+ * RDD(1, 3, 2, 7, 8).partMap{ case a if a % 2 == 0 => 2 * a; case a => a }
+ * }}}
+ *
+ * @todo sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T])
+ *
 * Source SparkHelper
 *

From 46edfee6164e4494962fe8f821eaeb68c16f1f60 Mon Sep 17 00:00:00 2001
From: Xavier GUIHOT
Date: Sun, 10 Jun 2018 10:20:43 +0100
Subject: [PATCH 13/25] Pimp RDDs with a flatten method

---
 .../scala/com/spark_helper/SparkHelper.scala | 52 +++++++++++++++----
 .../com/spark_helper/SparkHelperTest.scala | 16 ++++++
 2 files changed, 57 insertions(+), 11 deletions(-)

diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala
index e762697..8718072 100644
--- a/src/main/scala/com/spark_helper/SparkHelper.scala
+++ b/src/main/scala/com/spark_helper/SparkHelper.scala
@@ -11,6 +11,8 @@ import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
 import org.apache.hadoop.mapred.{FileSplit, TextInputFormat => TextInputFormat2}
 
+import scala.reflect.ClassTag
+
 import scala.util.Random
 
 /** A facility to deal with RDD/file manipulations based on the Spark API.
@@ -36,7 +38,7 @@ import scala.util.Random
 * }}}
 *
 * @todo some kind of partialMap:
- *
+ * 
 * {{{
 * RDD(1, 3, 2, 7, 8).partMap{ case a if a % 2 == 0 => 2 * a }
 * res: RDD(1, 3, 4, 7, 16)
@@ -235,6 +237,34 @@ object SparkHelper extends Serializable {
 }
 }
 
+ implicit class SeqRDDExtensions[T: ClassTag](val rdd: RDD[Seq[T]]) {
+
+ /** Flattens an RDD[Seq[T]]
+ * to RDD[T].
+ *
+ * {{{ sc.parallelize(Array(Seq(1, 2, 3), Nil, Seq(4))).flatten == sc.parallelize(Array(1, 2, 3, 4)) }}}
+ *
+ * @return the flat RDD as RDD.flatMap(identity)
+ * or List.flatten
+ * would have.
+ */
+ def flatten(): RDD[T] = rdd.flatMap(identity)
+ }
+
+ implicit class OptionRDDExtensions[T: ClassTag](val rdd: RDD[Option[T]]) {
+
+ /** Flattens an RDD[Option[T]]
+ * to RDD[T].
+ *
+ * {{{ sc.parallelize(Array(Some(1), None, Some(2))).flatten == sc.parallelize(Array(1, 2)) }}}
+ *
+ * @return the flat RDD as RDD.flatMap(x => x)
+ * or List.flatten
+ * would have. 
+ */ + def flatten(): RDD[T] = rdd.flatMap(o => o) + } + implicit class PairRDDExtensions(val rdd: RDD[(String, String)]) extends AnyVal { @@ -461,16 +491,16 @@ object SparkHelper extends Serializable { } /* An other way of doing would be: - * - * import org.apache.spark.sql.functions.input_file_name - * import spark.implicits._ - * - * spark.read - * .text(testFolder) - * .select(input_file_name, $"value") - * .as[(String, String)] - * .rdd - */ + * + * import org.apache.spark.sql.functions.input_file_name + * import spark.implicits._ + * + * spark.read + * .text(testFolder) + * .select(input_file_name, $"value") + * .as[(String, String)] + * .rdd + */ } /** A replacement for sc.textFile() diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index cbcea8d..5f07fc1 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -1,6 +1,7 @@ package com.spark_helper import com.spark_helper.SparkHelper.{RDDExtensions, PairRDDExtensions} +import com.spark_helper.SparkHelper.{SeqRDDExtensions, OptionRDDExtensions} import com.spark_helper.SparkHelper.SparkContextExtensions import org.apache.hadoop.io.compress.GzipCodec @@ -149,6 +150,21 @@ class SparkHelperTest HdfsHelper.deleteFile(xmlFilePath) } + test("Flatten RDD") { + + var in = sc.parallelize(Array(Seq(1, 2, 3), Seq(), Nil, Seq(4), Seq(5, 6))) + var out = sc.parallelize(Array(1, 2, 3, 4, 5, 6)) + assertRDDEquals(in.flatten, out) + + in = sc.parallelize(Array(List(1, 2, 3), List(), Nil, List(4), List(5, 6))) + out = sc.parallelize(Array(1, 2, 3, 4, 5, 6)) + assertRDDEquals(in.flatten, out) + + val in2 = sc.parallelize(Array(Option(1), None, Option(2))) + val out2 = sc.parallelize(Array(1, 2)) + assertRDDEquals(in2.flatten, out2) + } + test("Save as text file by key") { val keyValueFolder = s"$resourceFolder/key_value_storage" From 1e3a065195db8004cf1383fadc1ce10e1493e020 Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Mon, 11 Jun 2018 23:23:26 +0100 Subject: [PATCH 14/25] Pimp RDDs with a partialMap function --- .../scala/com/spark_helper/SparkHelper.scala | 46 ++++++++++++++----- .../com/spark_helper/SparkHelperTest.scala | 12 ++++- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 8718072..ff400f9 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -37,26 +37,48 @@ import scala.util.Random * SparkHelper.textFileWithFileName("folder", sparkContext) * }}} * - * @todo some kind of partialMap: - * - * {{{ - * RDD(1, 3, 2, 7, 8).partMap{ case a if a % 2 == 0 => 2 * a } - * res: RDD(1, 3, 4, 7, 16) - * in order to avoid: - * RDD(1, 3, 2, 7, 8).partMap{ case a if a % 2 == 0 => 2 * a; case a => a } - * }}} - * - * @todo sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T]) - * * Source SparkHelper * + * @todo sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T]) * @author Xavier Guihot * @since 2017-02 */ object SparkHelper extends Serializable { - implicit class RDDExtensions(val rdd: RDD[String]) extends AnyVal { + implicit class RDDExtensions[T: ClassTag](val rdd: RDD[T]) { + + /** Map an RDD to the same type, by applying a partial function and the + * identity otherwise. + * + * Avoids having case x => x. 
+ * + * Similar idea to .collect, + * but instead of skipping non-matching items, keeps then as-is. + * + * {{{ + * sc.parallelize(Array(1, 3, 2, 7, 8)).partialMap { case a if a % 2 == 0 => 2 * a } + * // is equivalent to: + * sc.parallelize(Array(1, 3, 2, 7, 8)).map { + * case a if a % 2 == 0 => 2 * a + * case a => a + * } + * // in order to map to: + * sc.parallelize(Array(1, 3, 4, 7, 16)) + * }}} + * + * @param pf the partial function to apply + * @return an rdd of the same type, for which each element is either the + * application of the partial function where defined or the identity. + */ + def partialMap(pf: PartialFunction[T, T]): RDD[T] = + rdd.map { + case x if pf.isDefinedAt(x) => pf(x) + case x => x + } + } + + implicit class StringRDDExtensions(val rdd: RDD[String]) extends AnyVal { /** Saves an RDD in exactly one file. * diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index 5f07fc1..1ccfba4 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -1,8 +1,8 @@ package com.spark_helper -import com.spark_helper.SparkHelper.{RDDExtensions, PairRDDExtensions} +import com.spark_helper.SparkHelper.{RDDExtensions, StringRDDExtensions} import com.spark_helper.SparkHelper.{SeqRDDExtensions, OptionRDDExtensions} -import com.spark_helper.SparkHelper.SparkContextExtensions +import com.spark_helper.SparkHelper.{SparkContextExtensions, PairRDDExtensions} import org.apache.hadoop.io.compress.GzipCodec @@ -408,4 +408,12 @@ class SparkHelperTest HdfsHelper.deleteFolder(testFolder) } + + test("Partial map") { + + val in = sc.parallelize(Array(1, 3, 2, 7, 8)) + val computedOut = in.partialMap { case a if a % 2 == 0 => 2 * a } + val expetcedOut = sc.parallelize(Array(1, 3, 4, 7, 16)) + assertRDDEquals(computedOut, expetcedOut) + } } From 277f22245938f4abe494df3f89dabd25cda758f6 Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Mon, 11 Jun 2018 23:59:51 +0100 Subject: [PATCH 15/25] Add test for HdfsHelper.compressFile --- .../scala/com/spark_helper/HdfsHelper.scala | 16 ++++++------- .../scala/com/spark_helper/SparkHelper.scala | 2 +- .../com/spark_helper/HdfsHelperTest.scala | 24 +++++++++++++++++++ 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/main/scala/com/spark_helper/HdfsHelper.scala b/src/main/scala/com/spark_helper/HdfsHelper.scala index f3dc3ed..a9038bd 100644 --- a/src/main/scala/com/spark_helper/HdfsHelper.scala +++ b/src/main/scala/com/spark_helper/HdfsHelper.scala @@ -340,8 +340,7 @@ object HdfsHelper extends Serializable { FileSystem .get(new Configuration()) .listStatus(new Path(hdfsPath)) - .flatMap(status => { - + .flatMap { status => // If it's a file: if (status.isFile) { if (onlyName) List(status.getPath.getName) @@ -352,11 +351,12 @@ object HdfsHelper extends Serializable { listFileNamesInFolder( hdfsPath + "/" + status.getPath.getName, true, - onlyName) + onlyName + ) // If it's a dir and we're not in a recursive option: else Nil - }) + } .toList .sorted } @@ -661,8 +661,8 @@ object HdfsHelper extends Serializable { val ClassOfBZip2 = classOf[BZip2Codec] val outputPath = compressionCodec match { - case ClassOfGzip => inputPath + ".gz" - case ClassOfBZip2 => inputPath + ".bz2" + case ClassOfGzip => s"$inputPath.gz" + case ClassOfBZip2 => s"$inputPath.bz2" } val inputStream = fileSystem.open(new Path(inputPath)) @@ -747,8 +747,8 @@ object HdfsHelper extends Serializable { val fileSystem = FileSystem.get(new 
Configuration()) val tmpOutputPath = workingFolderPath match { - case "" => filePath + ".tmp" - case _ => workingFolderPath + "/xml.tmp" + case "" => s"$filePath.tmp" + case _ => s"$workingFolderPath/xml.tmp" } deleteFile(tmpOutputPath) diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index ff400f9..075c22f 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -54,7 +54,7 @@ object SparkHelper extends Serializable { * Avoids having case x => x. * * Similar idea to .collect, - * but instead of skipping non-matching items, keeps then as-is. + * but instead of skipping non-matching items, it keeps them as-is. * * {{{ * sc.parallelize(Array(1, 3, 2, 7, 8)).partialMap { case a if a % 2 == 0 => 2 * a } diff --git a/src/test/scala/com/spark_helper/HdfsHelperTest.scala b/src/test/scala/com/spark_helper/HdfsHelperTest.scala index 1935e89..8c807d9 100644 --- a/src/test/scala/com/spark_helper/HdfsHelperTest.scala +++ b/src/test/scala/com/spark_helper/HdfsHelperTest.scala @@ -1,5 +1,7 @@ package com.spark_helper +import org.apache.hadoop.io.compress.GzipCodec + import com.holdenkarau.spark.testing.SharedSparkContext import org.scalatest.FunSuite @@ -11,6 +13,8 @@ import org.scalatest.FunSuite */ class HdfsHelperTest extends FunSuite with SharedSparkContext { + val resourceFolder = "src/test/resources" + test("Delete file/folder") { // Let's try to delete a file: @@ -538,4 +542,24 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { HdfsHelper.deleteFolder("src/test/resources/folder_to_purge") } + + test("Compress hdfs file") { + + val testFolder = s"$resourceFolder/folder" + val filePath = s"$testFolder/file.txt" + + HdfsHelper.deleteFile(filePath) + + HdfsHelper.writeToHdfsFile("hello\nworld", filePath) + HdfsHelper.compressFile(filePath, classOf[GzipCodec], true) + + assert(HdfsHelper.fileExists(s"$filePath.gz")) + + // Easy to test with spark, as reading a file with the ".gz" extention + // forces the read with the compression codec: + val content = sc.textFile(s"$filePath.gz").collect.sorted + assert(content === Array("hello", "world")) + + HdfsHelper.deleteFolder(testFolder) + } } From f108bacfba679d1271ae40851da7ef8d93cbfe2f Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Tue, 12 Jun 2018 22:10:12 +0100 Subject: [PATCH 16/25] Clean-up of HdfsHelperTest --- .../com/spark_helper/HdfsHelperTest.scala | 393 +++++++----------- 1 file changed, 161 insertions(+), 232 deletions(-) diff --git a/src/test/scala/com/spark_helper/HdfsHelperTest.scala b/src/test/scala/com/spark_helper/HdfsHelperTest.scala index 8c807d9..9babb0b 100644 --- a/src/test/scala/com/spark_helper/HdfsHelperTest.scala +++ b/src/test/scala/com/spark_helper/HdfsHelperTest.scala @@ -14,61 +14,66 @@ import org.scalatest.FunSuite class HdfsHelperTest extends FunSuite with SharedSparkContext { val resourceFolder = "src/test/resources" + val testFolder = s"$resourceFolder/folder" test("Delete file/folder") { + val filePath = s"$testFolder/file.txt" + // Let's try to delete a file: - HdfsHelper.writeToHdfsFile("", "src/test/resources/file_to_delete.txt") + HdfsHelper.createEmptyHdfsFile(filePath) // 1: Let's try to delete it with the deleteFolder method: var messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.deleteFolder("src/test/resources/file_to_delete.txt") + HdfsHelper.deleteFolder(filePath) } var expectedMessage = "requirement failed: to delete a file, prefer using the " + 
"deleteFile() method." assert(messageThrown.getMessage === expectedMessage) - assert(HdfsHelper.fileExists("src/test/resources/file_to_delete.txt")) + assert(HdfsHelper.fileExists(filePath)) // 2: Let's delete it with the deleteFile method: - HdfsHelper.deleteFile("src/test/resources/file_to_delete.txt") - assert(!HdfsHelper.fileExists("src/test/resources/file_to_delete.txt")) + HdfsHelper.deleteFile(filePath) + assert(!HdfsHelper.fileExists(filePath)) // Let's try to delete a folder: - HdfsHelper - .writeToHdfsFile("", "src/test/resources/folder_to_delete/file.txt") + HdfsHelper.createEmptyHdfsFile(s"$testFolder/file.txt") // 3: Let's try to delete it with the deleteFile method: messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.deleteFile("src/test/resources/folder_to_delete") + HdfsHelper.deleteFile(testFolder) } expectedMessage = "requirement failed: to delete a folder, prefer using the " + "deleteFolder() method." assert(messageThrown.getMessage === expectedMessage) - assert(HdfsHelper.folderExists("src/test/resources/folder_to_delete")) + assert(HdfsHelper.folderExists(testFolder)) // 4: Let's delete it with the deleteFolder method: - HdfsHelper.deleteFolder("src/test/resources/folder_to_delete") - assert(!HdfsHelper.folderExists("src/test/resources/folder_to_delete")) + HdfsHelper.deleteFolder(testFolder) + assert(!HdfsHelper.folderExists(testFolder)) } test("File/folder exists") { - HdfsHelper.deleteFile("src/test/resources/file_to_check.txt") - HdfsHelper.deleteFolder("src/test/resources/folder_to_check") + val folderPath = s"$resourceFolder/folder" + val filePath = s"$folderPath/file.txt" + + HdfsHelper.deleteFile(filePath) + HdfsHelper.deleteFolder(folderPath) // Let's try to check if a file exists: - assert(!HdfsHelper.fileExists("src/test/resources/file_to_check.txt")) + assert(!HdfsHelper.fileExists(filePath)) - HdfsHelper.writeToHdfsFile("", "src/test/resources/file_to_check.txt") + HdfsHelper.createEmptyHdfsFile(filePath) // 1: Let's try to check it exists with the folderExists method: var messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.folderExists("src/test/resources/file_to_check.txt") + HdfsHelper.folderExists(filePath) } var expectedMessage = "requirement failed: to check if a file exists, prefer using the " + @@ -76,18 +81,18 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { assert(messageThrown.getMessage === expectedMessage) // 2: Let's try to check it exists with the fileExists method: - assert(HdfsHelper.fileExists("src/test/resources/file_to_check.txt")) + assert(HdfsHelper.fileExists(filePath)) // Let's try to check if a folder exists: - assert(!HdfsHelper.folderExists("src/test/resources/folder_to_check")) + HdfsHelper.deleteFolder(folderPath) + assert(!HdfsHelper.folderExists(folderPath)) - HdfsHelper - .writeToHdfsFile("", "src/test/resources/folder_to_check/file.txt") + HdfsHelper.createEmptyHdfsFile(filePath) // 3: Let's try to check it exists with the fileExists method: messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.fileExists("src/test/resources/folder_to_check") + HdfsHelper.fileExists(folderPath) } expectedMessage = "requirement failed: to check if a folder exists, prefer using " + @@ -95,377 +100,306 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { assert(messageThrown.getMessage === expectedMessage) // 2: Let's try to check it exists with the folderExists method: - assert(HdfsHelper.folderExists("src/test/resources/folder_to_check")) + 
assert(HdfsHelper.folderExists(folderPath)) - HdfsHelper.deleteFile("src/test/resources/file_to_check.txt") - HdfsHelper.deleteFolder("src/test/resources/folder_to_check") + HdfsHelper.deleteFile(filePath) + HdfsHelper.deleteFolder(folderPath) } test("Create an empty file on hdfs") { - HdfsHelper.deleteFile("src/test/resources/empty_file.token") + val filePath = s"$testFolder/empty_file.token" - HdfsHelper.createEmptyHdfsFile("src/test/resources/empty_file.token") + HdfsHelper.deleteFile(filePath) - assert(HdfsHelper.fileExists("src/test/resources/empty_file.token")) + HdfsHelper.createEmptyHdfsFile(filePath) - val tokenContent = sc - .textFile("src/test/resources/empty_file.token") - .collect() - .sorted - .mkString("\n") + assert(HdfsHelper.fileExists(filePath)) + val tokenContent = sc.textFile(filePath).collect().sorted.mkString("\n") assert(tokenContent === "") - HdfsHelper.deleteFile("src/test/resources/empty_file.token") + HdfsHelper.deleteFile(filePath) } test( "Save text in HDFS file with the fileSystem API instead of the Spark API") { + val filePath = s"$testFolder/small_file.txt" + // 1: Stores using a "\n"-joined string: - HdfsHelper.deleteFile("src/test/resources/folder/small_file.txt") + HdfsHelper.deleteFile(filePath) val contentToStore = "Hello World\nWhatever" - HdfsHelper.writeToHdfsFile( - contentToStore, - "src/test/resources/folder/small_file.txt") + HdfsHelper.writeToHdfsFile(contentToStore, filePath) - assert(HdfsHelper.fileExists("src/test/resources/folder/small_file.txt")) - - var storedContent = sc - .textFile("src/test/resources/folder/small_file.txt") - .collect() - .sorted - .mkString("\n") + assert(HdfsHelper.fileExists(filePath)) + var storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") assert(storedContent === contentToStore) - HdfsHelper.deleteFolder("src/test/resources/folder") + HdfsHelper.deleteFolder(testFolder) // 2: Stores using a list of strings to be "\n"-joined: - HdfsHelper.deleteFile("src/test/resources/folder/small_file.txt") + HdfsHelper.deleteFile(filePath) val listToStore = List("Hello World", "Whatever") + HdfsHelper.writeToHdfsFile(listToStore, filePath) - HdfsHelper - .writeToHdfsFile(listToStore, "src/test/resources/folder/small_file.txt") - - assert(HdfsHelper.fileExists("src/test/resources/folder/small_file.txt")) - - storedContent = sc - .textFile("src/test/resources/folder/small_file.txt") - .collect() - .sorted - .mkString("\n") + assert(HdfsHelper.fileExists(filePath)) + storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") assert(storedContent === listToStore.mkString("\n")) - HdfsHelper.deleteFolder("src/test/resources/folder") + HdfsHelper.deleteFolder(testFolder) } test("List file names in Hdfs folder") { - HdfsHelper.writeToHdfsFile("", "src/test/resources/folder_1/file_1.txt") - HdfsHelper.writeToHdfsFile("", "src/test/resources/folder_1/file_2.csv") - HdfsHelper - .writeToHdfsFile("", "src/test/resources/folder_1/folder_2/file_3.txt") + val folder1 = s"$resourceFolder/folder_1" + + HdfsHelper.createEmptyHdfsFile(s"$folder1/file_1.txt") + HdfsHelper.createEmptyHdfsFile(s"$folder1/file_2.csv") + HdfsHelper.createEmptyHdfsFile(s"$folder1/folder_2/file_3.txt") // 1: Not recursive, names only: - var fileNames = - HdfsHelper.listFileNamesInFolder("src/test/resources/folder_1") + var fileNames = HdfsHelper.listFileNamesInFolder(folder1) var expectedFileNames = List("file_1.txt", "file_2.csv") assert(fileNames === expectedFileNames) // 2: Not recursive, full paths: - fileNames = HdfsHelper - 
.listFileNamesInFolder("src/test/resources/folder_1", onlyName = false) - expectedFileNames = List( - "src/test/resources/folder_1/file_1.txt", - "src/test/resources/folder_1/file_2.csv" - ) + fileNames = HdfsHelper.listFileNamesInFolder(folder1, onlyName = false) + expectedFileNames = List(s"$folder1/file_1.txt", s"$folder1/file_2.csv") assert(fileNames === expectedFileNames) // 3: Recursive, names only: - fileNames = HdfsHelper - .listFileNamesInFolder("src/test/resources/folder_1", recursive = true) + fileNames = HdfsHelper.listFileNamesInFolder(folder1, recursive = true) expectedFileNames = List("file_1.txt", "file_2.csv", "file_3.txt") assert(fileNames === expectedFileNames) // 4: Recursive, full paths: - fileNames = HdfsHelper.listFileNamesInFolder( - "src/test/resources/folder_1", - recursive = true, - onlyName = false) + fileNames = HdfsHelper + .listFileNamesInFolder(folder1, recursive = true, onlyName = false) expectedFileNames = List( - "src/test/resources/folder_1/file_1.txt", - "src/test/resources/folder_1/file_2.csv", - "src/test/resources/folder_1/folder_2/file_3.txt" + s"$folder1/file_1.txt", + s"$folder1/file_2.csv", + s"$folder1/folder_2/file_3.txt" ) assert(fileNames === expectedFileNames) - HdfsHelper.deleteFolder("src/test/resources/folder_1") + HdfsHelper.deleteFolder(folder1) } test("List folder names in Hdfs folder") { - HdfsHelper.writeToHdfsFile("", "src/test/resources/folder_1/file_1.txt") - HdfsHelper - .writeToHdfsFile("", "src/test/resources/folder_1/folder_2/file_2.txt") - HdfsHelper - .writeToHdfsFile("", "src/test/resources/folder_1/folder_3/file_3.txt") + val folder1 = s"$resourceFolder/folder_1" - val folderNames = HdfsHelper.listFolderNamesInFolder( - "src/test/resources/folder_1" - ) + HdfsHelper.createEmptyHdfsFile(s"$folder1/file_1.txt") + HdfsHelper.createEmptyHdfsFile(s"$folder1/folder_2/file_2.txt") + HdfsHelper.createEmptyHdfsFile(s"$folder1/folder_3/file_3.txt") + + val folderNames = HdfsHelper.listFolderNamesInFolder(folder1) val expectedFolderNames = List("folder_2", "folder_3") assert(folderNames === expectedFolderNames) - HdfsHelper.deleteFolder("src/test/resources/folder_1") + HdfsHelper.deleteFolder(folder1) } test("Move file") { + val filePath = s"$testFolder/some_file.txt" + val renamedPath = s"$testFolder/renamed_file.txt" + // Let's remove possible previous stuff: - HdfsHelper.deleteFile("src/test/resources/some_file.txt") - HdfsHelper.deleteFile("src/test/resources/renamed_file.txt") + HdfsHelper.deleteFolder(testFolder) // Let's create the file to rename: - HdfsHelper.writeToHdfsFile("whatever", "src/test/resources/some_file.txt") + HdfsHelper.writeToHdfsFile("whatever", filePath) // 1: Let's try to move the file on a file which already exists without // the overwrite option: - assert(HdfsHelper.fileExists("src/test/resources/some_file.txt")) - assert(!HdfsHelper.fileExists("src/test/resources/renamed_file.txt")) + assert(HdfsHelper.fileExists(filePath)) + assert(!HdfsHelper.fileExists(renamedPath)) // Let's create the existing file where we want to move our file: - HdfsHelper.writeToHdfsFile("", "src/test/resources/renamed_file.txt") + HdfsHelper.createEmptyHdfsFile(renamedPath) // Let's rename the file to the path where a file already exists: val ioExceptionThrown = intercept[IllegalArgumentException] { - HdfsHelper.moveFile( - "src/test/resources/some_file.txt", - "src/test/resources/renamed_file.txt") + HdfsHelper.moveFile(filePath, renamedPath) } var expectedMessage = "requirement failed: overwrite option set to false, but a file 
" + - "already exists at target location src/test/resources/renamed_file.txt" + "already exists at target location " + + "src/test/resources/folder/renamed_file.txt" assert(ioExceptionThrown.getMessage === expectedMessage) - assert(HdfsHelper.fileExists("src/test/resources/some_file.txt")) - assert(HdfsHelper.fileExists("src/test/resources/renamed_file.txt")) + assert(HdfsHelper.fileExists(filePath)) + assert(HdfsHelper.fileExists(renamedPath)) - HdfsHelper.deleteFile("src/test/resources/renamed_file.txt") + HdfsHelper.deleteFile(renamedPath) // 2: Let's fail to move the file with the moveFolder() method: - assert(HdfsHelper.fileExists("src/test/resources/some_file.txt")) - assert(!HdfsHelper.fileExists("src/test/resources/renamed_file.txt")) + assert(HdfsHelper.fileExists(filePath)) + assert(!HdfsHelper.fileExists(renamedPath)) // Let's rename the file: val illegalArgExceptionThrown = intercept[IllegalArgumentException] { - HdfsHelper.moveFolder( - "src/test/resources/some_file.txt", - "src/test/resources/renamed_file.txt") + HdfsHelper.moveFolder(filePath, renamedPath) } expectedMessage = "requirement failed: to move a file, prefer using the " + "moveFile() method." assert(illegalArgExceptionThrown.getMessage === expectedMessage) - assert(HdfsHelper.fileExists("src/test/resources/some_file.txt")) - assert(!HdfsHelper.fileExists("src/test/resources/renamed_file.txt")) + assert(HdfsHelper.fileExists(filePath)) + assert(!HdfsHelper.fileExists(renamedPath)) // 3: Let's successfuly move the file with the moveFile() method: // Let's rename the file: - HdfsHelper.moveFile( - "src/test/resources/some_file.txt", - "src/test/resources/renamed_file.txt") - - assert(!HdfsHelper.fileExists("src/test/resources/some_file.txt")) - assert(HdfsHelper.fileExists("src/test/resources/renamed_file.txt")) + HdfsHelper.moveFile(filePath, renamedPath) - val newContent = sc.textFile("src/test/resources/renamed_file.txt").collect + assert(!HdfsHelper.fileExists(filePath)) + assert(HdfsHelper.fileExists(renamedPath)) + val newContent = sc.textFile(renamedPath).collect assert(Array("whatever") === newContent) - HdfsHelper.deleteFile("src/test/resources/renamed_file.txt") + HdfsHelper.deleteFolder(testFolder) } test("Move folder") { + val folderToMove = s"$testFolder/folder_to_move" + val renamedFolder = s"$testFolder/renamed_folder" + // Let's remove possible previous stuff: - HdfsHelper.deleteFolder("src/test/resources/some_folder_to_move") - HdfsHelper.deleteFolder("src/test/resources/renamed_folder") + HdfsHelper.deleteFolder(testFolder) // Let's create the folder to rename: - HdfsHelper.writeToHdfsFile( - "whatever", - "src/test/resources/some_folder_to_move/file_1.txt") - HdfsHelper.writeToHdfsFile( - "something", - "src/test/resources/some_folder_to_move/file_2.txt") + HdfsHelper.writeToHdfsFile("whatever", s"$folderToMove/file_1.txt") + HdfsHelper.writeToHdfsFile("something", s"$folderToMove/file_2.txt") // 1: Let's fail to move the folder with the moveFile() method: - assert( - HdfsHelper.fileExists( - "src/test/resources/some_folder_to_move/file_1.txt")) - assert( - HdfsHelper.fileExists( - "src/test/resources/some_folder_to_move/file_2.txt")) - assert(!HdfsHelper.folderExists("src/test/resources/renamed_folder")) + assert(HdfsHelper.fileExists(s"$folderToMove/file_1.txt")) + assert(HdfsHelper.fileExists(s"$folderToMove/file_2.txt")) + assert(!HdfsHelper.folderExists(renamedFolder)) // Let's rename the folder: val messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.moveFile( - 
"src/test/resources/some_folder_to_move", - "src/test/resources/renamed_folder") + HdfsHelper.moveFile(folderToMove, renamedFolder) } val expectedMessage = "requirement failed: to move a folder, prefer using the " + "moveFolder() method." assert(messageThrown.getMessage === expectedMessage) - assert( - HdfsHelper.fileExists( - "src/test/resources/some_folder_to_move/file_1.txt")) - assert( - HdfsHelper.fileExists( - "src/test/resources/some_folder_to_move/file_2.txt")) - assert(!HdfsHelper.folderExists("src/test/resources/renamed_folder")) + assert(HdfsHelper.fileExists(s"$folderToMove/file_1.txt")) + assert(HdfsHelper.fileExists(s"$folderToMove/file_2.txt")) + assert(!HdfsHelper.folderExists(renamedFolder)) // 2: Let's successfuly move the folder with the moveFolder() method: // Let's rename the folder: - HdfsHelper.moveFolder( - "src/test/resources/some_folder_to_move", - "src/test/resources/renamed_folder") + HdfsHelper.moveFolder(folderToMove, renamedFolder) - assert(!HdfsHelper.folderExists("src/test/resources/some_folder_to_move")) - assert( - HdfsHelper.fileExists("src/test/resources/renamed_folder/file_1.txt")) - assert( - HdfsHelper.fileExists("src/test/resources/renamed_folder/file_2.txt")) - - val newContent = - sc.textFile("src/test/resources/renamed_folder").collect().sorted + assert(!HdfsHelper.folderExists(folderToMove)) + assert(HdfsHelper.fileExists(s"$renamedFolder/file_1.txt")) + assert(HdfsHelper.fileExists(s"$renamedFolder/file_2.txt")) + val newContent = sc.textFile(renamedFolder).collect().sorted assert(newContent === Array("something", "whatever")) - HdfsHelper.deleteFolder("src/test/resources/renamed_folder") + HdfsHelper.deleteFolder(testFolder) } test("Append header and footer to file") { + val filePath = s"$testFolder/header_footer_file.txt" + val tmpFolder = s"$testFolder/header_footer_tmp" + // 1: Without the tmp/working folder: - HdfsHelper.deleteFile("src/test/resources/header_footer_file.txt") + HdfsHelper.deleteFolder(testFolder) // Let's create the file for which to add header and footer: - HdfsHelper.writeToHdfsFile( - "whatever\nsomething else\n", - "src/test/resources/header_footer_file.txt") + HdfsHelper.writeToHdfsFile("whatever\nsomething else\n", filePath) - HdfsHelper.appendHeaderAndFooter( - "src/test/resources/header_footer_file.txt", - "my_header", - "my_footer") + HdfsHelper.appendHeaderAndFooter(filePath, "my_header", "my_footer") - var newContent = sc - .textFile("src/test/resources/header_footer_file.txt") - .collect - .mkString("\n") + var newContent = sc.textFile(filePath).collect.mkString("\n") - var expectedNewContent = ( + var expectedNewContent = "my_header\n" + "whatever\n" + "something else\n" + "my_footer" - ) assert(newContent === expectedNewContent) - HdfsHelper.deleteFile("src/test/resources/header_footer_file.txt") + HdfsHelper.deleteFile(filePath) // 2: With the tmp/working folder: // Let's create the file for which to add header and footer: - HdfsHelper.writeToHdfsFile( - "whatever\nsomething else\n", - "src/test/resources/header_footer_file.txt") + HdfsHelper.writeToHdfsFile("whatever\nsomething else\n", filePath) - HdfsHelper.appendHeaderAndFooter( - "src/test/resources/header_footer_file.txt", - "my_header", - "my_footer", - workingFolderPath = "src/test/resources/header_footer_tmp") + HdfsHelper + .appendHeaderAndFooter(filePath, "my_header", "my_footer", tmpFolder) - assert(HdfsHelper.folderExists("src/test/resources/header_footer_tmp")) - assert( - !HdfsHelper.fileExists("src/test/resources/header_footer_tmp/xml.tmp")) 
+ assert(HdfsHelper.folderExists(tmpFolder)) + assert(!HdfsHelper.fileExists(s"$tmpFolder/xml.tmp")) - newContent = sc - .textFile("src/test/resources/header_footer_file.txt") - .collect - .mkString("\n") + newContent = sc.textFile(filePath).collect.mkString("\n") - expectedNewContent = ( + expectedNewContent = "my_header\n" + "whatever\n" + "something else\n" + "my_footer" - ) assert(newContent === expectedNewContent) - HdfsHelper.deleteFile("src/test/resources/header_footer_file.txt") - HdfsHelper.deleteFolder("src/test/resources/header_footer_tmp") + HdfsHelper.deleteFolder(testFolder) } test("Validate Xml Hdfs file with Xsd") { + val xmlPath = s"$testFolder/file.xml" + // 1: Valid xml: - HdfsHelper.deleteFile("src/test/resources/xml_file.txt") + HdfsHelper.deleteFolder(testFolder) HdfsHelper.writeToHdfsFile( "\n" + " 24\n" + "
34 thingy street, someplace, sometown
\n" + "
", - "src/test/resources/xml_file.txt" + xmlPath ) - var xsdFile = getClass.getResource("/some_xml.xsd") - - var isValid = HdfsHelper - .isHdfsXmlCompliantWithXsd("src/test/resources/xml_file.txt", xsdFile) - - assert(isValid) + assert(HdfsHelper.isHdfsXmlCompliantWithXsd(xmlPath, xsdFile)) // 2: Invalid xml: - HdfsHelper.deleteFile("src/test/resources/xml_file.txt") + HdfsHelper.deleteFolder(testFolder) HdfsHelper.writeToHdfsFile( "\n" + " trente\n" + "
34 thingy street, someplace, sometown
\n" + "
", - "src/test/resources/xml_file.txt" + xmlPath ) - xsdFile = getClass.getResource("/some_xml.xsd") + assert(!HdfsHelper.isHdfsXmlCompliantWithXsd(xmlPath, xsdFile)) - isValid = HdfsHelper - .isHdfsXmlCompliantWithXsd("src/test/resources/xml_file.txt", xsdFile) - - assert(!isValid) - - HdfsHelper.deleteFile("src/test/resources/xml_file.txt") + HdfsHelper.deleteFolder(testFolder) } test("Load Typesafe Config from Hdfs") { @@ -488,64 +422,59 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { test("Load Xml file from Hdfs") { - HdfsHelper.deleteFile("src/test/resources/folder/xml_to_load.xml") + val xmlPath = s"$testFolder/file.xml" + + HdfsHelper.deleteFolder(testFolder) HdfsHelper.writeToHdfsFile( "\n" + " whatever\n" + "", - "src/test/resources/folder/xml_to_load.xml" + xmlPath ) - val xmlContent = HdfsHelper - .loadXmlFileFromHdfs("src/test/resources/folder/xml_to_load.xml") + val xmlContent = HdfsHelper.loadXmlFileFromHdfs(xmlPath) assert((xmlContent \ "sometag" \ "@value").text === "something") assert((xmlContent \ "sometag").text === "whatever") - HdfsHelper.deleteFolder("src/test/resources/folder/") + HdfsHelper.deleteFolder(testFolder) } test("Purge folder from too old files/folders") { - HdfsHelper.deleteFolder("src/test/resources/folder_to_purge") - HdfsHelper - .createEmptyHdfsFile("src/test/resources/folder_to_purge/file.txt") - HdfsHelper - .createEmptyHdfsFile("src/test/resources/folder_to_purge/folder/file.txt") - assert(HdfsHelper.fileExists("src/test/resources/folder_to_purge/file.txt")) - assert(HdfsHelper.folderExists("src/test/resources/folder_to_purge/folder")) - - HdfsHelper.purgeFolder("src/test/resources/folder_to_purge", 63) + val folderToPurge = s"$testFolder/folder_to_purge" - assert(HdfsHelper.fileExists("src/test/resources/folder_to_purge/file.txt")) - assert(HdfsHelper.folderExists("src/test/resources/folder_to_purge/folder")) + HdfsHelper.deleteFolder(testFolder) + HdfsHelper.createEmptyHdfsFile(s"$folderToPurge/file.txt") + HdfsHelper.createEmptyHdfsFile(s"$folderToPurge/folder/file.txt") + assert(HdfsHelper.fileExists(s"$folderToPurge/file.txt")) + assert(HdfsHelper.folderExists(s"$folderToPurge/folder")) - HdfsHelper.purgeFolder("src/test/resources/folder_to_purge", 1) + HdfsHelper.purgeFolder(folderToPurge, 63) + assert(HdfsHelper.fileExists(s"$folderToPurge/file.txt")) + assert(HdfsHelper.folderExists(s"$folderToPurge/folder")) - assert(HdfsHelper.fileExists("src/test/resources/folder_to_purge/file.txt")) - assert(HdfsHelper.folderExists("src/test/resources/folder_to_purge/folder")) + HdfsHelper.purgeFolder(folderToPurge, 1) + assert(HdfsHelper.fileExists(s"$folderToPurge/file.txt")) + assert(HdfsHelper.folderExists(s"$folderToPurge/folder")) val messageThrown = intercept[IllegalArgumentException] { - HdfsHelper.purgeFolder("src/test/resources/folder_to_purge", -3) + HdfsHelper.purgeFolder(folderToPurge, -3) } val expectedMessage = "requirement failed: the purgeAge provided \"-3\" must be superior to 0." 
assert(messageThrown.getMessage === expectedMessage) - HdfsHelper.purgeFolder("src/test/resources/folder_to_purge", 0) + HdfsHelper.purgeFolder(folderToPurge, 0) + assert(!HdfsHelper.fileExists(s"$folderToPurge/file.txt")) + assert(!HdfsHelper.folderExists(s"$folderToPurge/folder")) - assert( - !HdfsHelper.fileExists("src/test/resources/folder_to_purge/file.txt")) - assert( - !HdfsHelper.folderExists("src/test/resources/folder_to_purge/folder")) - - HdfsHelper.deleteFolder("src/test/resources/folder_to_purge") + HdfsHelper.deleteFolder(testFolder) } test("Compress hdfs file") { - val testFolder = s"$resourceFolder/folder" val filePath = s"$testFolder/file.txt" HdfsHelper.deleteFile(filePath) From 21bbb8041e9f381a536d916f1bae258c8c0f8453 Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Tue, 12 Jun 2018 22:22:56 +0100 Subject: [PATCH 17/25] Light refactoring of DateHelperTest --- .../com/spark_helper/DateHelperTest.scala | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/test/scala/com/spark_helper/DateHelperTest.scala b/src/test/scala/com/spark_helper/DateHelperTest.scala index 7154831..6d27b81 100644 --- a/src/test/scala/com/spark_helper/DateHelperTest.scala +++ b/src/test/scala/com/spark_helper/DateHelperTest.scala @@ -2,6 +2,8 @@ package com.spark_helper import org.scalatest.FunSuite +import com.spark_helper.{DateHelper => DH} + /** Testing facility for date helpers. * * @author Xavier Guihot @@ -37,25 +39,20 @@ class DateHelperTest extends FunSuite { } test("Reformat date") { - assert( - DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") === "170327") - assert( - DateHelper.reformatDate("20170327", "yyyyMMdd", "MMddyy") === "032717") + assert(DH.reformatDate("20170327", "yyyyMMdd", "yyMMdd") === "170327") + assert(DH.reformatDate("20170327", "yyyyMMdd", "MMddyy") === "032717") } test("Next day") { - assert(DateHelper.nextDay("20170310") === "20170311") - assert(DateHelper.nextDay("170310", "yyMMdd") === "170311") - assert( - DateHelper.nextDay("20170310_0000", "yyyyMMdd_HHmm") === "20170311_0000") + assert(DH.nextDay("20170310") === "20170311") + assert(DH.nextDay("170310", "yyMMdd") === "170311") + assert(DH.nextDay("20170310_0000", "yyyyMMdd_HHmm") === "20170311_0000") } test("Previous day") { - assert(DateHelper.previousDay("20170310") === "20170309") - assert(DateHelper.previousDay("170310", "yyMMdd") === "170309") - assert( - DateHelper - .previousDay("20170310_0000", "yyyyMMdd_HHmm") === "20170309_0000") + assert(DH.previousDay("20170310") === "20170309") + assert(DH.previousDay("170310", "yyMMdd") === "170309") + assert(DH.previousDay("20170310_0000", "yyyyMMdd_HHmm") === "20170309_0000") } test("Nbr of days between two dates") { From 75eadc82a2c6b70bd556ee6156c453e43889c9c5 Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Tue, 12 Jun 2018 23:03:02 +0100 Subject: [PATCH 18/25] Pimp Seq[String] and String with a writeToHdfs method --- README.md | 8 ++-- .../scala/com/spark_helper/HdfsHelper.scala | 39 ++++++++++++++++++- .../com/spark_helper/HdfsHelperTest.scala | 28 +++++++++++-- .../com/spark_helper/SparkHelperTest.scala | 4 +- 4 files changed, 67 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index e36dad4..c17d463 100644 --- a/README.md +++ b/README.md @@ -18,14 +18,14 @@ names are self-explanatory and readable. This also provides a monitoring/logger tool. 
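To make the boilerplate reduction concrete, here is a minimal sketch (the path is a placeholder) of deleting an hdfs file with the raw Hadoop FileSystem API versus with the helper:

```scala
// Raw Hadoop FileSystem API:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

FileSystem.get(new Configuration()).delete(new Path("my/hdfs/file/path.csv"), true)

// Same thing with the helper:
import com.spark_helper.HdfsHelper

HdfsHelper.deleteFile("my/hdfs/file/path.csv")
```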
-This is a bunch of 4 modules: +This is a set of 4 modules: -* [HdfsHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.HdfsHelper$): Wrapper around [apache Hadoop FileSystem API](https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileSystem.html) for file manipulations on hdfs. -* [SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$): Hdfs file manipulations through the Spark API. +* [HdfsHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.HdfsHelper$): Wrapper around the [apache Hadoop FileSystem API](https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileSystem.html) for file manipulations on hdfs. +* [SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$): Hdfs file manipulations through the Spark API (pimped RDDs and SparkContext). * [DateHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.DateHelper$): Wrapper around [joda-time](http://www.joda.org/joda-time/apidocs/) for usual data mining dates manipulations. * [Monitor](http://xavierguihot.com/spark_helper/#com.spark_helper.Monitor$): Spark custom monitoring/logger and kpi validator. -Compatible with Spark 2. +Compatible with Spark 2.x ### HdfsHelper: diff --git a/src/main/scala/com/spark_helper/HdfsHelper.scala b/src/main/scala/com/spark_helper/HdfsHelper.scala index a9038bd..82d7f3e 100644 --- a/src/main/scala/com/spark_helper/HdfsHelper.scala +++ b/src/main/scala/com/spark_helper/HdfsHelper.scala @@ -6,6 +6,8 @@ import org.apache.hadoop.io.compress.{CompressionCodec, CompressionCodecFactory} import org.apache.hadoop.io.compress.{GzipCodec, BZip2Codec} import org.apache.hadoop.io.IOUtils +import scala.reflect.ClassTag + import org.joda.time.{DateTime, Days} import org.joda.time.format.DateTimeFormat @@ -77,6 +79,41 @@ import com.typesafe.config.{Config, ConfigFactory} */ object HdfsHelper extends Serializable { + implicit class SeqExtensions[T <: Seq[String]: ClassTag](val seq: T) { + + /** Saves list elements in a file on hdfs. + * + * Please only consider this way of storing data when the data set is small + * enough. + * + * Overwrites the file if it already exists. + * + * {{{ + * Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt") + * List("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt") + * }}} + * + * @param filePath the path of the file in which to write the content of + * the List. + */ + def writeToHdfs(filePath: String): Unit = + HdfsHelper.writeToHdfsFile(seq, filePath) + } + + implicit class StringExtensions(val string: String) { + + /** Saves the String in a file on hdfs. + * + * Overwrites the file if it already exists. + * + * {{{ "some\nrelatively small\ntext".writeToHdfsFile("/some/hdfs/file/path.txt") }}} + * + * @param filePath the path of the file in which to write the String + */ + def writeToHdfs(filePath: String): Unit = + HdfsHelper.writeToHdfsFile(string, filePath) + } + /** Deletes a file on HDFS. * * Doesn't throw an exception if the file to delete doesn't exist. @@ -311,7 +348,7 @@ object HdfsHelper extends Serializable { * List("some", "relatively small", "text"), "/some/hdfs/file/path.txt") * }}} * - * @param content the array of strings to write in the file as one line per + * @param content the seq of strings to write in the file as one line per * string (this takes care of joining strings with "\n"s). 
* @param filePath the path of the file in which to write the content */ diff --git a/src/test/scala/com/spark_helper/HdfsHelperTest.scala b/src/test/scala/com/spark_helper/HdfsHelperTest.scala index 9babb0b..f96db97 100644 --- a/src/test/scala/com/spark_helper/HdfsHelperTest.scala +++ b/src/test/scala/com/spark_helper/HdfsHelperTest.scala @@ -1,5 +1,7 @@ package com.spark_helper +import com.spark_helper.HdfsHelper._ + import org.apache.hadoop.io.compress.GzipCodec import com.holdenkarau.spark.testing.SharedSparkContext @@ -127,9 +129,9 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { val filePath = s"$testFolder/small_file.txt" - // 1: Stores using a "\n"-joined string: + HdfsHelper.deleteFolder(testFolder) - HdfsHelper.deleteFile(filePath) + // 1: Stores using a "\n"-joined string: val contentToStore = "Hello World\nWhatever" @@ -144,8 +146,6 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { // 2: Stores using a list of strings to be "\n"-joined: - HdfsHelper.deleteFile(filePath) - val listToStore = List("Hello World", "Whatever") HdfsHelper.writeToHdfsFile(listToStore, filePath) @@ -155,6 +155,26 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { assert(storedContent === listToStore.mkString("\n")) HdfsHelper.deleteFolder(testFolder) + + // 3: Using the pimped Seq/String: + + listToStore.toSeq.writeToHdfs(filePath) + assert(HdfsHelper.fileExists(filePath)) + storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") + assert(storedContent === contentToStore) + HdfsHelper.deleteFolder(testFolder) + + listToStore.writeToHdfs(filePath) + assert(HdfsHelper.fileExists(filePath)) + storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") + assert(storedContent === contentToStore) + HdfsHelper.deleteFolder(testFolder) + + contentToStore.writeToHdfs(filePath) + assert(HdfsHelper.fileExists(filePath)) + storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") + assert(storedContent === contentToStore) + HdfsHelper.deleteFolder(testFolder) } test("List file names in Hdfs folder") { diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index 1ccfba4..21d01e0 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -1,8 +1,6 @@ package com.spark_helper -import com.spark_helper.SparkHelper.{RDDExtensions, StringRDDExtensions} -import com.spark_helper.SparkHelper.{SeqRDDExtensions, OptionRDDExtensions} -import com.spark_helper.SparkHelper.{SparkContextExtensions, PairRDDExtensions} +import com.spark_helper.SparkHelper._ import org.apache.hadoop.io.compress.GzipCodec From 889d5e53caab48de427cb1a5ea2fe64f06d8592b Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Wed, 13 Jun 2018 22:27:12 +0100 Subject: [PATCH 19/25] Make a singleton out of HdfsHelper by allowing setting a specific Configuration or FileSystem --- .../scala/com/spark_helper/HdfsHelper.scala | 141 ++++++++---------- 1 file changed, 66 insertions(+), 75 deletions(-) diff --git a/src/main/scala/com/spark_helper/HdfsHelper.scala b/src/main/scala/com/spark_helper/HdfsHelper.scala index 82d7f3e..694131e 100644 --- a/src/main/scala/com/spark_helper/HdfsHelper.scala +++ b/src/main/scala/com/spark_helper/HdfsHelper.scala @@ -79,6 +79,33 @@ import com.typesafe.config.{Config, ConfigFactory} */ object HdfsHelper extends Serializable { + private var conf = new Configuration() + private var hdfs = FileSystem.get(conf) + + 
/** Sets a specific Configuration + * used by the underlying FileSystem + * in case it requires some specificities. + * + * If this setter is not used, the default Configuration is set with + * new Configuration(). + * + * @param conf the specific Configuration to use + */ + def setConf(configuration: Configuration): Unit = { + conf = configuration + hdfs = FileSystem.get(configuration) + } + + /** Sets a specific FileSystem + * in case it requires some specificities. + * + * If this setter is not used, the default FileSystem is set with + * FileSystem.get(new Configuration()). + * + * @param fileSystem the specific FileSystem to use + */ + def setFileSystem(fileSystem: FileSystem): Unit = hdfs = fileSystem + implicit class SeqExtensions[T <: Seq[String]: ClassTag](val seq: T) { /** Saves list elements in a file on hdfs. @@ -122,17 +149,15 @@ object HdfsHelper extends Serializable { */ def deleteFile(hdfsPath: String): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val fileToDelete = new Path(hdfsPath) - if (fileSystem.exists(fileToDelete)) { + if (hdfs.exists(fileToDelete)) { require( - fileSystem.isFile(fileToDelete), + hdfs.isFile(fileToDelete), "to delete a folder, prefer using the deleteFolder() method.") - fileSystem.delete(fileToDelete, true) + hdfs.delete(fileToDelete, true) } } @@ -144,17 +169,15 @@ object HdfsHelper extends Serializable { */ def deleteFolder(hdfsPath: String): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val folderToDelete = new Path(hdfsPath) - if (fileSystem.exists(folderToDelete)) { + if (hdfs.exists(folderToDelete)) { require( - !fileSystem.isFile(folderToDelete), + !hdfs.isFile(folderToDelete), "to delete a file, prefer using the deleteFile() method.") - fileSystem.delete(folderToDelete, true) + hdfs.delete(folderToDelete, true) } } @@ -164,8 +187,7 @@ object HdfsHelper extends Serializable { * * @param hdfsPath the path of the folder to create */ - def createFolder(hdfsPath: String): Unit = - FileSystem.get(new Configuration()).mkdirs(new Path(hdfsPath)) + def createFolder(hdfsPath: String): Unit = hdfs.mkdirs(new Path(hdfsPath)) /** Checks if the file exists. * @@ -174,16 +196,14 @@ object HdfsHelper extends Serializable { */ def fileExists(hdfsPath: String): Boolean = { - val fileSystem = FileSystem.get(new Configuration()) - val fileToCheck = new Path(hdfsPath) - if (fileSystem.exists(fileToCheck)) + if (hdfs.exists(fileToCheck)) require( - fileSystem.isFile(fileToCheck), + hdfs.isFile(fileToCheck), "to check if a folder exists, prefer using the folderExists() method.") - fileSystem.exists(fileToCheck) + hdfs.exists(fileToCheck) } /** Checks if the folder exists. @@ -193,16 +213,14 @@ object HdfsHelper extends Serializable { */ def folderExists(hdfsPath: String): Boolean = { - val fileSystem = FileSystem.get(new Configuration()) - val folderToCheck = new Path(hdfsPath) - if (fileSystem.exists(folderToCheck)) + if (hdfs.exists(folderToCheck)) require( - !fileSystem.isFile(folderToCheck), + !hdfs.isFile(folderToCheck), "to check if a file exists, prefer using the fileExists() method.") - fileSystem.exists(folderToCheck) + hdfs.exists(folderToCheck) } /** Moves/renames a file. 
@@ -221,21 +239,19 @@ object HdfsHelper extends Serializable { overwrite: Boolean = false ): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val fileToRename = new Path(oldPath) val renamedFile = new Path(newPath) - if (fileSystem.exists(fileToRename)) + if (hdfs.exists(fileToRename)) require( - fileSystem.isFile(fileToRename), + hdfs.isFile(fileToRename), "to move a folder, prefer using the moveFolder() method.") if (overwrite) - fileSystem.delete(renamedFile, true) + hdfs.delete(renamedFile, true) else require( - !fileSystem.exists(renamedFile), + !hdfs.exists(renamedFile), "overwrite option set to false, but a file already exists at target " + "location " + newPath) @@ -244,7 +260,7 @@ object HdfsHelper extends Serializable { val targetContainerFolder = newPath.split("/").init.mkString("/") createFolder(targetContainerFolder) - fileSystem.rename(fileToRename, renamedFile) + hdfs.rename(fileToRename, renamedFile) } /** Moves/renames a folder. @@ -263,21 +279,19 @@ object HdfsHelper extends Serializable { overwrite: Boolean = false ): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val folderToRename = new Path(oldPath) val renamedFolder = new Path(newPath) - if (fileSystem.exists(folderToRename)) + if (hdfs.exists(folderToRename)) require( - !fileSystem.isFile(folderToRename), + !hdfs.isFile(folderToRename), "to move a file, prefer using the moveFile() method.") if (overwrite) - fileSystem.delete(renamedFolder, true) + hdfs.delete(renamedFolder, true) else require( - !fileSystem.exists(renamedFolder), + !hdfs.exists(renamedFolder), "overwrite option set to false, but a folder already exists at target " + "location " + newPath) @@ -286,7 +300,7 @@ object HdfsHelper extends Serializable { val targetContainerFolder = newPath.split("/").init.mkString("/") createFolder(targetContainerFolder) - fileSystem.rename(folderToRename, new Path(newPath)) + hdfs.rename(folderToRename, new Path(newPath)) } /** Creates an empty file on hdfs. @@ -309,7 +323,7 @@ object HdfsHelper extends Serializable { * @param filePath the path of the empty file to create */ def createEmptyHdfsFile(filePath: String): Unit = - FileSystem.get(new Configuration()).create(new Path(filePath)).close() + hdfs.create(new Path(filePath)).close() /** Saves text in a file when content is too small to really require an RDD. 
* @@ -326,10 +340,7 @@ object HdfsHelper extends Serializable { * @param filePath the path of the file in which to write the content */ def writeToHdfsFile(content: String, filePath: String): Unit = { - - val outputFile = - FileSystem.get(new Configuration()).create(new Path(filePath)) - + val outputFile = hdfs.create(new Path(filePath)) outputFile.write(content.getBytes("UTF-8")) outputFile.close() } @@ -374,8 +385,7 @@ object HdfsHelper extends Serializable { onlyName: Boolean = true ): List[String] = { - FileSystem - .get(new Configuration()) + hdfs .listStatus(new Path(hdfsPath)) .flatMap { status => // If it's a file: @@ -408,8 +418,7 @@ object HdfsHelper extends Serializable { * @return the list of folder names in the specified folder */ def listFolderNamesInFolder(hdfsPath: String): List[String] = - FileSystem - .get(new Configuration()) + hdfs .listStatus(new Path(hdfsPath)) .filter(!_.isFile) .map(_.getPath.getName) @@ -423,11 +432,7 @@ object HdfsHelper extends Serializable { * @return the joda DateTime of the last modification of the given file */ def fileModificationDateTime(hdfsPath: String): DateTime = - new DateTime( - FileSystem - .get(new Configuration()) - .getFileStatus(new Path(hdfsPath)) - .getModificationTime()) + new DateTime(hdfs.getFileStatus(new Path(hdfsPath)).getModificationTime()) /** Returns the stringified date of the last modification of the given file. * @@ -599,9 +604,7 @@ object HdfsHelper extends Serializable { */ def validateHdfsXmlWithXsd(hdfsXmlPath: String, xsdFile: URL): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - - val xmlFile = new StreamSource(fileSystem.open(new Path(hdfsXmlPath))) + val xmlFile = new StreamSource(hdfs.open(new Path(hdfsXmlPath))) val schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI) @@ -644,10 +647,7 @@ object HdfsHelper extends Serializable { * @return the com.typesafe.config.Config object which contains usable data */ def loadTypesafeConfigFromHdfs(hdfsConfigPath: String): Config = { - - val reader = new InputStreamReader( - FileSystem.get(new Configuration()).open(new Path(hdfsConfigPath))) - + val reader = new InputStreamReader(hdfs.open(new Path(hdfsConfigPath))) try { ConfigFactory.parseReader(reader) } finally { reader.close() } } @@ -660,10 +660,7 @@ object HdfsHelper extends Serializable { * @return the scala.xml.Elem object */ def loadXmlFileFromHdfs(hdfsXmlPath: String): Elem = { - - val reader = new InputStreamReader( - FileSystem.get(new Configuration()).open(new Path(hdfsXmlPath))) - + val reader = new InputStreamReader(hdfs.open(new Path(hdfsXmlPath))) try { XML.load(reader) } finally { reader.close() } } @@ -692,8 +689,6 @@ object HdfsHelper extends Serializable { deleteInputFile: Boolean = true ): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val ClassOfGzip = classOf[GzipCodec] val ClassOfBZip2 = classOf[BZip2Codec] @@ -702,12 +697,11 @@ object HdfsHelper extends Serializable { case ClassOfBZip2 => s"$inputPath.bz2" } - val inputStream = fileSystem.open(new Path(inputPath)) - val outputStream = fileSystem.create(new Path(outputPath)) + val inputStream = hdfs.open(new Path(inputPath)) + val outputStream = hdfs.create(new Path(outputPath)) // The compression code: - val codec = new CompressionCodecFactory(new Configuration()) - .getCodec(new Path(outputPath)) + val codec = new CompressionCodecFactory(conf).getCodec(new Path(outputPath)) // We include the compression codec to the output stream: val compressedOutputStream = 
codec.createOutputStream(outputStream) @@ -715,7 +709,7 @@ object HdfsHelper extends Serializable { IOUtils.copyBytes( inputStream, compressedOutputStream, - new Configuration(), + conf, false ) } finally { @@ -746,8 +740,7 @@ object HdfsHelper extends Serializable { purgeAge >= 0, "the purgeAge provided \"" + purgeAge.toString + "\" must be superior to 0.") - FileSystem - .get(new Configuration()) + hdfs .listStatus(new Path(folderPath)) .filter(path => { @@ -781,22 +774,20 @@ object HdfsHelper extends Serializable { workingFolderPath: String ): Unit = { - val fileSystem = FileSystem.get(new Configuration()) - val tmpOutputPath = workingFolderPath match { case "" => s"$filePath.tmp" case _ => s"$workingFolderPath/xml.tmp" } deleteFile(tmpOutputPath) - val inputFile = fileSystem.open(new Path(filePath)) - val tmpOutputFile = fileSystem.create(new Path(tmpOutputPath)) + val inputFile = hdfs.open(new Path(filePath)) + val tmpOutputFile = hdfs.create(new Path(tmpOutputPath)) // If there is an header, we add it to the file: header.foreach(h => tmpOutputFile.write((h + "\n").getBytes("UTF-8"))) try { - IOUtils.copyBytes(inputFile, tmpOutputFile, new Configuration(), false) + IOUtils.copyBytes(inputFile, tmpOutputFile, conf, false) } finally { inputFile.close() } From 5351664daa9e164e57e0c8ed69b8dd4d9e2db4fc Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Thu, 14 Jun 2018 21:01:32 +0100 Subject: [PATCH 20/25] Improve README --- README.md | 66 +++++++++++++------ .../scala/com/spark_helper/HdfsHelper.scala | 1 + .../scala/com/spark_helper/SparkHelper.scala | 21 +++--- 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index c17d463..5d6fed8 100644 --- a/README.md +++ b/README.md @@ -36,21 +36,21 @@ The full list of methods is available at Contains basic file-related methods mostly based on hdfs apache Hadoop FileSystem API [org.apache.hadoop.fs.FileSystem](https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileSystem.html). -For instance, one don't want to remove a file from hdfs using 3 lines of code -and thus could instead just use `HdfsHelper.deleteFile("my/hdfs/file/path.csv")`. 
- A non-exhaustive list of exemples: ```scala import com.spark_helper.HdfsHelper // A bunch of methods wrapping the FileSystem API, such as: -HdfsHelper.fileExists("my/hdfs/file/path.txt") +HdfsHelper.fileExists("my/hdfs/file/path.txt") // HdfsHelper.folderExists("my/hdfs/folder") assert(HdfsHelper.listFileNamesInFolder("my/folder/path") == List("file_name_1.txt", "file_name_2.csv")) assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306") assert(HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") == 3) -HdfsHelper.deleteFile("my/hdfs/file/path.csv") -HdfsHelper.moveFolder("my/hdfs/folder") +HdfsHelper.deleteFile("my/hdfs/file/path.csv") // HdfsHelper.deleteFolder("my/hdfs/folder") +HdfsHelper.moveFolder("my/hdfs/folder") // HdfsHelper.moveFile("my/hdfs/file.txt") +HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token") // HdfsHelper.createFolder("my/hdfs/folder") + +// File content helpers: HdfsHelper.compressFile("hdfs/path/to/uncompressed_file.txt", classOf[GzipCodec]) HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1") @@ -63,41 +63,67 @@ HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config // In order to write small amount of data in a file on hdfs without the whole spark stack: HdfsHelper.writeToHdfsFile(Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt") +// or: +import com.spark_helper.HdfsHelper._ +Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt") +"hello world".writeToHdfs("/some/hdfs/file/path.txt") // Deletes all files/folders in "hdfs/path/to/folder" for which the timestamp is older than 10 days: HdfsHelper.purgeFolder("hdfs/path/to/folder", 10) ``` +In case a specific configuration is needed to access the file system, these +setters are available: + +```scala +// To use a specific conf FileSystem.get(whateverConf) instead of FileSystem.get(new Configuration()): +HdfsHelper.setConf(whateverConf) +// Or directly the FileSystem: +HdfsHelper.setFileSystem(whateverFileSystem) +``` + ### SparkHelper: The full list of methods is available at [SparkHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.SparkHelper$). -Contains basic file/RRD-related methods based on the Spark APIs. +Contains basic RRD-related methods. A non-exhaustive list of exemples: ```scala -import com.spark_helper.SparkHelper +import com.spark_helper.SparkHelper._ -// Same as SparkContext.saveAsTextFile, but the result is a single file: -SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt") +// Same as rdd.saveAsTextFile("path"), but the result is a single file (while +// keeping the processing parallelized): +rdd.saveAsSingleTextFile("/my/output/file/path.txt") +rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec]) -// Same as SparkContext.textFile, but instead of reading one record per line, -// it reads records spread over several lines. This way, xml, json, yml or -// any multi-line record file format can be used with Spark: -SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n") +// Same as sc.textFile("path"), but instead of reading one record per line (by +// splitting the input with \n), it splits the file in records based on a custom +// delimiter. 
This way, xml, json, yml or any multi-line record file format can +// be used with Spark: +sc.textFile("/my/input/folder/path", "---\n") + +// Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options: +rdd.flatten // Equivalent to sparkContext.textFile(), but for each line is tupled with its // file path: SparkHelper.textFileWithFileName("folder", sparkContext) // which produces: -RDD( - ("file:/path/on/machine/folder/file_1.txt", "record1fromfile1"), - ("file:/path/on/machine/folder/file_1.txt", "record2fromfile1"), - ("file:/path/on/machine/folder/file_2.txt", "record1fromfile2"), - ... -) +// RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"), +// ("folder/file_2.txt", "record1fromfile2"), ...) + +// In the given folder, this generates one file per key in the given key/value +// RDD. Within each file (named from the key) are all values for this key: +rdd.saveAsTextFileByKey("/my/output/folder/path") + +// Concept mapper (the following exemple transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)): +rdd.partialMap { case a if a % 2 == 0 => 2 * a } + +// For when input files contain commas and textFile can't handle it: +sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) ``` ### DateHelper: diff --git a/src/main/scala/com/spark_helper/HdfsHelper.scala b/src/main/scala/com/spark_helper/HdfsHelper.scala index 694131e..920c905 100644 --- a/src/main/scala/com/spark_helper/HdfsHelper.scala +++ b/src/main/scala/com/spark_helper/HdfsHelper.scala @@ -74,6 +74,7 @@ import com.typesafe.config.{Config, ConfigFactory} * Source HdfsHelper * + * @todo Create a touch method * @author Xavier Guihot * @since 2017-02 */ diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 075c22f..336ff4a 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -512,17 +512,16 @@ object SparkHelper extends Serializable { iterator.map(tpl => (file.getPath.toString, tpl._2.toString)) } - /* An other way of doing would be: - * - * import org.apache.spark.sql.functions.input_file_name - * import spark.implicits._ - * - * spark.read - * .text(testFolder) - * .select(input_file_name, $"value") - * .as[(String, String)] - * .rdd - */ + // An other way of doing would be: + // + // import org.apache.spark.sql.functions.input_file_name + // import spark.implicits._ + // + // spark.read + // .text(testFolder) + // .select(input_file_name, $"value") + // .as[(String, String)] + // .rdd } /** A replacement for sc.textFile() From b812cc1e70308cd931697ea2a0119360da768222 Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Thu, 14 Jun 2018 23:06:22 +0100 Subject: [PATCH 21/25] Pimp String/Int with DateHelper functions - refactor DateHelper/README --- README.md | 39 ++- .../scala/com/spark_helper/DateHelper.scala | 304 ++++++++++++++---- .../com/spark_helper/DateHelperTest.scala | 12 + 3 files changed, 290 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index 5d6fed8..9ec5927 100644 --- a/README.md +++ b/README.md @@ -132,21 +132,42 @@ The full list of methods is available at [DateHelper](http://xavierguihot.com/spark_helper/#com.spark_helper.DateHelper$). Wrapper around [joda-time](http://www.joda.org/joda-time/apidocs/) for -data-mining classic dates manipulations. +data-mining classic dates manipulations and job scheduling. 
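For instance, a daily job that (re)processes a trailing window of dates could combine these helpers as follows (a sketch; the 3-day window is arbitrary):

```scala
import com.spark_helper.DateHelper

// All dates from 3 days ago up to today, under the default "yyyyMMdd" format:
val datesToProcess = DateHelper.daysBetween(DateHelper.nDaysBefore(3), DateHelper.today())
// e.g. List("20170307", "20170308", "20170309", "20170310") if today is "20170310"
```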
A non-exhaustive list of exemples: ```scala import com.spark_helper.DateHelper -assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101")) -assert(DateHelper.today() == "20170310") // If today's "20170310" -assert(DateHelper.yesterday() == "20170309") // If today's "20170310" -assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327") -assert(DateHelper.now("HH:mm") == "10:24") -assert(DateHelper.currentTimestamp() == "1493105229736") -assert(DateHelper.nDaysBefore(3) == "20170307") // If today's "20170310" -assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310") +DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101") +DateHelper.today // "20170310" +DateHelper.yesterday // "20170309" +DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327" +DateHelper.now("HH:mm") // "10:24" +DateHelper.currentTimestamp // "1493105229736" +DateHelper.nDaysBefore(3) // "20170307" +DateHelper.nDaysAfterDate(3, "20170307") // "20170310" +DateHelper.nextDay("20170310") // "20170311" +DateHelper.nbrOfDaysSince("20170302") // 8 +DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5 +DateHelper.dayOfWeek("20160614") // 2 + +import com.spark_helper.DateHelper._ + +2.daysAgo // "20170308" +3.daysBefore("20170310") // "20170307" +5.daysAfter // "20170315" +4.daysAfter("20170310") // "20170314" +"20170302".isCompliantWith("yyyyMMdd") +"20170310".nextDay // "20170311" +"20170310".previousDay // "20170309" +``` + +The default format (when no format is specified) is "yyyyMMdd" (20170327). It +can be modified globally with: + +```scala +DateHelper.setFormat("ddMMMyy") ``` ### Monitor: diff --git a/src/main/scala/com/spark_helper/DateHelper.scala b/src/main/scala/com/spark_helper/DateHelper.scala index c2af405..9d11627 100644 --- a/src/main/scala/com/spark_helper/DateHelper.scala +++ b/src/main/scala/com/spark_helper/DateHelper.scala @@ -28,11 +28,126 @@ import scala.util.Try * Source DateHelper * + * @todo Is it possible to ddo something like ("20170325" to "20170327")? * @author Xavier Guihot * @since 2017-02 */ object DateHelper extends Serializable { + private var defaultFormat = "yyyyMMdd" + + /** Sets the default date format used by these functions when no date format + * is specified. + * + * {{{ + * // By default, yyyyMMdd is used: + * assert(3.daysBefore == "20170307") + * // But this can be modified globally: + * DateHelper.setFormat("ddMMMyy") + * assert(3.daysBefore == "07Mar17") + * }}} + * + * @param format the new default format + */ + def setFormat(format: String): Unit = defaultFormat = format + + implicit class IntExtensions(val int: Int) { + + /** Returns which date it was x days before today under the default format. + * + * If we're "20170125" and we request for 3 days before, we'll return + * "20170122". + * + * {{{ + * // If today's "20170310": + * assert(3.daysAgo == "20170307") + * }}} + * + * @return today's date minus the given nbr of days + */ + def daysAgo(): String = DateHelper.nDaysBefore(int) + + /** Returns which date it was x days before the given date. + * + * If the given date is "20170125" and we request the date it was 3 days + * before, this will return "20170122". + * + * {{{ assert(3.daysBefore("20170310") == "20170307") }}} + * + * @param date the date under the default format for which we want the date + * for nbrOfDaysBefore days before. + * @return the date it was nbrOfDaysBefore before date under the default + * format. 
+ */ + def daysBefore(date: String): String = DateHelper.nDaysBeforeDate(int, date) + + /** Returns which date it will be x days after today under the default format. + * + * If we're "20170125" and we request for 3 days after, we'll return + * "20170127". + * + * {{{ + * // If today's "20170310": + * assert(3.daysAfter == "20170313") + * }}} + * + * @return today's date plus the given nbr of days + */ + def daysAfter(): String = DateHelper.nDaysAfter(int) + + /** Returns which date it will be x days after the given date under the + * default format. + * + * If the given date is "20170122" and we request the date it will be 3 + * days after, we'll return "20170125". + * + * {{{ assert(5.daysAfter("20170305") == "20170310") }}} + * + * @param date the date under the default format for which we want the date + * for nbrOfDaysAfter days after. + * @return the date it was nbrOfDaysAfter after date under the default + * format. + */ + def daysAfter(date: String): String = DateHelper.nDaysAfterDate(int, date) + } + + implicit class StringExtensions(val string: String) { + + /** Validates the stringified date is compliant with the provided format. + * + * {{{ + * assert("20170302".isCompliantWith("yyyyMMdd")) + * assert(!"20170333".isCompliantWith("yyyyMMdd")) + * assert("20170228".isCompliantWith("yyyyMMdd")) + * assert(!"20170229".isCompliantWith("yyyyMMdd")) + * assert(!"170228".isCompliantWith("yyyyMMdd")) + * assert(!"".isCompliantWith("yyyyMMdd")) + * assert(!"a".isCompliantWith("yyyyMMdd")) + * assert(!"24JAN17".isCompliantWith("yyyyMMdd")) + * }}} + * + * @return if the provided date is under the provided format + */ + def isCompliantWith(format: String): Boolean = + DateHelper.isDateCompliantWithFormat(string, format) + + /** Returns the date one day after the given date. + * + * {{{ assert("20170310".nextDay == "20170311") }}} + * + * @return the date of the day after the given date + */ + def nextDay(): String = DateHelper.nextDay(string) + + /** Returns the date one day before the given date. + * + * {{{ assert("20170310".previousDay == "20170309") }}} + * + * @return the date of the day before the given date + */ + def previousDay(): String = DateHelper.previousDay(string) + } + /** Finds the list of dates between the two given dates. * * {{{ @@ -84,47 +199,78 @@ object DateHelper extends Serializable { } /** Returns which date it was x days before today under the requested format. - * - * If we're "20170125" and we request for 3 days before, we'll return - * "20170122". * * {{{ * // If today's "20170310": - * assert(DateHelper.nDaysBefore(3) == "20170307") * assert(DateHelper.nDaysBefore(5, "yyMMdd") == "170305") * }}} * * @param nbrOfDaysBefore the nbr of days before today - * @param format (default = "yyyyMMdd") the format for the returned date + * @param format the format for the returned date * @return today's date minus the nbrOfDaysBefore under the requested format */ - def nDaysBefore(nbrOfDaysBefore: Int, format: String = "yyyyMMdd"): String = + def nDaysBefore(nbrOfDaysBefore: Int, format: String): String = DateTimeFormat .forPattern(format) .print(new DateTime().minusDays(nbrOfDaysBefore)) - /** Returns which date it was x days before the given date. + /** Returns which date it was x days before today. * - * If the given date is "20170125" and we request the date it was 3 days - * before, we'll return "20170122". 
+ * {{{ + * // If today's "20170310": + * assert(DateHelper.nDaysBefore(5) == "20170305") + * }}} + * + * @param nbrOfDaysBefore the nbr of days before today + * @return today's date minus the nbrOfDaysBefore under the default format + */ + def nDaysBefore(nbrOfDaysBefore: Int): String = + nDaysBefore(nbrOfDaysBefore, defaultFormat) + + /** Returns which date it will be x days after today under the requested format. + * + * {{{ + * // If today's "20170310": + * assert(DateHelper.nDaysAfter(5, "yyMMdd") == "170315") + * }}} + * + * @param nbrOfDaysAfter the nbr of days after today + * @param format the format for the returned date + * @return today's date plus the nbrOfDaysAfter under the requested format + */ + def nDaysAfter(nbrOfDaysAfter: Int, format: String): String = + nDaysBefore(-nbrOfDaysAfter, format) + + /** Returns which date it will be x days after today under the default format. * * {{{ - * assert(DateHelper.nDaysBeforeDate(3, "20170310") == "20170307") - * assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") == "170305") + * // If today's "20170310": + * assert(DateHelper.nDaysAfter(5) == "20170315") * }}} * + * @param nbrOfDaysAfter the nbr of days after today + * @return today's date plus the nbrOfDaysAfter under the default format + */ + def nDaysAfter(nbrOfDaysAfter: Int): String = nDaysBefore(-nbrOfDaysAfter) + + /** Returns which date it was x days before the given date. + * + * If the given date is "20170125" and we request the date it was 3 days + * before, this will return "20170122". + * + * {{{ assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") == "170305") }}} + * * @param nbrOfDaysBefore the nbr of days before the given date * @param date the date under the provided format for which we want the date * for nbrOfDaysBefore days before. - * @param format (default = "yyyyMMdd") the format for the provided and - * returned dates. + * @param format the format for the provided and returned dates. * @return the date it was nbrOfDaysBefore before date under the requested * format. */ def nDaysBeforeDate( nbrOfDaysBefore: Int, date: String, - format: String = "yyyyMMdd" + format: String ): String = { val currentDate = DateTimeFormat.forPattern(format).parseDateTime(date) @@ -134,36 +280,59 @@ object DateHelper extends Serializable { .print(currentDate.minusDays(nbrOfDaysBefore)) } + /** Returns which date it was x days before the given date. + * + * If the given date is "20170125" and we request the date it was 3 days + * before, this will return "20170122". + * + * {{{ assert(DateHelper.nDaysBeforeDate(5, "20170310") == "20170305") }}} + * + * @param nbrOfDaysBefore the nbr of days before the given date + * @param date the date under the default format for which we want the date + * for nbrOfDaysBefore days before. + * @return the date it was nbrOfDaysBefore before date under the default + * format. + */ + def nDaysBeforeDate(nbrOfDaysBefore: Int, date: String): String = + nDaysBeforeDate(nbrOfDaysBefore, date, defaultFormat) + /** Returns which date it will be x days after the given date. * * If the given date is "20170122" and we request the date it will be 3 days * after, we'll return "20170125". 
* - * {{{ - * assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310") - * assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") == "170310") - * }}} + * {{{ assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") == "170310") }}} * * @param nbrOfDaysAfter the nbr of days after the given date * @param date the date under the provided format for which we want the date * for nbrOfDaysAfter days after. - * @param format (default = "yyyyMMdd") the format for the provided and - * returned dates. + * @param format the format for the provided and returned dates. * @return the date it was nbrOfDaysAfter after date under the requested * format. */ def nDaysAfterDate( nbrOfDaysAfter: Int, date: String, - format: String = "yyyyMMdd" - ): String = { - - val currentDate = DateTimeFormat.forPattern(format).parseDateTime(date) + format: String + ): String = + nDaysBeforeDate(-nbrOfDaysAfter, date, format) - DateTimeFormat - .forPattern(format) - .print(currentDate.plusDays(nbrOfDaysAfter)) - } + /** Returns which date it will be x days after the given date under the + * default format. + * + * If the given date is "20170122" and we request the date it will be 3 days + * after, we'll return "20170125". + * + * {{{ assert(DateHelper.nDaysAfterDate(5, "20170305") == "20170310") }}} + * + * @param nbrOfDaysAfter the nbr of days after the given date + * @param date the date under the default format for which we want the date + * for nbrOfDaysAfter days after. + * @return the date it was nbrOfDaysAfter after date under the default + * format. + */ + def nDaysAfterDate(nbrOfDaysAfter: Int, date: String): String = + nDaysAfterDate(nbrOfDaysAfter, date, defaultFormat) /** Returns today's date/time under the requested format. * @@ -187,42 +356,70 @@ object DateHelper extends Serializable { * * {{{ * // If today's "20170310": - * assert(DateHelper.today() == "20170310") * assert(DateHelper.today("yyMMdd") == "170310") * }}} * - * @param format (default = "yyyyMMdd") the format for the current date + * @param format the format for the current date * @return today's date under the requested format */ - def today(format: String = "yyyyMMdd"): String = nDaysBefore(0, format) + def today(format: String): String = nDaysBefore(0, format) + + /** Returns today's date/time under the default format. + * + * {{{ + * // If today's "20170310": + * assert(DateHelper.today() == "20170310") + * }}} + * + * @return today's date under the default format + */ + def today(): String = nDaysBefore(0, defaultFormat) /** Returns yesterday's date/time under the requested format. * * {{{ * // If today's "20170310": - * assert(DateHelper.yesterday() == "20170309") * assert(DateHelper.yesterday("yyMMdd") == "170309") * }}} * - * @param format (default = "yyyyMMdd") the format in which to output the - * date of yesterday. + * @param format the format in which to output the date of yesterday * @return yesterday's date under the requested format */ - def yesterday(format: String = "yyyyMMdd"): String = nDaysBefore(1, format) + def yesterday(format: String): String = nDaysBefore(1, format) + + /** Returns yesterday's date/time under the default format. + * + * {{{ + * // If today's "20170310": + * assert(DateHelper.yesterday() == "20170309") + * }}} + * + * @return yesterday's date under the default format + */ + def yesterday(): String = nDaysBefore(1, defaultFormat) /** Returns which date it was 2 days before today under the requested format. 
* * {{{ * // If today's "20170310": - * assert(DateHelper.twoDaysAgo() == "20170308") * assert(DateHelper.twoDaysAgo("yyMMdd") == "170308") * }}} * - * @param format (default = "yyyyMMdd") the format in which to output the - * date of two days ago. + * @param format the format in which to output the date of two days ago * @return the date of two days ago under the requested format */ - def twoDaysAgo(format: String = "yyyyMMdd"): String = nDaysBefore(2, format) + def twoDaysAgo(format: String): String = nDaysBefore(2, format) + + /** Returns which date it was 2 days before today under the default format. + * + * {{{ + * // If today's "20170310": + * assert(DateHelper.twoDaysAgo() == "20170308") + * }}} + * + * @return the date of two days ago under the default format + */ + def twoDaysAgo(): String = nDaysBefore(2, defaultFormat) /** Reformats a date from one format to another. * @@ -265,17 +462,15 @@ object DateHelper extends Serializable { /** Returns for a date the date one day latter. * * {{{ - * // If the given date is "20170310": * assert(DateHelper.nextDay("20170310") == "20170311") * assert(DateHelper.nextDay("170310", "yyMMdd") == "170311") * }}} * * @param date the date for which to find the date of the day after - * @param format (default = "yyyyMMdd") the format of the provided and the - * returned dates. + * @param format the format of the provided and the returned dates * @return the date of the day after the given date */ - def nextDay(date: String, format: String = "yyyyMMdd"): String = { + def nextDay(date: String, format: String = defaultFormat): String = { val currentDate = DateTimeFormat.forPattern(format).parseDateTime(date) DateTimeFormat.forPattern(format).print(currentDate.plusDays(1)) } @@ -283,17 +478,15 @@ object DateHelper extends Serializable { /** Returns for a date the date one day before. * * {{{ - * // If the given date is "20170310": * assert(DateHelper.previousDay("20170310") == "20170309") * assert(DateHelper.previousDay("170310", "yyMMdd") == "170309") * }}} * * @param date the date for which to find the date of the day before - * @param format (default = "yyyyMMdd") the format of the provided and the - * returned dates. + * @param format the format of the provided and the returned dates * @return the date of the day before the given date */ - def previousDay(date: String, format: String = "yyyyMMdd"): String = { + def previousDay(date: String, format: String = defaultFormat): String = { val currentDate = DateTimeFormat.forPattern(format).parseDateTime(date) DateTimeFormat.forPattern(format).print(currentDate.minusDays(1)) } @@ -307,10 +500,10 @@ object DateHelper extends Serializable { * }}} * * @param date the date for which to find the nbr of days of diff with today - * @param format (default = "yyyyMMdd") the format of the provided date + * @param format the format of the provided date * @return the nbr of days between today and the given date */ - def nbrOfDaysSince(date: String, format: String = "yyyyMMdd"): Int = + def nbrOfDaysSince(date: String, format: String = defaultFormat): Int = Days .daysBetween( DateTimeFormat.forPattern(format).parseDateTime(date), @@ -331,13 +524,13 @@ object DateHelper extends Serializable { * days. * @param lastDate the last date of the range for which to egt the nbr of * days. 
- * @param format (default = "yyyyMMdd") the format of the provided dates + * @param format the format of the provided dates * @return the nbr of days between the two given dates */ def nbrOfDaysBetween( firstDate: String, lastDate: String, - format: String = "yyyyMMdd" + format: String = defaultFormat ): Int = { val formatter = DateTimeFormat.forPattern(format).withZone(DateTimeZone.UTC) @@ -359,12 +552,12 @@ object DateHelper extends Serializable { * * @param timestamp the UTC timestamps (nbr of millis since 1970-01-01) for * which to get the associated date. - * @param format (default = "yyyyMMdd") the format of the provided dates + * @param format the format of the provided dates * @return the associated date under the requested format */ def dateFromTimestamp( timestamp: Long, - format: String = "yyyyMMdd" + format: String = defaultFormat ): String = DateTimeFormat .forPattern(format) @@ -377,11 +570,10 @@ object DateHelper extends Serializable { * {{{ assert(DateHelper.dayOfWeek("20160614") == 2) }}} * * @param date the date for which to get the day of week - * @param format (default = "yyyyMMdd") the format under which the date is - * provided. + * @param format the format under which the date is provided * @return the associated day of week, such as 2 for Tuesday */ - def dayOfWeek(date: String, format: String = "yyyyMMdd"): Int = + def dayOfWeek(date: String, format: String = defaultFormat): Int = DateTimeFormat.forPattern(format).parseDateTime(date).getDayOfWeek() /** Validates a string date is under the provided format. diff --git a/src/test/scala/com/spark_helper/DateHelperTest.scala b/src/test/scala/com/spark_helper/DateHelperTest.scala index 6d27b81..30e3dd8 100644 --- a/src/test/scala/com/spark_helper/DateHelperTest.scala +++ b/src/test/scala/com/spark_helper/DateHelperTest.scala @@ -1,5 +1,7 @@ package com.spark_helper +import com.spark_helper.DateHelper._ + import org.scalatest.FunSuite import com.spark_helper.{DateHelper => DH} @@ -85,6 +87,7 @@ class DateHelperTest extends FunSuite { } test("Date versus provided format") { + assert(DateHelper.isDateCompliantWithFormat("20170302", "yyyyMMdd")) assert(!DateHelper.isDateCompliantWithFormat("20170333", "yyyyMMdd")) assert(DateHelper.isDateCompliantWithFormat("20170228", "yyyyMMdd")) @@ -93,5 +96,14 @@ class DateHelperTest extends FunSuite { assert(!DateHelper.isDateCompliantWithFormat("", "yyyyMMdd")) assert(!DateHelper.isDateCompliantWithFormat("a", "yyyyMMdd")) assert(!DateHelper.isDateCompliantWithFormat("24JAN17", "yyyyMMdd")) + + assert("20170302".isCompliantWith("yyyyMMdd")) + assert(!"20170333".isCompliantWith("yyyyMMdd")) + assert("20170228".isCompliantWith("yyyyMMdd")) + assert(!"20170229".isCompliantWith("yyyyMMdd")) + assert(!"170228".isCompliantWith("yyyyMMdd")) + assert(!"".isCompliantWith("yyyyMMdd")) + assert(!"a".isCompliantWith("yyyyMMdd")) + assert(!"24JAN17".isCompliantWith("yyyyMMdd")) } } From 6d2868e38e011db51f7677a5583a276188c9cfca Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Thu, 14 Jun 2018 23:27:17 +0100 Subject: [PATCH 22/25] Add 20161230 to 20170101 method --- README.md | 1 + .../scala/com/spark_helper/DateHelper.scala | 20 +++++++++++++++---- .../com/spark_helper/DateHelperTest.scala | 6 +++++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9ec5927..edbad1b 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ DateHelper.dayOfWeek("20160614") // 2 import com.spark_helper.DateHelper._ 2.daysAgo // "20170308" +"20161230" to "20170101" // 
List("20161230", "20161231", "20170101") 3.daysBefore("20170310") // "20170307" 5.daysAfter // "20170315" 4.daysAfter("20170310") // "20170314" diff --git a/src/main/scala/com/spark_helper/DateHelper.scala b/src/main/scala/com/spark_helper/DateHelper.scala index 9d11627..fdb1885 100644 --- a/src/main/scala/com/spark_helper/DateHelper.scala +++ b/src/main/scala/com/spark_helper/DateHelper.scala @@ -28,7 +28,6 @@ import scala.util.Try * Source DateHelper * - * @todo Is it possible to ddo something like ("20170325" to "20170327")? * @author Xavier Guihot * @since 2017-02 */ @@ -146,6 +145,19 @@ object DateHelper extends Serializable { * @return the date of the day before the given date */ def previousDay(): String = DateHelper.previousDay(string) + + /** Creates the list of dates between the two given dates. + * + * {{{ + * assert(("20161230" to "20170101") == List("20161230", "20161231", "20170101")) + * }}} + * + * @param lastDate the last date + * @return the list of dates between this string and the lastDate in the + * default format. + */ + def to(lastDate: String): List[String] = + DateHelper.daysBetween(string, lastDate) } /** Finds the list of dates between the two given dates. @@ -156,15 +168,15 @@ object DateHelper extends Serializable { * * @param firstDate the first date (in the given format) * @param lastDate the last date (in the given format) - * @param format (default = "yyyyMMdd") the format to use for firstDate and - * lastDate and for the returned list of dates. + * @param format the format to use for firstDate and lastDate and for the + * returned list of dates. * @return the list of dates between firstDate and lastDate in the given * format. */ def daysBetween( firstDate: String, lastDate: String, - format: String = "yyyyMMdd" + format: String = defaultFormat ): List[String] = { val formatter = DateTimeFormat.forPattern(format).withZone(DateTimeZone.UTC) diff --git a/src/test/scala/com/spark_helper/DateHelperTest.scala b/src/test/scala/com/spark_helper/DateHelperTest.scala index 30e3dd8..dd2a525 100644 --- a/src/test/scala/com/spark_helper/DateHelperTest.scala +++ b/src/test/scala/com/spark_helper/DateHelperTest.scala @@ -27,7 +27,11 @@ class DateHelperTest extends FunSuite { ) assert(dates === expectedDates) - // 2: With a custom formatter: + // 2: Same as 1, but using the pimped String: + dates = "20161229" to "20170103" + assert(dates === expectedDates) + + // 3: With a custom formatter: dates = DateHelper.daysBetween("29Dec16", "03Jan17", "ddMMMyy") expectedDates = List( "29Dec16", From 114dde301502661044af9fada0dec2c5604bef57 Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Fri, 15 Jun 2018 19:12:05 +0100 Subject: [PATCH 23/25] Minor refactoring - fix typos --- .gitignore | 2 + README.md | 33 ++++--- .../scala/com/spark_helper/DateHelper.scala | 30 +++--- .../scala/com/spark_helper/HdfsHelper.scala | 48 +++++----- src/main/scala/com/spark_helper/Monitor.scala | 93 +++++++++---------- .../scala/com/spark_helper/SparkHelper.scala | 65 +++++++------ .../com/spark_helper/monitoring/Test.scala | 12 +-- .../org/apache/spark/TextFileOverwrite.scala | 6 +- .../com/spark_helper/DateHelperTest.scala | 2 +- .../com/spark_helper/HdfsHelperTest.scala | 13 +-- .../scala/com/spark_helper/MonitorTest.scala | 77 +++++++-------- .../com/spark_helper/SparkHelperTest.scala | 67 ++++++------- 12 files changed, 210 insertions(+), 238 deletions(-) diff --git a/.gitignore b/.gitignore index d838934..b76ffde 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ project/target target 
*.crc + +.idea diff --git a/README.md b/README.md index edbad1b..26818eb 100644 --- a/README.md +++ b/README.md @@ -43,11 +43,11 @@ import com.spark_helper.HdfsHelper // A bunch of methods wrapping the FileSystem API, such as: HdfsHelper.fileExists("my/hdfs/file/path.txt") // HdfsHelper.folderExists("my/hdfs/folder") -assert(HdfsHelper.listFileNamesInFolder("my/folder/path") == List("file_name_1.txt", "file_name_2.csv")) -assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306") -assert(HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") == 3) +HdfsHelper.listFileNamesInFolder("my/folder/path") // List("file_name_1.txt", "file_name_2.csv") +HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") // "20170306" +HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") // 3 HdfsHelper.deleteFile("my/hdfs/file/path.csv") // HdfsHelper.deleteFolder("my/hdfs/folder") -HdfsHelper.moveFolder("my/hdfs/folder") // HdfsHelper.moveFile("my/hdfs/file.txt") +HdfsHelper.moveFolder("old/path", "new/path") // HdfsHelper.moveFile("old/path.txt", "new/path.txt") HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token") // HdfsHelper.createFolder("my/hdfs/folder") // File content helpers: @@ -103,14 +103,13 @@ rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec]) // splitting the input with \n), it splits the file in records based on a custom // delimiter. This way, xml, json, yml or any multi-line record file format can // be used with Spark: -sc.textFile("/my/input/folder/path", "---\n") +sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance // Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options: rdd.flatten -// Equivalent to sparkContext.textFile(), but for each line is tupled with its -// file path: -SparkHelper.textFileWithFileName("folder", sparkContext) +// Equivalent to sc.textFile(), but for each line is tupled with its file path: +sc.textFileWithFileName("/my/input/folder/path") // which produces: // RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"), // ("folder/file_2.txt", "record1fromfile2"), ...) @@ -176,15 +175,15 @@ DateHelper.setFormat("ddMMMyy") The full list of methods is available at [Monitor](http://xavierguihot.com/spark_helper/#com.spark_helper.Monitor$) -It's a simple logger/report which contains a report that one can update from -the driver and a success state. The idea is to persist job executions logs and -errors (and forget about grepping unreadable yarn logs). +It's a simple logger/report which contains a report and a state that one can +update from the driver. The idea is to persist job executions logs and errors +(and forget about grepping unreadable yarn logs). -It's designed for perdiodic spark jobs (handles storage and purge of logs) and +It's designed for periodic spark jobs (handles storage and purge of logs) and provides a way to handle kpis validation. Logs are stored on the go which means one can have a direct real time access of -the job logs/status and it's current state (which can overwise be a pain if it +the job logs/status and it's current state (which can otherwise be a pain if it means going through yarn logs, or even for certain production environments going through additional layers of software logs to get to yarn logs). @@ -198,9 +197,9 @@ the logger for a clean logging. This is a "driver-only" logger and is not intended at logging concurrent actions from executors. 
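For illustration only (this snippet is not part of the patch), the lifecycle described above boils down to the sketch below; every call except `setLogFolder` — whose exact signature is assumed here — appears verbatim in this README or in `MonitorTest` later in the series:

```scala
import com.spark_helper.Monitor

// Report header (title, contacts, description) and the folder where reports
// are persisted (setLogFolder's exact signature is an assumption here):
Monitor.setTitle("My Processing")
Monitor.addContacts(List("x.guihot@gmail.com"))
Monitor.addDescription("Documentation: https://github.com/xavierguihot/spark_helper")
Monitor.setLogFolder("my/hdfs/log/folder")

// Driver-side logging; updates are also flushed to logFolder/current.ongoing:
Monitor.log("Reading input")
Monitor.success("Computing KPIs")

// Persist the final report and act on the overall state:
Monitor.store()
if (!Monitor.isSuccess) throw new Exception("Job failed")
```

The fuller README example below adds error handling (`Monitor.error`) and KPI validation on top of this skeleton.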
-Produced reports can easily be inserted in a notification email whenerver the +Produced reports can easily be inserted in a notification email whenever the job fails, which saves a lot of time to maintainers operating on heavy -production environements. +production environments. The produced persisted report is also a way for downstream jobs to know the status of their input data. @@ -238,7 +237,7 @@ try { Monitor.error(e, "My pipeline descirption") // whatever unexpected error } -if (Monitor.isSuccess()) { +if (Monitor.isSuccess) { val doMore = "Let's do some more stuff!" Monitor.log("My second pipeline description: success") } @@ -249,7 +248,7 @@ Monitor.store() // At the end of the job, if the job isn't successfull, you might want to // crash it (for instance to get a notification from your scheduler): -if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ... +if (!Monitor.isSuccess) throw new Exception() // or send an email, or ... ``` At any time during the job, logs can be accessed from file diff --git a/src/main/scala/com/spark_helper/DateHelper.scala b/src/main/scala/com/spark_helper/DateHelper.scala index fdb1885..d393718 100644 --- a/src/main/scala/com/spark_helper/DateHelper.scala +++ b/src/main/scala/com/spark_helper/DateHelper.scala @@ -12,7 +12,7 @@ import scala.util.Try * spark job and replace it with methods fully tested whose name is * self-explanatory/readable. * - * A few exemples: + * A few examples: * * {{{ * assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101")) @@ -64,7 +64,7 @@ object DateHelper extends Serializable { * * @return today's date minus the given nbr of days */ - def daysAgo(): String = DateHelper.nDaysBefore(int) + def daysAgo: String = DateHelper.nDaysBefore(int) /** Returns which date it was x days before the given date. * @@ -92,7 +92,7 @@ object DateHelper extends Serializable { * * @return today's date plus the given nbr of days */ - def daysAfter(): String = DateHelper.nDaysAfter(int) + def daysAfter: String = DateHelper.nDaysAfter(int) /** Returns which date it will be x days after the given date under the * default format. @@ -112,7 +112,7 @@ object DateHelper extends Serializable { implicit class StringExtensions(val string: String) { - /** Validates the stringified date is compliant with the provided format. + /** Validates the formatted date is compliant with the provided format. * * {{{ * assert("20170302".isCompliantWith("yyyyMMdd")) @@ -136,7 +136,7 @@ object DateHelper extends Serializable { * * @return the date of the day after the given date */ - def nextDay(): String = DateHelper.nextDay(string) + def nextDay: String = DateHelper.nextDay(string) /** Returns the date one day before the given date. * @@ -144,7 +144,7 @@ object DateHelper extends Serializable { * * @return the date of the day before the given date */ - def previousDay(): String = DateHelper.previousDay(string) + def previousDay: String = DateHelper.previousDay(string) /** Creates the list of dates between the two given dates. 
* @@ -205,7 +205,7 @@ object DateHelper extends Serializable { ): List[DateTime] = { val nbrOfDaysWithinRange = - Days.daysBetween(jodaFirstDate, jodaLastDate).getDays() + Days.daysBetween(jodaFirstDate, jodaLastDate).getDays (0 to nbrOfDaysWithinRange).toList.map(jodaFirstDate.plusDays) } @@ -385,7 +385,7 @@ object DateHelper extends Serializable { * * @return today's date under the default format */ - def today(): String = nDaysBefore(0, defaultFormat) + def today: String = nDaysBefore(0, defaultFormat) /** Returns yesterday's date/time under the requested format. * @@ -408,7 +408,7 @@ object DateHelper extends Serializable { * * @return yesterday's date under the default format */ - def yesterday(): String = nDaysBefore(1, defaultFormat) + def yesterday: String = nDaysBefore(1, defaultFormat) /** Returns which date it was 2 days before today under the requested format. * @@ -460,7 +460,7 @@ object DateHelper extends Serializable { * @return the current timestamps (nbr of millis since 1970-01-01) in the * local computer's zone. */ - def currentTimestamp(): String = new DateTime().getMillis().toString + def currentTimestamp: String = new DateTime().getMillis.toString /** Returns the current UTC timestamp. * @@ -469,7 +469,7 @@ object DateHelper extends Serializable { * @return the current UTC timestamps (nbr of millis since 1970-01-01). */ def currentUtcTimestamp(): String = - new DateTime().withZone(DateTimeZone.UTC).getMillis().toString + new DateTime().withZone(DateTimeZone.UTC).getMillis.toString /** Returns for a date the date one day latter. * @@ -521,7 +521,7 @@ object DateHelper extends Serializable { DateTimeFormat.forPattern(format).parseDateTime(date), new DateTime() ) - .getDays() + .getDays /** Returns the nbr of days between the two given dates. * @@ -552,7 +552,7 @@ object DateHelper extends Serializable { formatter.parseDateTime(firstDate), formatter.parseDateTime(lastDate) ) - .getDays() + .getDays } /** Returns the date associated to the given UTC timestamp. @@ -586,7 +586,7 @@ object DateHelper extends Serializable { * @return the associated day of week, such as 2 for Tuesday */ def dayOfWeek(date: String, format: String = defaultFormat): Int = - DateTimeFormat.forPattern(format).parseDateTime(date).getDayOfWeek() + DateTimeFormat.forPattern(format).parseDateTime(date).getDayOfWeek /** Validates a string date is under the provided format. * @@ -601,7 +601,7 @@ object DateHelper extends Serializable { * assert(!DateHelper.isDateCompliantWithFormat("24JAN17", "yyyyMMdd")) * }}} * - * @param stringValue the stringified date + * @param stringValue the formatted date * @return if the provided date is under the provided format */ def isDateCompliantWithFormat( diff --git a/src/main/scala/com/spark_helper/HdfsHelper.scala b/src/main/scala/com/spark_helper/HdfsHelper.scala index 920c905..b4cd82a 100644 --- a/src/main/scala/com/spark_helper/HdfsHelper.scala +++ b/src/main/scala/com/spark_helper/HdfsHelper.scala @@ -36,7 +36,7 @@ import com.typesafe.config.{Config, ConfigFactory} * code and thus could instead just use * HdfsHelper.deleteFile("my/hdfs/file/path.csv"). 
* - * A few exemples: + * A few examples: * * {{{ * import com.spark_helper.HdfsHelper @@ -49,7 +49,7 @@ import com.typesafe.config.{Config, ConfigFactory} * HdfsHelper.deleteFile("my/hdfs/file/path.csv") * HdfsHelper.moveFolder("my/hdfs/folder") * HdfsHelper.compressFile("hdfs/path/to/uncompressed_file.txt", classOf[GzipCodec]) - * HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1") + * HdfsHelper.appendHeader("my/hdfs/file/path.csv", "column0,column1") * * // Some Xml/Typesafe helpers for hadoop as well: * HdfsHelper.isHdfsXmlCompliantWithXsd( @@ -57,7 +57,7 @@ import com.typesafe.config.{Config, ConfigFactory} * HdfsHelper.loadXmlFileFromHdfs("my/hdfs/file/path.xml") * * // Very handy to load a config (typesafe format) stored on hdfs at the - * // begining of a spark job: + * // beginning of a spark job: * HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config * * // In order to write small amount of data in a file on hdfs without the @@ -90,7 +90,7 @@ object HdfsHelper extends Serializable { * If this setter is not used, the default Configuration is set with * new Configuration(). * - * @param conf the specific Configuration to use + * @param configuration the specific Configuration to use */ def setConf(configuration: Configuration): Unit = { conf = configuration @@ -306,8 +306,8 @@ object HdfsHelper extends Serializable { /** Creates an empty file on hdfs. * - * Might be usefull for token files. For instance a file which is only used - * as a timestamp token of the last update of a processus, or a file which + * Might be useful for token files. For instance a file which is only used + * as a timestamp token of the last update of a process, or a file which * blocks the execution of an other instance of the same job, ... * * Overwrites the file if it already exists. @@ -398,7 +398,7 @@ object HdfsHelper extends Serializable { else if (recursive) listFileNamesInFolder( hdfsPath + "/" + status.getPath.getName, - true, + recursive = true, onlyName ) // If it's a dir and we're not in a recursive option: @@ -433,9 +433,9 @@ object HdfsHelper extends Serializable { * @return the joda DateTime of the last modification of the given file */ def fileModificationDateTime(hdfsPath: String): DateTime = - new DateTime(hdfs.getFileStatus(new Path(hdfsPath)).getModificationTime()) + new DateTime(hdfs.getFileStatus(new Path(hdfsPath)).getModificationTime) - /** Returns the stringified date of the last modification of the given file. + /** Returns the formatted date of the last modification of the given file. * * {{{ * assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306") @@ -445,7 +445,7 @@ object HdfsHelper extends Serializable { * modification date. * @param format (default = "yyyyMMdd") the format under which to get the * modification date. - * @return the stringified date of the last modification of the given file, + * @return the formatted date of the last modification of the given file, * under the provided format. */ def fileModificationDate( @@ -463,7 +463,7 @@ object HdfsHelper extends Serializable { def folderModificationDateTime(hdfsPath: String): DateTime = fileModificationDateTime(hdfsPath) - /** Returns the stringified date of the last modification of the given folder. + /** Returns the formatted date of the last modification of the given folder. * * {{{ * assert(HdfsHelper.folderModificationDate("my/hdfs/folder") == "20170306") @@ -473,7 +473,7 @@ object HdfsHelper extends Serializable { * modification date. 
* @param format (default = "yyyyMMdd") the format under which to get the * modification date. - * @return the stringified date of the last modification of the given folder, + * @return the formatted date of the last modification of the given folder, * under the provided format. */ def folderModificationDate( @@ -495,17 +495,17 @@ object HdfsHelper extends Serializable { def nbrOfDaysSinceFileWasLastModified(hdfsPath: String): Int = Days .daysBetween(fileModificationDateTime(hdfsPath), new DateTime()) - .getDays() + .getDays /** Appends a header and a footer to a file. * - * Usefull when creating an xml file with spark and you need to add top level + * Useful when creating an xml file with spark and you need to add top level * tags. * * If the workingFolderPath parameter is provided, then the processing is * done in a working/tmp folder and then only, the final file is moved to its * final real location. This way, in case of cluster instability, i.e. in - * case the Spark job is interupted, this avoids having a temporary or + * case the Spark job is interrupted, this avoids having a temporary or * corrupted file in output. * * @param filePath the path of the file for which to add the header and the @@ -528,13 +528,13 @@ object HdfsHelper extends Serializable { /** Appends a header to a file. * - * Usefull when creating a csv file with spark and you need to add a header + * Useful when creating a csv file with spark and you need to add a header * describing the different fields. * * If the workingFolderPath parameter is provided, then the processing is * done in a working/tmp folder and then only, the final file is moved to its * final real location. This way, in case of cluster instability, i.e. in - * case the Spark job is interupted, this avoids having a temporary or + * case the Spark job is interrupted, this avoids having a temporary or * corrupted file in output. * * @param filePath the path of the file for which to add the header @@ -557,7 +557,7 @@ object HdfsHelper extends Serializable { * If the workingFolderPath parameter is provided, then the processing is * done in a working/tmp folder and then only, the final file is moved to its * final real location. This way, in case of cluster instability, i.e. in - * case the Spark job is interupted, this avoids having a temporary or + * case the Spark job is interrupted, this avoids having a temporary or * corrupted file in output. * * @param filePath the path of the file for which to add the footer @@ -589,7 +589,7 @@ object HdfsHelper extends Serializable { validateHdfsXmlWithXsd(hdfsXmlPath, xsdFile) true } catch { - case saxe: SAXException => false + case _: SAXException => false } /** Validates an XML file on hdfs in regard to the given XSD. @@ -615,7 +615,7 @@ object HdfsHelper extends Serializable { validator.validate(xmlFile) } - /** Loads a typesafe config from Hdfs. + /** Loads a Typesafe config from Hdfs. * * The best way to load the configuration of your job from hdfs. * @@ -643,8 +643,8 @@ object HdfsHelper extends Serializable { * } * }}} * - * @param hdfsConfigPath the absolute path of the typesafe config file on - * hdfs we want to load as a typesafe Config object. + * @param hdfsConfigPath the absolute path of the Typesafe config file on + * hdfs we want to load as a Typesafe Config object. 
* @return the com.typesafe.config.Config object which contains usable data */ def loadTypesafeConfigFromHdfs(hdfsConfigPath: String): Config = { @@ -746,8 +746,8 @@ object HdfsHelper extends Serializable { .filter(path => { val fileAgeInDays = Days - .daysBetween(new DateTime(path.getModificationTime()), new DateTime()) - .getDays() + .daysBetween(new DateTime(path.getModificationTime), new DateTime()) + .getDays fileAgeInDays >= purgeAge diff --git a/src/main/scala/com/spark_helper/Monitor.scala b/src/main/scala/com/spark_helper/Monitor.scala index 3866cbb..f5eeeca 100644 --- a/src/main/scala/com/spark_helper/Monitor.scala +++ b/src/main/scala/com/spark_helper/Monitor.scala @@ -6,19 +6,17 @@ import java.util.Calendar import org.apache.commons.lang3.time.DurationFormatUtils -import java.lang.Throwable - -/** A logger dedicated to Spak jobs. +/** A logger dedicated to Spark jobs. * * It's a simple logger/report which contains a report that one can update from * the driver and a success state. The idea is to persist job executions logs * and errors (and forget about grepping unreadable yarn logs). * - * It's designed for perdiodic spark jobs (handles storage and purge of logs) + * It's designed for periodic spark jobs (handles storage and purge of logs) * and provides a way to handle kpis validation. * * Logs are stored on the go which means one can have a direct real time access - * of the job logs/status and it's current state (which can overwise be a pain + * of the job logs/status and it's current state (which can otherwise be a pain * if it means going through yarn logs, or even for certain production * environments going through additional layers of software logs to get to yarn * logs). @@ -33,9 +31,9 @@ import java.lang.Throwable * This is a "driver-only" logger and is not intended at logging concurrent * actions from executors. * - * Produced reports can easily be inserted in a notification email whenerver + * Produced reports can easily be inserted in a notification email whenever * the job fails, which saves a lot of time to maintainers operating on heavy - * production environements. + * production environments. * * The produced persisted report is also a way for downstream jobs to know the * status of their input data. @@ -61,7 +59,7 @@ import java.lang.Throwable * Test("Nbr of output records", processedData.count(), SUPERIOR_THAN, 10e6d, NBR), * Test("Some pct of invalid output", your_complex_kpi, INFERIOR_THAN, 3, PCT) * ), - * "My pipeline descirption" + * "My pipeline description" * ) * * if (outputIsValid) @@ -69,9 +67,9 @@ import java.lang.Throwable * * } catch { * case iie: InvalidInputException => - * Monitor.error(iie, "My pipeline descirption", diagnostic = "No input data!") + * Monitor.error(iie, "My pipeline description", diagnostic = "No input data!") * case e: Throwable => - * Monitor.error(e, "My pipeline descirption") // whatever unexpected error + * Monitor.error(e, "My pipeline description") // whatever unexpected error * } * * if (Monitor.isSuccess()) { @@ -83,7 +81,7 @@ import java.lang.Throwable * // HDFS (this saves the logs in the folder set with Monitor.setLogFolder): * Monitor.store() * - * // At the end of the job, if the job isn't successfull, you might want to + * // At the end of the job, if the job isn't successful, you might want to * // crash it (for instance to get a notification from your scheduler): * if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ... 
* }}} @@ -100,8 +98,8 @@ import java.lang.Throwable * * My job description (whatever you want); for instance: * Documentation: https://github.com/xavierguihot/spark_helper - * [10:23] Begining - * [10:23-10:23] My pipeline descirption: failed + * [10:23] Beginning + * [10:23-10:23] My pipeline description: failed * Diagnostic: No input data! * org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path * at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285) @@ -116,8 +114,8 @@ import java.lang.Throwable * * My job description (whatever you want); for instance: * Documentation: https://github.com/xavierguihot/spark_helper - * [10:23] Begining - * [10:23-10:36] My pipeline descirption: failed + * [10:23] Beginning + * [10:23-10:36] My pipeline description: failed * java.lang.NumberFormatException: For input string: "a" * java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) * java.lang.Integer.parseInt(Integer.java:492) @@ -125,14 +123,14 @@ import java.lang.Throwable * [10:36] Duration: 00:13:47 * }}} * - * Another scenario, successfull spark pipeline and KPIs are valid; all good!: + * Another scenario, successful spark pipeline and KPIs are valid; all good!: * {{{ * My job title * * My job description (whatever you want); for instance: * Documentation: https://github.com/xavierguihot/spark_helper - * [10:23] Begining - * [10:23-10:41] My pipeline descirption: success + * [10:23] Beginning + * [10:23-10:41] My pipeline description: success * KPI: Nbr of output records * Value: 14669071.0 * Must be superior than 10000000.0 @@ -148,6 +146,7 @@ import java.lang.Throwable * Source Monitor * + * @todo would a State monad be appropriate? * @author Xavier Guihot * @since 2017-02 */ @@ -159,13 +158,13 @@ object Monitor { private var logDirectory: Option[String] = None private var purgeWindow: Option[Int] = None - private val jobStart = DateHelper.now("[HH:mm]") + " Begining" + private val jobStart = DateHelper.now("[HH:mm]") + " Beginning" // Join of reportTitle, pointsOfContact, reportDescription, logDirectory and // jobStart: private var reportHeader = buildReportHeader() - private val begining = Calendar.getInstance().getTimeInMillis() + private val beginning = Calendar.getInstance().getTimeInMillis private var lastReportUpdate = DateHelper.now("HH:mm") /** Sets the report's title. @@ -175,7 +174,7 @@ object Monitor { * {{{ * // Using: * Monitor.setReportTitle("My Simple Job") - * // Produces this at the begining of the report: + * // Produces this at the beginning of the report: * " My Simple Job" * "" * }}} @@ -196,7 +195,7 @@ object Monitor { * // Using: * Monitor.setReportTitle("My Simple Job") * Monitor.addContacts(List("x.guihot@gmail.com", "smbdy@gmail.com")) - * // Produces this at the begining of the report: + * // Produces this at the beginning of the report: * " My Simple Job" * "" * "Point of contact: x.guihot@gmail.com, smbdy@gmail.com" @@ -218,7 +217,7 @@ object Monitor { * // Using: * Monitor.setReportTitle("My Simple Job") * Monitor.addDescription("Documentation: https://github.com/xavierguihot/spark_helper") - * // Produces this at the begining of the report: + * // Produces this at the beginning of the report: * " My Simple Job" * "" * "Documentation: https://github.com/xavierguihot/spark_helper" @@ -269,7 +268,7 @@ object Monitor { * * @return if your spark job is successful. 
*/ - def isSuccess(): Boolean = successful + def isSuccess: Boolean = successful /** Returns the current state of the monitoring report. * @@ -286,7 +285,7 @@ object Monitor { * * @param text the text to append to the report */ - def log(text: String): Unit = log(text, true) + def log(text: String): Unit = log(text, withTimestamp = true) /** Updates the report with some text and a success. * @@ -316,7 +315,7 @@ object Monitor { * will result in this to be appended to the report: * {{{ "[10:35-10:37] Some text: failure\n" }}} * - * Once the monitoring is a failure, then whatever following successfull + * Once the monitoring is a failure, then whatever following successful * action won't change the failed status of the monitoring. * * @param taskDescription the text to append to the report @@ -342,12 +341,12 @@ object Monitor { * {{{ * monitor.error( * invalidInputException, - * "My pipeline descirption", + * "My pipeline description", * diagnostic = "No input data!") * }}} * will result in this to be appended to the report: * {{{ - * [10:23-10:24] My pipeline descirption: failed + * [10:23-10:24] My pipeline description: failed * Diagnostic: No input data! * org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path * at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285) @@ -370,7 +369,7 @@ object Monitor { successful = false val serializedException = - "\t\t" + exception.toString() + "\n" + + "\t\t" + exception.toString + "\n" + exception.getStackTrace.map(line => s"\t\t$line").mkString("\n") val update = List( @@ -433,14 +432,13 @@ object Monitor { if (!testsAreValid) successful = false - val seriralizedTests = tests.mkString("\n") + val serializedTests = tests.mkString("\n") val update = testSuitName match { - case "" => seriralizedTests - case _ => { + case "" => serializedTests + case _ => val status = if (testsAreValid) "success" else "failed" - s"$testSuitName: $status\n$seriralizedTests" - } + s"$testSuitName: $status\n$serializedTests" } log(update) @@ -506,11 +504,10 @@ object Monitor { logDirectory match { - case Some(logFolder) => { - + case Some(logFolder) => // We add the job duration to the report: val jobDuration = DurationFormatUtils.formatDuration( - Calendar.getInstance().getTimeInMillis() - begining, + Calendar.getInstance().getTimeInMillis - beginning, "HH:mm:ss") var now = DateHelper.now("[HH:mm]") @@ -532,7 +529,6 @@ object Monitor { .writeToHdfsFile(finalReport, s"$logFolder/current.$reportExtension") purgeWindow.foreach(window => purgeOutdatedLogs(logFolder, window)) - } case None => require( @@ -583,20 +579,17 @@ object Monitor { /** Updates the current stored version of logs in file * logFolder/current.ongoing */ private def storeCurrent(): Unit = - logDirectory.foreach { - case logFolder => { - - val warning = - "WARNING: If this file exists it does not necessarily mean that " + - "your job is still running. This file might persist if your job " + - "has been killed and thus couldn't reach your call to the " + - "Monitor.store()." + logDirectory.foreach { logFolder => + val warning = + "WARNING: If this file exists it does not necessarily mean that " + + "your job is still running. This file might persist if your job " + + "has been killed and thus couldn't reach your call to the " + + "Monitor.store()." 
- val ongoingReport = - s"$reportHeader\n$report\n$warning" + val ongoingReport = + s"$reportHeader\n$report\n$warning" - HdfsHelper.writeToHdfsFile(ongoingReport, s"$logFolder/current.ongoing") - } + HdfsHelper.writeToHdfsFile(ongoingReport, s"$logFolder/current.ongoing") } private def purgeOutdatedLogs(logFolder: String, window: Int): Unit = { diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index 336ff4a..f87edf9 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -21,7 +21,7 @@ import scala.util.Random * spark job and replace it with methods fully tested whose name is * self-explanatory/readable. * - * A few exemples: + * A few examples: * * {{{ * // Same as sc.saveAsTextFile(path), but the result is a single file: @@ -83,7 +83,7 @@ object SparkHelper extends Serializable { /** Saves an RDD in exactly one file. * * Allows one to save an RDD in one file, while keeping the processing - * parallelized. + * distributed. * * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt") }}} * @@ -95,7 +95,7 @@ object SparkHelper extends Serializable { /** Saves an RDD in exactly one file. * * Allows one to save an RDD in one file, while keeping the processing - * parallelized. + * distributed. * * {{{ rdd.saveAsSingleTextFile("/my/file/path.txt", classOf[BZip2Codec]) }}} * @@ -112,7 +112,7 @@ object SparkHelper extends Serializable { /** Saves an RDD in exactly one file. * * Allows one to save an RDD in one file, while keeping the processing - * parallelized. + * distributed. * * This variant of saveAsSingleTextFile * performs the storage in a temporary folder instead of directly in the @@ -136,7 +136,7 @@ object SparkHelper extends Serializable { /** Saves an RDD in exactly one file. * * Allows one to save an RDD in one file, while keeping the processing - * parallelized. + * distributed. * * This variant of saveAsSingleTextFile * performs the storage in a temporary folder instead of directly in the @@ -174,8 +174,8 @@ object SparkHelper extends Serializable { * The result is equivalent to rdd.coalesce(x).saveAsTextFile() * , but if x * is very low, coalesce - * would make the processing time explode, wherease this methods keeps the - * processing parallelized, save as text file and then only merges the + * would make the processing time explode, whereas this methods keeps the + * processing distributed, save as text file and then only merges the * result in a lower nbr of partitions. * * {{{ rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30) }}} @@ -219,8 +219,8 @@ object SparkHelper extends Serializable { * The result is equivalent to rdd.coalesce(x).saveAsTextFile() * , but if x * is very low, coalesce - * would make the processing time explode, wherease this methods keeps the - * processing parallelized, save as text file and then only merges the + * would make the processing time explode, whereas this methods keeps the + * processing distributed, save as text file and then only merges the * result in a lower nbr of partitions. * * {{{ rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30, classOf[BZip2Codec]) }}} @@ -261,7 +261,7 @@ object SparkHelper extends Serializable { implicit class SeqRDDExtensions[T: ClassTag](val rdd: RDD[Seq[T]]) { - /** Flattens an RDD[Seq[T]] + /** Flattens an RDD of Seq[T] * to RDD[T]. 
* * {{{ sc.parallelize(Array(Seq(1, 2, 3), Nil, Seq(4))).flatten == sc.parallelize(Array(Seq(1, 2, 3, 4))) }}} @@ -270,12 +270,12 @@ object SparkHelper extends Serializable { * or List.flatten * would have. */ - def flatten(): RDD[T] = rdd.flatMap(identity) + def flatten: RDD[T] = rdd.flatMap(identity) } implicit class OptionRDDExtensions[T: ClassTag](val rdd: RDD[Option[T]]) { - /** Flattens an RDD[Option[T]] + /** Flattens an RDD of Option[T] * to RDD[T]. * * {{{ sc.parallelize(Array(Some(1), None, Some(2))).flatten == sc.parallelize(Array(Seq(1, 2))) }}} @@ -284,7 +284,7 @@ object SparkHelper extends Serializable { * or List.flatten * would have. */ - def flatten(): RDD[T] = rdd.flatMap(o => o) + def flatten: RDD[T] = rdd.flatMap(o => o) } implicit class PairRDDExtensions(val rdd: RDD[(String, String)]) @@ -306,7 +306,7 @@ object SparkHelper extends Serializable { * * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path") }}} * - * @param path the folder where will be storrred key files + * @param path the folder where will be stored key files */ def saveAsTextFileByKey(path: String): Unit = SparkHelper.saveAsTextFileByKeyInternal(rdd, path, None, None) @@ -322,8 +322,8 @@ object SparkHelper extends Serializable { * * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", 12) }}} * - * @param path the folder where will be storrred key files - * @param keyNbr the nbr of expected keys (which is the nbr of outputed + * @param path the folder where will be stored key files + * @param keyNbr the nbr of expected keys (which is the nbr of output * files) */ def saveAsTextFileByKey(path: String, keyNbr: Int): Unit = @@ -346,7 +346,7 @@ object SparkHelper extends Serializable { * * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", classOf[BZip2Codec]) }}} * - * @param path the folder where will be storrred key files + * @param path the folder where will be stored key files * @param codec the type of compression to use (for instance * classOf[BZip2Codec] or classOf[GzipCodec])) */ @@ -367,8 +367,8 @@ object SparkHelper extends Serializable { * * {{{ rdd.saveAsTextFileByKey("/my/output/folder/path", 12, classOf[BZip2Codec]) }}} * - * @param path the folder where will be storrred key files - * @param keyNbr the nbr of expected keys (which is the nbr of outputed + * @param path the folder where will be stored key files + * @param keyNbr the nbr of expected keys (which is the nbr of output * files) * @param codec the type of compression to use (for instance * classOf[BZip2Codec] or classOf[GzipCodec])) @@ -667,11 +667,11 @@ object SparkHelper extends Serializable { /** Saves RDD in exactly one file. * * Allows one to save an RDD as one text file, but at the same time to keep - * the processing parallelized. + * the processing distributed. 
* * @param outputRDD the RDD of strings to save as text file * @param path the path where to save the file - * @param compression the compression codec to use (can be left to None) + * @param codec the compression codec to use (can be left to None) */ private def saveAsSingleTextFileInternal( outputRDD: RDD[String], @@ -685,8 +685,8 @@ object SparkHelper extends Serializable { // Classic saveAsTextFile in a temporary folder: HdfsHelper.deleteFolder(s"$path.tmp") codec match { - case Some(codec) => - outputRDD.saveAsTextFile(s"$path.tmp", codec) + case Some(compression) => + outputRDD.saveAsTextFile(s"$path.tmp", compression) case None => outputRDD.saveAsTextFile(s"$path.tmp") } @@ -718,25 +718,22 @@ object SparkHelper extends Serializable { val isCached = rdd.getStorageLevel.useMemory // If the nbr of keys isn't provided, we have to get it ourselves: - val keyNbr = optKeyNbr match { - case Some(keyNbr) => - keyNbr - case None => - if (!isCached) - rdd.cache() - rdd.keys.distinct.count.toInt + val keyNbr = optKeyNbr.getOrElse { + if (!isCached) + rdd.cache() + rdd.keys.distinct.count.toInt } val prdd = rdd.partitionBy(new HashPartitioner(keyNbr)) codec match { - case Some(codec) => + case Some(compression) => prdd.saveAsHadoopFile( path, classOf[String], classOf[String], classOf[KeyBasedOutput], - codec + compression ) case None => prdd.saveAsHadoopFile( @@ -764,8 +761,8 @@ object SparkHelper extends Serializable { .coalesce(finalCoalesceLevel) codec match { - case Some(codec) => - intermediateRDD.saveAsTextFile(lowerCoalescenceLevelFolder, codec) + case Some(compression) => + intermediateRDD.saveAsTextFile(lowerCoalescenceLevelFolder, compression) case None => intermediateRDD.saveAsTextFile(lowerCoalescenceLevelFolder) } diff --git a/src/main/scala/com/spark_helper/monitoring/Test.scala b/src/main/scala/com/spark_helper/monitoring/Test.scala index 97942e2..80b3ad7 100644 --- a/src/main/scala/com/spark_helper/monitoring/Test.scala +++ b/src/main/scala/com/spark_helper/monitoring/Test.scala @@ -7,7 +7,7 @@ import java.lang.Math.abs * This is intended to be used as parameter of Monitor.updateByKpiValidation * and Monitor.updateByKpisValidation methods. * - * Some exemples of Test objects: + * Some examples of Test objects: * {{{ * Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT) * Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT) @@ -19,7 +19,7 @@ import java.lang.Math.abs * * @constructor Creates a Test object. 
* - * Some exemples of Test objects: + * Some examples of Test objects: * {{{ * Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT) * Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT) @@ -42,22 +42,22 @@ final case class Test( kpiType: KpiType ) { - private[spark_helper] def isSuccess(): Boolean = thresholdType match { + private[spark_helper] def isSuccess: Boolean = thresholdType match { case EQUAL_TO => kpiValue == appliedThreshold case SUPERIOR_THAN => abs(kpiValue) >= appliedThreshold case INFERIOR_THAN => abs(kpiValue) <= appliedThreshold } - override def toString(): String = + override def toString: String = List( "\tKPI: " + description, "\t\tValue: " + kpiValue.toString + kpiType.name, "\t\tMust be " + thresholdType.name + " " + appliedThreshold.toString + kpiType.name, - "\t\tValidated: " + isSuccess().toString + "\t\tValidated: " + isSuccess.toString ).mkString("\n") } -/** An enumeration which represents the type of threshol to use (EQUAL_TO, +/** An enumeration which represents the type of threshold to use (EQUAL_TO, * SUPERIOR_THAN or INFERIOR_THAN) */ sealed trait ThresholdType { def name: String } diff --git a/src/main/scala/org/apache/spark/TextFileOverwrite.scala b/src/main/scala/org/apache/spark/TextFileOverwrite.scala index 8d1cc1b..28935ea 100644 --- a/src/main/scala/org/apache/spark/TextFileOverwrite.scala +++ b/src/main/scala/org/apache/spark/TextFileOverwrite.scala @@ -29,9 +29,9 @@ object TextFileOverwrite { * version of hadoop-common used by Spark (it will become Serializable * starting version 3 of hadoop-common). * - * * I don't String* (instead of Seq[String]) as for 1 String only it would - * confuse the compiler as to which sc.textFile to use (the default one or - * this one). + * * I don't use String* (instead of Seq[String]) as for 1 String only it + * would confuse the compiler as to which sc.textFile to use (the default + * one or this one). 
*/ val confBroadcast = diff --git a/src/test/scala/com/spark_helper/DateHelperTest.scala b/src/test/scala/com/spark_helper/DateHelperTest.scala index dd2a525..5c68404 100644 --- a/src/test/scala/com/spark_helper/DateHelperTest.scala +++ b/src/test/scala/com/spark_helper/DateHelperTest.scala @@ -81,7 +81,7 @@ class DateHelperTest extends FunSuite { assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") === "170305") } - test("Date it will be N days affter date") { + test("Date it will be N days after date") { assert(DateHelper.nDaysAfterDate(3, "20170307") === "20170310") assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") === "170310") } diff --git a/src/test/scala/com/spark_helper/HdfsHelperTest.scala b/src/test/scala/com/spark_helper/HdfsHelperTest.scala index f96db97..b64306d 100644 --- a/src/test/scala/com/spark_helper/HdfsHelperTest.scala +++ b/src/test/scala/com/spark_helper/HdfsHelperTest.scala @@ -158,7 +158,8 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { // 3: Using the pimped Seq/String: - listToStore.toSeq.writeToHdfs(filePath) + val seqToStore = Seq("Hello World", "Whatever") + seqToStore.writeToHdfs(filePath) assert(HdfsHelper.fileExists(filePath)) storedContent = sc.textFile(filePath).collect().sorted.mkString("\n") assert(storedContent === contentToStore) @@ -281,7 +282,7 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { assert(HdfsHelper.fileExists(filePath)) assert(!HdfsHelper.fileExists(renamedPath)) - // 3: Let's successfuly move the file with the moveFile() method: + // 3: Let's successfully move the file with the moveFile() method: // Let's rename the file: HdfsHelper.moveFile(filePath, renamedPath) @@ -326,7 +327,7 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { assert(HdfsHelper.fileExists(s"$folderToMove/file_2.txt")) assert(!HdfsHelper.folderExists(renamedFolder)) - // 2: Let's successfuly move the folder with the moveFolder() method: + // 2: Let's successfully move the folder with the moveFolder() method: // Let's rename the folder: HdfsHelper.moveFolder(folderToMove, renamedFolder) @@ -411,7 +412,7 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { HdfsHelper.deleteFolder(testFolder) HdfsHelper.writeToHdfsFile( "\n" + - " trente\n" + + " thirty\n" + "
34 thingy street, someplace, sometown
\n" + "
", xmlPath @@ -500,11 +501,11 @@ class HdfsHelperTest extends FunSuite with SharedSparkContext { HdfsHelper.deleteFile(filePath) HdfsHelper.writeToHdfsFile("hello\nworld", filePath) - HdfsHelper.compressFile(filePath, classOf[GzipCodec], true) + HdfsHelper.compressFile(filePath, classOf[GzipCodec]) assert(HdfsHelper.fileExists(s"$filePath.gz")) - // Easy to test with spark, as reading a file with the ".gz" extention + // Easy to test with spark, as reading a file with the ".gz" extension // forces the read with the compression codec: val content = sc.textFile(s"$filePath.gz").collect.sorted assert(content === Array("hello", "world")) diff --git a/src/test/scala/com/spark_helper/MonitorTest.scala b/src/test/scala/com/spark_helper/MonitorTest.scala index 45381ac..bda85da 100644 --- a/src/test/scala/com/spark_helper/MonitorTest.scala +++ b/src/test/scala/com/spark_helper/MonitorTest.scala @@ -17,12 +17,12 @@ class MonitorTest extends FunSuite with SharedSparkContext { test("Basic monitoring testing") { - // Monitor is initialy successful: - assert(Monitor.isSuccess()) + // Monitor is initially successful: + assert(Monitor.isSuccess) // Here is what a report generated without any additional settings should // look like: var report = removeTimeStamps(Monitor.logs()) - assert(report === "[..:..] Begining\n") + assert(report === "[..:..] Beginning\n") // Include additional info which are placed in the report's header: Monitor.setTitle("Processing of whatever") @@ -30,33 +30,30 @@ class MonitorTest extends FunSuite with SharedSparkContext { Monitor.addDescription( "Documentation: https://github.com/xavierguihot/spark_helper") report = removeTimeStamps(Monitor.logs()) - var expectedReport = ( + var expectedReport = " Processing of whatever\n" + "\n" + "Point of contact: x.guihot@gmail.com, smbdy@gmail.com\n" + "Documentation: https://github.com/xavierguihot/spark_helper\n" + - "[..:..] Begining\n" - ) + "[..:..] Beginning\n" assert(report === expectedReport) // Simple text update without success modification: Monitor.reset() Monitor.log("My First Stage") report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] My First Stage\n" - ) assert(report === expectedReport) // Let's call .log() another time: Monitor.log("My Second Stage") report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] My First Stage\n" + "[..:..-..:..] My Second Stage\n" - ) assert(report === expectedReport) // Successive updates: @@ -64,33 +61,30 @@ class MonitorTest extends FunSuite with SharedSparkContext { Monitor.reset() Monitor.success("My First Stage") report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] My First Stage: success\n" - ) assert(report === expectedReport) - assert(Monitor.isSuccess()) + assert(Monitor.isSuccess) // Update report with a failure: Monitor.error("My Second Stage") report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] My First Stage: success\n" + "[..:..-..:..] 
My Second Stage: failed\n" - ) assert(report === expectedReport) - assert(!Monitor.isSuccess()) + assert(!Monitor.isSuccess) // A success after a failure, which must not overwrite the failure: Monitor.success("My Third Stage") report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] My First Stage: success\n" + "[..:..-..:..] My Second Stage: failed\n" + "[..:..-..:..] My Third Stage: success\n" - ) assert(report === expectedReport) - assert(!Monitor.isSuccess()) + assert(!Monitor.isSuccess) } test("Check current.ongoing live monitoring") { @@ -112,18 +106,17 @@ class MonitorTest extends FunSuite with SharedSparkContext { .toList .mkString("\n") - val expectedReport = ( + val expectedReport = " My Processing\n" + "\n" + "Point of contact: x.guihot@gmail.com, smbdy@gmail.com\n" + "Documentation: https://github.com/xavierguihot/spark_helper\n" + - "[..:..] Begining\n" + + "[..:..] Beginning\n" + "[..:..-..:..] Doing something\n" + "\n" + "WARNING: If this file exists it does not necessarily mean that " + "your job is still running. This file might persist if your job has " + "been killed and thus couldn't reach your call to the Monitor.store()." - ) assert(removeTimeStamps(reportStoredLines) === expectedReport) } @@ -132,7 +125,7 @@ class MonitorTest extends FunSuite with SharedSparkContext { Monitor.reset() // Explanation to someone running tests and seeing an error stack trace - // even though tests are actually successfull: + // even though tests are actually successful: println( "README: The following stack trace is NOT a test failure. This " + "is the logging/print of the tested stack trace error as it would " + @@ -146,14 +139,13 @@ class MonitorTest extends FunSuite with SharedSparkContext { Monitor.error(nfe, "Parse to integer", "my diagnostic") } // Warning, here I remove the stack trace because it depends on the - // java/scala version! And yes this test is a bit less usefull. + // java/scala version! And yes this test is a bit less useful. val report = removeTimeStamps(Monitor.logs()).split("\n").take(3).mkString("\n") - val expectedReport = ( - "[..:..] Begining\n" + + val expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] Parse to integer: failed\n" + " Diagnostic: my diagnostic" - ) assert(report === expectedReport) } @@ -171,11 +163,11 @@ class MonitorTest extends FunSuite with SharedSparkContext { ) assert(!success) - assert(!Monitor.isSuccess()) + assert(!Monitor.isSuccess) var report = removeTimeStamps(Monitor.logs()) - var expectedReport = ( - "[..:..] Begining\n" + + var expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] Tests for whatever: failed\n" + " KPI: pctOfWhatever\n" + " Value: 0.06%\n" + @@ -189,7 +181,6 @@ class MonitorTest extends FunSuite with SharedSparkContext { " Value: 1235.0\n" + " Must be equal to 1235.0\n" + " Validated: true\n" - ) assert(report === expectedReport) // 2: Single test: @@ -199,17 +190,16 @@ class MonitorTest extends FunSuite with SharedSparkContext { "Tests for whatever") assert(success) - assert(Monitor.isSuccess()) + assert(Monitor.isSuccess) report = removeTimeStamps(Monitor.logs()) - expectedReport = ( - "[..:..] Begining\n" + + expectedReport = + "[..:..] Beginning\n" + "[..:..-..:..] 
Tests for whatever: success\n" + " KPI: someNbr\n" + " Value: 5.5E7\n" + " Must be superior than 5.0E7\n" + " Validated: true\n" - ) assert(report === expectedReport) } @@ -235,15 +225,14 @@ class MonitorTest extends FunSuite with SharedSparkContext { .mkString("\n") .dropRight(2) + "00" // removes the seconds of the job duration - val expectedReport = ( + val expectedReport = " My Processing\n" + "\n" + "Point of contact: x.guihot@gmail.com\n" + "Documentation: https://github.com/xavierguihot/spark_helper\n" + - "[..:..] Begining\n" + + "[..:..] Beginning\n" + "[..:..-..:..] Doing something: success\n" + "[..:..] Duration: 00:00:00" - ) assert(removeTimeStamps(reportStoredLines) === expectedReport) } @@ -296,7 +285,7 @@ class MonitorTest extends FunSuite with SharedSparkContext { timeStampFreeLogs.substring(0, index) + "[..:..-..:..]" + timeStampFreeLogs.substring(index + 13) - index = timeStampFreeLogs.indexOf("[", index + 1); + index = timeStampFreeLogs.indexOf("[", index + 1) } timeStampFreeLogs diff --git a/src/test/scala/com/spark_helper/SparkHelperTest.scala b/src/test/scala/com/spark_helper/SparkHelperTest.scala index 21d01e0..70d706d 100644 --- a/src/test/scala/com/spark_helper/SparkHelperTest.scala +++ b/src/test/scala/com/spark_helper/SparkHelperTest.scala @@ -75,35 +75,30 @@ class SparkHelperTest val weirdFormatFilePath = s"$resourceFolder/some_weird_format.txt" - // 1: Let's read a file where a record begins with a line begining with - // 3 and other lines begining by 4: + // 1: Let's read a file where a record begins with a line beginning with + // 3 and other lines beginning by 4: HdfsHelper.deleteFile(weirdFormatFilePath) - val textContent = ( + val textContent = "3 first line of the first record\n" + "4 another line of the first record\n" + "4 and another one for the first record\n" + "3 first line of the second record\n" + "3 first line of the third record\n" + "4 another line for the third record" - ) HdfsHelper.writeToHdfsFile(textContent, weirdFormatFilePath) var computedRecords = sc.textFile(weirdFormatFilePath, "\n3").collect() var expectedRecords = Array( - ( - "3 first line of the first record\n" + - "4 another line of the first record\n" + - "4 and another one for the first record" - ), + "3 first line of the first record\n" + + "4 another line of the first record\n" + + "4 and another one for the first record", " first line of the second record", - ( - " first line of the third record\n" + - "4 another line for the third record" - ) + " first line of the third record\n" + + "4 another line for the third record" ) assert(computedRecords === expectedRecords) @@ -132,15 +127,11 @@ class SparkHelperTest expectedRecords = Array( "\n", - ( - "
34 thingy street, someplace, sometown
\n" + - "
\n" - ), - ( - "
12 thingy street, someplace, sometown
\n" + - "
\n" + - "" - ) + "
34 thingy street, someplace, sometown
\n" + + "\n", + "
12 thingy street, someplace, sometown
\n" + + "\n" + + "" ) assert(computedRecords === expectedRecords) @@ -167,7 +158,7 @@ class SparkHelperTest val keyValueFolder = s"$resourceFolder/key_value_storage" - // 1: Let's strore key values per file: + // 1: Let's store key values per file: HdfsHelper.deleteFolder(keyValueFolder) @@ -189,9 +180,9 @@ class SparkHelperTest assert(HdfsHelper.folderExists(keyValueFolder)) // And it contains one file per key: - var genratedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder) + var generatedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder) var expectedKeyFiles = List("_SUCCESS", "key_1", "key_2", "key_3") - assert(genratedKeyFiles === expectedKeyFiles) + assert(generatedKeyFiles === expectedKeyFiles) var valuesForKey1 = sc.textFile(s"$keyValueFolder/key_1").collect().sorted assert(valuesForKey1 === Array("value_a", "value_b")) @@ -202,7 +193,7 @@ class SparkHelperTest val valuesForKey3 = sc.textFile(s"$keyValueFolder/key_3").collect().sorted assert(valuesForKey3 === Array("value_a", "value_b")) - // 2: Let's strore key values per file; but without providing the nbr of + // 2: Let's store key values per file; but without providing the nbr of // keys: HdfsHelper.deleteFolder(keyValueFolder) @@ -213,14 +204,14 @@ class SparkHelperTest assert(HdfsHelper.folderExists(keyValueFolder)) // And it contains one file per key: - genratedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder) + generatedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder) expectedKeyFiles = List("_SUCCESS", "key_1", "key_2", "key_3") - assert(genratedKeyFiles === expectedKeyFiles) + assert(generatedKeyFiles === expectedKeyFiles) valuesForKey1 = sc.textFile(s"$keyValueFolder/key_1").collect().sorted assert(valuesForKey1 === Array("value_a", "value_b")) - // 3: Let's strore key values per file and compress these files: + // 3: Let's store key values per file and compress these files: HdfsHelper.deleteFolder(keyValueFolder) @@ -230,9 +221,9 @@ class SparkHelperTest assert(HdfsHelper.folderExists(keyValueFolder)) // And it contains one file per key: - genratedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder) + generatedKeyFiles = HdfsHelper.listFileNamesInFolder(keyValueFolder) expectedKeyFiles = List("_SUCCESS", "key_1.gz", "key_2.gz", "key_3.gz") - assert(genratedKeyFiles === expectedKeyFiles) + assert(generatedKeyFiles === expectedKeyFiles) valuesForKey1 = sc.textFile(s"$keyValueFolder/key_1.gz").collect().sorted assert(valuesForKey1 === Array("value_a", "value_b")) @@ -254,9 +245,9 @@ class SparkHelperTest rddToStore.saveAsTextFileAndCoalesce(testFolder, 2) // Let's check the nbr of partitions: - var genratedKeyFiles = HdfsHelper.listFileNamesInFolder(testFolder) + var generatedKeyFiles = HdfsHelper.listFileNamesInFolder(testFolder) var expectedKeyFiles = List("_SUCCESS", "part-00000", "part-00001") - assert(genratedKeyFiles === expectedKeyFiles) + assert(generatedKeyFiles === expectedKeyFiles) // And let's check the content: var singleFileStoredData = sc.textFile(testFolder).collect().sorted @@ -269,9 +260,9 @@ class SparkHelperTest rddToStore.saveAsTextFileAndCoalesce(testFolder, 2, classOf[GzipCodec]) // Let's check the nbr of partitions: - genratedKeyFiles = HdfsHelper.listFileNamesInFolder(testFolder) + generatedKeyFiles = HdfsHelper.listFileNamesInFolder(testFolder) expectedKeyFiles = List("_SUCCESS", "part-00000.gz", "part-00001.gz") - assert(genratedKeyFiles === expectedKeyFiles) + assert(generatedKeyFiles === expectedKeyFiles) // And let's check the content: 
singleFileStoredData = sc.textFile(testFolder).collect().sorted @@ -350,7 +341,7 @@ class SparkHelperTest .map { case (filePath, line) => val nonLocalPath = filePath.split("src/test/") match { - case Array(localPartOfPath, projectRelativePath) => + case Array(_, projectRelativePath) => "file:/.../src/test/" + projectRelativePath } (nonLocalPath, line) @@ -411,7 +402,7 @@ class SparkHelperTest val in = sc.parallelize(Array(1, 3, 2, 7, 8)) val computedOut = in.partialMap { case a if a % 2 == 0 => 2 * a } - val expetcedOut = sc.parallelize(Array(1, 3, 4, 7, 16)) - assertRDDEquals(computedOut, expetcedOut) + val expectedOut = sc.parallelize(Array(1, 3, 4, 7, 16)) + assertRDDEquals(computedOut, expectedOut) } } From 0440f99a9cb9b2454cdca2b69cdf15fa8205adda Mon Sep 17 00:00:00 2001 From: Xavier GUIHOT Date: Sun, 17 Jun 2018 13:14:22 +0100 Subject: [PATCH 24/25] Update doc --- README.md | 27 +- .../DateHelper$$IntExtensions.html | 599 ++++++++++++++++++ .../DateHelper$$StringExtensions.html | 595 +++++++++++++++++ docs/com/spark_helper/DateHelper$.html | 335 ++++++++-- .../HdfsHelper$$SeqExtensions.html | 539 ++++++++++++++++ .../HdfsHelper$$StringExtensions.html | 536 ++++++++++++++++ docs/com/spark_helper/HdfsHelper$.html | 152 ++++- docs/com/spark_helper/Monitor$.html | 56 +- .../SparkHelper$$OptionRDDExtensions.html | 540 ++++++++++++++++ .../SparkHelper$$PairRDDExtensions.html | 376 +++++++++++ .../SparkHelper$$RDDExtensions.html | 547 ++++++++++++++++ .../SparkHelper$$SeqRDDExtensions.html | 540 ++++++++++++++++ .../SparkHelper$$SparkContextExtensions.html | 476 ++++++++++++++ .../SparkHelper$$StringRDDExtensions.html | 422 ++++++++++++ docs/com/spark_helper/SparkHelper$.html | 481 ++++---------- docs/com/spark_helper/monitoring/Test.html | 4 +- .../monitoring/ThresholdType.html | 2 +- docs/com/spark_helper/monitoring/package.html | 4 +- docs/com/spark_helper/package.html | 150 +++-- docs/index.html | 14 +- docs/index.js | 2 +- docs/index/index-a.html | 3 + docs/index/index-d.html | 11 +- docs/index/index-f.html | 3 + docs/index/index-i.html | 9 + docs/index/index-n.html | 5 +- docs/index/index-o.html | 20 + docs/index/index-p.html | 8 +- docs/index/index-r.html | 6 + docs/index/index-s.html | 60 +- docs/index/index-t.html | 12 +- docs/index/index-w.html | 3 + docs/org/apache/package.html | 118 ++++ docs/org/apache/spark/TextFileOverwrite$.html | 499 +++++++++++++++ docs/org/apache/spark/package.html | 246 +++++++ docs/org/package.html | 118 ++++ docs/package.html | 17 + .../scala/com/spark_helper/DateHelper.scala | 33 +- .../scala/com/spark_helper/HdfsHelper.scala | 64 +- src/main/scala/com/spark_helper/Monitor.scala | 3 +- .../scala/com/spark_helper/SparkHelper.scala | 40 +- 41 files changed, 7077 insertions(+), 598 deletions(-) create mode 100644 docs/com/spark_helper/DateHelper$$IntExtensions.html create mode 100644 docs/com/spark_helper/DateHelper$$StringExtensions.html create mode 100644 docs/com/spark_helper/HdfsHelper$$SeqExtensions.html create mode 100644 docs/com/spark_helper/HdfsHelper$$StringExtensions.html create mode 100644 docs/com/spark_helper/SparkHelper$$OptionRDDExtensions.html create mode 100644 docs/com/spark_helper/SparkHelper$$PairRDDExtensions.html create mode 100644 docs/com/spark_helper/SparkHelper$$RDDExtensions.html create mode 100644 docs/com/spark_helper/SparkHelper$$SeqRDDExtensions.html create mode 100644 docs/com/spark_helper/SparkHelper$$SparkContextExtensions.html create mode 100644 docs/com/spark_helper/SparkHelper$$StringRDDExtensions.html create mode 
100644 docs/index/index-o.html create mode 100644 docs/org/apache/package.html create mode 100644 docs/org/apache/spark/TextFileOverwrite$.html create mode 100644 docs/org/apache/spark/package.html create mode 100644 docs/org/package.html diff --git a/README.md b/README.md index 26818eb..50eb2bd 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ The full list of methods is available at Contains basic file-related methods mostly based on hdfs apache Hadoop FileSystem API [org.apache.hadoop.fs.FileSystem](https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileSystem.html). -A non-exhaustive list of exemples: +A non-exhaustive list of examples: ```scala import com.spark_helper.HdfsHelper @@ -58,7 +58,7 @@ HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1") HdfsHelper.isHdfsXmlCompliantWithXsd("my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd")) HdfsHelper.loadXmlFileFromHdfs("my/hdfs/file/path.xml") -// Very handy to load a config (typesafe format) stored on hdfs at the begining of a spark job: +// Very handy to load a config (typesafe format) stored on hdfs at the beginning of a spark job: HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config // In order to write small amount of data in a file on hdfs without the whole spark stack: @@ -89,13 +89,13 @@ The full list of methods is available at Contains basic RRD-related methods. -A non-exhaustive list of exemples: +A non-exhaustive list of examples: ```scala import com.spark_helper.SparkHelper._ // Same as rdd.saveAsTextFile("path"), but the result is a single file (while -// keeping the processing parallelized): +// keeping the processing distributed): rdd.saveAsSingleTextFile("/my/output/file/path.txt") rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec]) @@ -118,7 +118,7 @@ sc.textFileWithFileName("/my/input/folder/path") // RDD. Within each file (named from the key) are all values for this key: rdd.saveAsTextFileByKey("/my/output/folder/path") -// Concept mapper (the following exemple transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)): +// Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)): rdd.partialMap { case a if a % 2 == 0 => 2 * a } // For when input files contain commas and textFile can't handle it: @@ -133,7 +133,7 @@ The full list of methods is available at Wrapper around [joda-time](http://www.joda.org/joda-time/apidocs/) for data-mining classic dates manipulations and job scheduling. -A non-exhaustive list of exemples: +A non-exhaustive list of examples: ```scala import com.spark_helper.DateHelper @@ -246,7 +246,7 @@ if (Monitor.isSuccess) { // HDFS (this saves the logs in the folder set with Monitor.setLogFolder): Monitor.store() -// At the end of the job, if the job isn't successfull, you might want to +// At the end of the job, if the job isn't successful, you might want to // crash it (for instance to get a notification from your scheduler): if (!Monitor.isSuccess) throw new Exception() // or send an email, or ... ``` @@ -261,7 +261,7 @@ Here are some possible reports generated by the previous pipeline: My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining +[10:23] Beginning [10:23-10:23] My pipeline descirption: failed Diagnostic: No input data! 
org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path @@ -278,7 +278,7 @@ or My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining +[10:23] Beginning [10:23-10:41] My pipeline descirption: success KPI: Nbr of output records Value: 14669071.0 @@ -295,7 +295,7 @@ Documentation: https://github.com/xavierguihot/spark_helper ## Including spark_helper to your dependencies: -With sbt, add these lines to your build.sbt: +With sbt: ```scala resolvers += "jitpack" at "https://jitpack.io" @@ -303,7 +303,7 @@ resolvers += "jitpack" at "https://jitpack.io" libraryDependencies += "com.github.xavierguihot" % "spark_helper" % "v1.1.1" ``` -With maven, add these lines to your pom.xml: +With maven: ```xml @@ -320,7 +320,7 @@ With maven, add these lines to your pom.xml: ``` -With gradle, add these lines to your build.gradle: +With gradle: ```groovy allprojects { @@ -333,3 +333,6 @@ dependencies { compile 'com.github.xavierguihot:spark_helper:v1.1.1' } ``` + +For versions anterior to `2.0.0`, use prefix `v` in the version tag; for +instance `v1.0.0` diff --git a/docs/com/spark_helper/DateHelper$$IntExtensions.html b/docs/com/spark_helper/DateHelper$$IntExtensions.html new file mode 100644 index 0000000..9b5203c --- /dev/null +++ b/docs/com/spark_helper/DateHelper$$IntExtensions.html @@ -0,0 +1,599 @@ + + + + IntExtensions - com.spark_helper.DateHelper.IntExtensions + + + + + + + + + + + + + + + +
+ Class +

com.spark_helper.DateHelper

+

IntExtensions

Related Doc: + package DateHelper +

+ + Permalink + + +
+ +

+ + implicit + class + + + IntExtensions extends AnyRef + +

+ +
+ Linear Supertypes +
AnyRef, Any
+
+ + +
+
+
+ Ordering +
    + +
  1. Alphabetic
  2. +
  3. By Inheritance
  4. +
+
+
+ Inherited
+
+
    +
  1. IntExtensions
  2. AnyRef
  3. Any
  4. +
+
+ +
    +
  1. Hide All
  2. +
  3. Show All
  4. +
+
+
+ Visibility +
  1. Public
  2. All
+
+
+ +
+
+
+

Instance Constructors

+
  1. + + +

    + + + new + + + IntExtensions(int: Int) + +

    + + Permalink + + + +
+
+ + + + + +
+

Value Members

+
  1. + + +

    + + final + def + + + !=(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  2. + + +

    + + final + def + + + ##(): Int + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  3. + + +

    + + final + def + + + ==(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  4. + + +

    + + final + def + + + asInstanceOf[T0]: T0 + +

    + + Permalink + + +
    Definition Classes
    Any
    +
  5. + + +

    + + + def + + + clone(): AnyRef + +

    + + Permalink + + +
    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  6. + + +

    + + + def + + + daysAfter(date: String): String + +

    + + Permalink + + +

    Returns which date it will be x days after the given date under the +default format.

    Returns which date it will be x days after the given date under the +default format.

    If the given date is "20170122" and we request the date it will be 3 +days after, we'll return "20170125".

    assert(5.daysAfter("20170305") == "20170310")
    date

    the date under the default format for which we want the date +for nbrOfDaysAfter days after.

    returns

    the date it was nbrOfDaysAfter after date under the default +format.

    +
  7. + + +

    + + + def + + + daysAfter: String + +

    + + Permalink + + +

    Returns which date it will be x days after today under the default format.

    Returns which date it will be x days after today under the default format.

    If we're "20170125" and we request for 3 days after, we'll return +"20170127".

    // If today's "20170310":
    +assert(3.daysAfter == "20170313")
    returns

    today's date plus the given nbr of days

    +
  8. + + +

    + + + def + + + daysAgo: String + +

    + + Permalink + + +

    Returns which date it was x days before today under the default format.

    Returns which date it was x days before today under the default format.

    If we're "20170125" and we request for 3 days before, we'll return +"20170122".

    // If today's "20170310":
    +assert(3.daysAgo == "20170307")
    returns

    today's date minus the given nbr of days

    +
  9. + + +

    + + + def + + + daysBefore(date: String): String + +

    + + Permalink + + +

    Returns which date it was x days before the given date.

    Returns which date it was x days before the given date.

    If the given date is "20170125" and we request the date it was 3 days +before, this will return "20170122".

    assert(3.daysBefore("20170310") == "20170307")
    date

    the date under the default format for which we want the date +for nbrOfDaysBefore days before.

    returns

    the date it was nbrOfDaysBefore before date under the default +format.
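Taken together, the daysAfter / daysAgo / daysBefore extensions above cover most relative-date needs. As a minimal sketch (not from this patch; today is assumed to be "20170310" and the window size is arbitrary), they combine with DateHelper.daysBetween like this:

```scala
import com.spark_helper.DateHelper
import com.spark_helper.DateHelper._

// Assuming today is "20170310":
val windowStart = 7.daysAgo   // "20170303"
val windowEnd   = 1.daysAfter // "20170311"

// Every date of the window, in the default yyyyMMdd format:
val datesToProcess: List[String] = DateHelper.daysBetween(windowStart, windowEnd)
```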

    +
  10. + + +

    + + final + def + + + eq(arg0: AnyRef): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  11. + + +

    + + + def + + + equals(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  12. + + +

    + + + def + + + finalize(): Unit + +

    + + Permalink + + +
    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + classOf[java.lang.Throwable] + ) + +
    +
  13. + + +

    + + final + def + + + getClass(): Class[_] + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  14. + + +

    + + + def + + + hashCode(): Int + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  15. + + +

    + + + val + + + int: Int + +

    + + Permalink + + + +
  16. + + +

    + + final + def + + + isInstanceOf[T0]: Boolean + +

    + + Permalink + + +
    Definition Classes
    Any
    +
  17. + + +

    + + final + def + + + ne(arg0: AnyRef): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  18. + + +

    + + final + def + + + notify(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  19. + + +

    + + final + def + + + notifyAll(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  20. + + +

    + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  21. + + +

    + + + def + + + toString(): String + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  22. + + +

    + + final + def + + + wait(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  23. + + +

    + + final + def + + + wait(arg0: Long, arg1: Int): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  24. + + +

    + + final + def + + + wait(arg0: Long): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
+
+ + + + +
+ +
+
+

Inherited from AnyRef

+
+

Inherited from Any

+
+ +
+ +
+
+

Ungrouped

+ +
+
+ +
+ +
+ + + + + + diff --git a/docs/com/spark_helper/DateHelper$$StringExtensions.html b/docs/com/spark_helper/DateHelper$$StringExtensions.html new file mode 100644 index 0000000..cf0c71c --- /dev/null +++ b/docs/com/spark_helper/DateHelper$$StringExtensions.html @@ -0,0 +1,595 @@ + + + + StringExtensions - com.spark_helper.DateHelper.StringExtensions + + + + + + + + + + + + + + + +
+ Class +

com.spark_helper.DateHelper

+

StringExtensions

Related Doc: + package DateHelper +

+ + Permalink + + +
+ +

+ + implicit + class + + + StringExtensions extends AnyRef + +

+ +
+ Linear Supertypes +
AnyRef, Any
+
+ + +
+
+
+ Ordering +
    + +
  1. Alphabetic
  2. +
  3. By Inheritance
  4. +
+
+
+ Inherited
+
+
    +
  1. StringExtensions
  2. AnyRef
  3. Any
  4. +
+
+ +
    +
  1. Hide All
  2. +
  3. Show All
  4. +
+
+
+ Visibility +
  1. Public
  2. All
+
+
+ +
+
+
+

Instance Constructors

+
  1. + + +

    + + + new + + + StringExtensions(string: String) + +

    + + Permalink + + + +
+
+ + + + + +
+

Value Members

+
  1. + + +

    + + final + def + + + !=(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  2. + + +

    + + final + def + + + ##(): Int + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  3. + + +

    + + final + def + + + ==(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  4. + + +

    + + final + def + + + asInstanceOf[T0]: T0 + +

    + + Permalink + + +
    Definition Classes
    Any
    +
  5. + + +

    + + + def + + + clone(): AnyRef + +

    + + Permalink + + +
    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  6. + + +

    + + final + def + + + eq(arg0: AnyRef): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  7. + + +

    + + + def + + + equals(arg0: Any): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  8. + + +

    + + + def + + + finalize(): Unit + +

    + + Permalink + + +
    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + classOf[java.lang.Throwable] + ) + +
    +
  9. + + +

    + + final + def + + + getClass(): Class[_] + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  10. + + +

    + + + def + + + hashCode(): Int + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  11. + + +

    + + + def + + + isCompliantWith(format: String): Boolean + +

    + + Permalink + + +

    Validates the formatted date is compliant with the provided format.

    Validates the formatted date is compliant with the provided format.

    assert("20170302".isCompliantWith("yyyyMMdd"))
    +assert(!"20170333".isCompliantWith("yyyyMMdd"))
    +assert("20170228".isCompliantWith("yyyyMMdd"))
    +assert(!"20170229".isCompliantWith("yyyyMMdd"))
    +assert(!"170228".isCompliantWith("yyyyMMdd"))
    +assert(!"".isCompliantWith("yyyyMMdd"))
    +assert(!"a".isCompliantWith("yyyyMMdd"))
    +assert(!"24JAN17".isCompliantWith("yyyyMMdd"))
    returns

    if the provided date is under the provided format

    +
  12. + + +

    + + final + def + + + isInstanceOf[T0]: Boolean + +

    + + Permalink + + +
    Definition Classes
    Any
    +
  13. + + +

    + + final + def + + + ne(arg0: AnyRef): Boolean + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  14. + + +

    + + + def + + + nextDay: String + +

    + + Permalink + + +

    Returns the date one day after the given date.

    Returns the date one day after the given date.

    assert("20170310".nextDay == "20170311")
    returns

    the date of the day after the given date

    +
  15. + + +

    + + final + def + + + notify(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  16. + + +

    + + final + def + + + notifyAll(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  17. + + +

    + + + def + + + previousDay: String + +

    + + Permalink + + +

    Returns the date one day before the given date.

    Returns the date one day before the given date.

    assert("20170310".previousDay == "20170309")
    returns

    the date of the day before the given date

    +
  18. + + +

    + + + val + + + string: String + +

    + + Permalink + + + +
  19. + + +

    + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    +
  20. + + +

    + + + def + + + to(lastDate: String): List[String] + +

    + + Permalink + + +

    Creates the list of dates between the two given dates.

    Creates the list of dates between the two given dates.

    assert(("20161230" to "20170101") == List("20161230", "20161231", "20170101"))
    lastDate

    the last date

    returns

    the list of dates between this string and the lastDate in the +default format.
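As a small illustration (the date value is an assumption, e.g. a job argument), the String extensions documented above chain naturally when validating a partition date and expanding it to its neighbours:

```scala
import com.spark_helper.DateHelper._

val ingestionDate = "20170310" // assumed input
require(ingestionDate.isCompliantWith("yyyyMMdd"))

// The day before, the day itself and the day after:
val datesToRead = ingestionDate.previousDay to ingestionDate.nextDay
// List("20170309", "20170310", "20170311")
```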

    +
  21. + + +

    + + + def + + + toString(): String + +

    + + Permalink + + +
    Definition Classes
    AnyRef → Any
    +
  22. + + +

    + + final + def + + + wait(): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  23. + + +

    + + final + def + + + wait(arg0: Long, arg1: Int): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
  24. + + +

    + + final + def + + + wait(arg0: Long): Unit + +

    + + Permalink + + +
    Definition Classes
    AnyRef
    Annotations
    + @throws( + + ... + ) + +
    +
+
+ + + + +
+ +
+
+

Inherited from AnyRef

+
+

Inherited from Any

+
+ +
+ +
+
+

Ungrouped

+ +
+
+ +
+ +
+ + + + + + diff --git a/docs/com/spark_helper/DateHelper$.html b/docs/com/spark_helper/DateHelper$.html index 77a52dd..749074c 100644 --- a/docs/com/spark_helper/DateHelper$.html +++ b/docs/com/spark_helper/DateHelper$.html @@ -52,14 +52,31 @@

A facility which deals with usual date needs (wrapper around joda-time).

The goal is to remove the maximum of highly used low-level code from your spark job and replace it with methods fully tested whose name is -self-explanatory/readable.

A few examples:

assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
-assert(DateHelper.today() == "20170310") // If today's "20170310"
-assert(DateHelper.yesterday() == "20170309") // If today's "20170310"
-assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327")
-assert(DateHelper.now("HH:mm") == "10:24")
-assert(DateHelper.currentTimestamp() == "1493105229736")
-assert(DateHelper.nDaysBefore(3) == "20170307") // If today's "20170310"
-assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310")

Source import com.spark_helper.DateHelper + +DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101") +DateHelper.today // "20170310" +DateHelper.yesterday // "20170309" +DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327" +DateHelper.now("HH:mm") // "10:24" +DateHelper.currentTimestamp // "1493105229736" +DateHelper.nDaysBefore(3) // "20170307" +DateHelper.nDaysAfterDate(3, "20170307") // "20170310" +DateHelper.nextDay("20170310") // "20170311" +DateHelper.nbrOfDaysSince("20170302") // 8 +DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5 +DateHelper.dayOfWeek("20160614") // 2 + +import com.spark_helper.DateHelper._ + +2.daysAgo // "20170308" +"20161230" to "20170101" // List("20161230", "20161231", "20170101") +3.daysBefore("20170310") // "20170307" +5.daysAfter // "20170315" +4.daysAfter("20170310") // "20170314" +"20170302".isCompliantWith("yyyyMMdd") +"20170310".nextDay // "20170311" +"20170310".previousDay // "20170309"

Source DateHelper

Since

2017-02

Linear Supertypes @@ -100,7 +117,44 @@

- +
+

Type Members

+
  1. + + +

    + + implicit + class + + + IntExtensions extends AnyRef + +

    + + Permalink + + + +
  2. + + +

    + + implicit + class + + + StringExtensions extends AnyRef + +

    + + Permalink + + + +
+
@@ -198,18 +252,18 @@

  • - - + +

    def - currentTimestamp(): String + currentTimestamp: String

    - + Permalink @@ -241,7 +295,7 @@

    def - dateFromTimestamp(timestamp: Long, format: String = "yyyyMMdd"): String + dateFromTimestamp(timestamp: Long, format: String = defaultFormat): String

    @@ -250,7 +304,7 @@

    Returns the date associated to the given UTC timestamp.

    Returns the date associated to the given UTC timestamp.

    assert(DateHelper.dateFromTimestamp(1496074819L) == "20170529")
     assert(DateHelper.dateFromTimestamp(1496074819L, "yyMMdd") == "170529")
    timestamp

    the UTC timestamps (nbr of millis since 1970-01-01) for -which to get the associated date.

    format

    (default = "yyyyMMdd") the format of the provided dates

    returns

    the associated date under the requested format

    +which to get the associated date.

    format

    the format of the provided dates

    returns

    the associated date under the requested format

  • @@ -260,15 +314,14 @@

    def - dayOfWeek(date: String, format: String = "yyyyMMdd"): Int + dayOfWeek(date: String, format: String = defaultFormat): Int

    Permalink -

    Returns the day of week for a date under the given format.

    Returns the day of week for a date under the given format.

    A Monday is 1 and a Sunday is 7.

    assert(DateHelper.dayOfWeek("20160614") == 2)
    date

    the date for which to get the day of week

    format

    (default = "yyyyMMdd") the format under which the date is -provided.

    returns

    the associated day of week, such as 2 for Tuesday

    +

    Returns the day of week for a date under the given format.

    Returns the day of week for a date under the given format.

    A Monday is 1 and a Sunday is 7.

    assert(DateHelper.dayOfWeek("20160614") == 2)
    date

    the date for which to get the day of week

    format

    the format under which the date is provided

    returns

    the associated day of week, such as 2 for Tuesday

  • @@ -278,15 +331,15 @@

    def - daysBetween(firstDate: String, lastDate: String, format: String = "yyyyMMdd"): List[String] + daysBetween(firstDate: String, lastDate: String, format: String = defaultFormat): List[String]

    Permalink -

    Finds the list of dates between the two given dates.

    Finds the list of dates between the two given dates.

    assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
    firstDate

    the first date (in the given format)

    lastDate

    the last date (in the given format)

    format

    (default = "yyyyMMdd") the format to use for firstDate and -lastDate and for the returned list of dates.

    returns

    the list of dates between firstDate and lastDate in the given +

    Finds the list of dates between the two given dates.

    Finds the list of dates between the two given dates.

    assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101"))
    firstDate

    the first date (in the given format)

    lastDate

    the last date (in the given format)

    format

    the format to use for firstDate and lastDate and for the +returned list of dates.

    returns

    the list of dates between firstDate and lastDate in the given format.

  • @@ -402,7 +455,7 @@

    assert(!DateHelper.isDateCompliantWithFormat("170228", "yyyyMMdd")) assert(!DateHelper.isDateCompliantWithFormat("", "yyyyMMdd")) assert(!DateHelper.isDateCompliantWithFormat("a", "yyyyMMdd")) -assert(!DateHelper.isDateCompliantWithFormat("24JAN17", "yyyyMMdd"))

  • stringValue

    the stringified date

    returns

    if the provided date is under the provided format

    +assert(!DateHelper.isDateCompliantWithFormat("24JAN17", "yyyyMMdd"))
    stringValue

    the formatted date

    returns

    if the provided date is under the provided format

  • @@ -440,6 +493,64 @@

    returned list of dates and thus prefer getting a list of Joda DateTime objects instead of String dates.

    jodaFirstDate

    the joda DateTime first date

    jodaLastDate

    the joda DateTime last date

    returns

    the list of joda DateTime between jodaFirstDate and jodaLastDate

    +

  • + + +

    + + + def + + + nDaysAfter(nbrOfDaysAfter: Int): String + +

    + + Permalink + + +

    Returns which date it will be x days after today under the default format.

    Returns which date it will be x days after today under the default format.

    // If today's "20170310":
    +assert(DateHelper.nDaysAfter(5) == "20170315")
    nbrOfDaysAfter

    the nbr of days after today

    returns

    today's date plus the nbrOfDaysAfter under the default format

    +
  • + + +

    + + + def + + + nDaysAfter(nbrOfDaysAfter: Int, format: String): String + +

    + + Permalink + + +

    Returns which date it will be x days after today under the requested format.

    Returns which date it will be x days after today under the requested format.

    // If today's "20170310":
    +assert(DateHelper.nDaysAfter(5, "yyMMdd") == "170315")
    nbrOfDaysAfter

    the nbr of days after today

    format

    the format for the returned date

    returns

    today's date plus the nbrOfDaysAfter under the requested format

    +
  • + + +

    + + + def + + + nDaysAfterDate(nbrOfDaysAfter: Int, date: String): String + +

    + + Permalink + + +

    Returns which date it will be x days after the given date under the +default format.

    Returns which date it will be x days after the given date under the +default format.

    If the given date is "20170122" and we request the date it will be 3 days +after, we'll return "20170125".

    assert(DateHelper.nDaysAfterDate(5, "20170305") == "20170310")
    nbrOfDaysAfter

    the nbr of days after the given date

    date

    the date under the default format for which we want the date +for nbrOfDaysAfter days after.

    returns

    the date it was nbrOfDaysAfter after date under the default +format.

  • @@ -449,7 +560,7 @@

    def - nDaysAfterDate(nbrOfDaysAfter: Int, date: String, format: String = "yyyyMMdd"): String + nDaysAfterDate(nbrOfDaysAfter: Int, date: String, format: String): String

    @@ -457,11 +568,27 @@

    Returns which date it will be x days after the given date.

    Returns which date it will be x days after the given date.

    If the given date is "20170122" and we request the date it will be 3 days -after, we'll return "20170125".

    assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310")
    -assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") == "170310")
    nbrOfDaysAfter

    the nbr of days after the given date

    date

    the date under the provided format for which we want the date -for nbrOfDaysAfter days after.

    format

    (default = "yyyyMMdd") the format for the provided and -returned dates.

    returns

    the date it was nbrOfDaysAfter after date under the requested +after, we'll return "20170125".

    assert(DateHelper.nDaysAfterDate(5, "170305", "yyMMdd") == "170310")
    nbrOfDaysAfter

    the nbr of days after the given date

    date

    the date under the provided format for which we want the date +for nbrOfDaysAfter days after.

    format

    the format for the provided and returned dates.

    returns

    the date it was nbrOfDaysAfter after date under the requested format.

    +

  • + + +

    + + + def + + + nDaysBefore(nbrOfDaysBefore: Int): String + +

    + + Permalink + + +

    Returns which date it was x days before today.

    Returns which date it was x days before today.

    // If today's "20170310":
    +assert(DateHelper.nDaysBefore(5) == "20170305")
    nbrOfDaysBefore

    the nbr of days before today

    returns

    today's date minus the nbrOfDaysBefore under the default format

  • @@ -471,17 +598,35 @@

    def - nDaysBefore(nbrOfDaysBefore: Int, format: String = "yyyyMMdd"): String + nDaysBefore(nbrOfDaysBefore: Int, format: String): String

    Permalink -

    Returns which date it was x days before today under the requested format.

    Returns which date it was x days before today under the requested format.

    If we're "20170125" and we request for 3 days before, we'll return -"20170122".

    // If today's "20170310":
    -assert(DateHelper.nDaysBefore(3) == "20170307")
    -assert(DateHelper.nDaysBefore(5, "yyMMdd") == "170305")
    nbrOfDaysBefore

    the nbr of days before today

    format

    (default = "yyyyMMdd") the format for the returned date

    returns

    today's date minus the nbrOfDaysBefore under the requested format

    +

    Returns which date it was x days before today under the requested format.

    Returns which date it was x days before today under the requested format.

    // If today's "20170310":
    +assert(DateHelper.nDaysBefore(5, "yyMMdd") == "170305")
    nbrOfDaysBefore

    the nbr of days before today

    format

    the format for the returned date

    returns

    today's date minus the nbrOfDaysBefore under the requested format

    +
  • + + +

    + + + def + + + nDaysBeforeDate(nbrOfDaysBefore: Int, date: String): String + +

    + + Permalink + + +

    Returns which date it was x days before the given date.

    Returns which date it was x days before the given date.

    If the given date is "20170125" and we request the date it was 3 days +before, this will return "20170122".

    assert(DateHelper.nDaysBeforeDate(5, "20170310") == "20170305")
    nbrOfDaysBefore

    the nbr of days before the given date

    date

    the date under the default format for which we want the date +for nbrOfDaysBefore days before.

    returns

    the date it was nbrOfDaysBefore before date under the default +format.

  • @@ -491,7 +636,7 @@

    def - nDaysBeforeDate(nbrOfDaysBefore: Int, date: String, format: String = "yyyyMMdd"): String + nDaysBeforeDate(nbrOfDaysBefore: Int, date: String, format: String): String

    @@ -499,10 +644,8 @@

    Returns which date it was x days before the given date.

    Returns which date it was x days before the given date.

    If the given date is "20170125" and we request the date it was 3 days -before, we'll return "20170122".

    assert(DateHelper.nDaysBeforeDate(3, "20170310") == "20170307")
    -assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") == "170305")
    nbrOfDaysBefore

    the nbr of days before the given date

    date

    the date under the provided format for which we want the date -for nbrOfDaysBefore days before.

    format

    (default = "yyyyMMdd") the format for the provided and -returned dates.

    returns

    the date it was nbrOfDaysBefore before date under the requested +before, this will return "20170122".

    assert(DateHelper.nDaysBeforeDate(5, "170310", "yyMMdd") == "170305")
    nbrOfDaysBefore

    the nbr of days before the given date

    date

    the date under the provided format for which we want the date +for nbrOfDaysBefore days before.

    format

    the format for the provided and returned dates.

    returns

    the date it was nbrOfDaysBefore before date under the requested format.

  • @@ -513,7 +656,7 @@

    def - nbrOfDaysBetween(firstDate: String, lastDate: String, format: String = "yyyyMMdd"): Int + nbrOfDaysBetween(firstDate: String, lastDate: String, format: String = defaultFormat): Int

    @@ -524,7 +667,7 @@

    assert(DateHelper.nbrOfDaysBetween("20170327", "20170401") == 5)

    This expects the first date to be before the last date.

    firstDate

    the first date of the range for which to egt the nbr of days.

    lastDate

    the last date of the range for which to egt the nbr of -days.

    format

    (default = "yyyyMMdd") the format of the provided dates

    returns

    the nbr of days between the two given dates

    +days.

    format

    the format of the provided dates

    returns

    the nbr of days between the two given dates

  • @@ -534,7 +677,7 @@

    def - nbrOfDaysSince(date: String, format: String = "yyyyMMdd"): Int + nbrOfDaysSince(date: String, format: String = defaultFormat): Int

    @@ -543,7 +686,7 @@

    Returns the nbr of days between today and the given date.

    Returns the nbr of days between today and the given date.

    // If today is "20170327":
     assert(DateHelper.nbrOfDaysSince("20170310") == 17)
    -assert(DateHelper.nbrOfDaysSince("170310", "yyMMdd") == 17)
    date

    the date for which to find the nbr of days of diff with today

    format

    (default = "yyyyMMdd") the format of the provided date

    returns

    the nbr of days between today and the given date

    +assert(DateHelper.nbrOfDaysSince("170310", "yyMMdd") == 17)
    date

    the date for which to find the nbr of days of diff with today

    format

    the format of the provided date

    returns

    the nbr of days between today and the given date

  • @@ -570,17 +713,15 @@

    def - nextDay(date: String, format: String = "yyyyMMdd"): String + nextDay(date: String, format: String = defaultFormat): String

    Permalink -

    Returns for a date the date one day latter.

    Returns for a date the date one day latter.

    // If the given date is "20170310":
    -assert(DateHelper.nextDay("20170310") == "20170311")
    -assert(DateHelper.nextDay("170310", "yyMMdd") == "170311")
    date

    the date for which to find the date of the day after

    format

    (default = "yyyyMMdd") the format of the provided and the -returned dates.

    returns

    the date of the day after the given date

    +

    Returns for a date the date one day later.

    Returns for a date the date one day later.

    assert(DateHelper.nextDay("20170310") == "20170311")
    +assert(DateHelper.nextDay("170310", "yyMMdd") == "170311")
    date

    the date for which to find the date of the day after

    format

    the format of the provided and the returned dates

    returns

    the date of the day after the given date

  • @@ -643,17 +784,15 @@

    def - previousDay(date: String, format: String = "yyyyMMdd"): String + previousDay(date: String, format: String = defaultFormat): String

    Permalink -

    Returns for a date the date one day before.

    Returns for a date the date one day before.

    // If the given date is "20170310":
    -assert(DateHelper.previousDay("20170310") == "20170309")
    -assert(DateHelper.previousDay("170310", "yyMMdd") == "170309")
    date

    the date for which to find the date of the day before

    format

    (default = "yyyyMMdd") the format of the provided and the -returned dates.

    returns

    the date of the day before the given date

    +

    Returns for a date the date one day before.

    Returns for a date the date one day before.

    assert(DateHelper.previousDay("20170310") == "20170309")
    +assert(DateHelper.previousDay("170310", "yyMMdd") == "170309")
    date

    the date for which to find the date of the day before

    format

    the format of the provided and the returned dates

    returns

    the date of the day before the given date

  • @@ -671,6 +810,29 @@

    Reformats a date from one format to another.

    Reformats a date from one format to another.

    assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327")
    date

    the date to reformat

    inputFormat

    the format in which the date to reformat is provided

    outputFormat

    the format in which to format the provided date

    returns

    the date under the new format

    +

  • + + +

    + + + def + + + setFormat(format: String): Unit + +

    + + Permalink + + +

    Sets the default date format used by these functions when no date format +is specified.

    Sets the default date format used by these functions when no date format +is specified.

    // By default, yyyyMMdd is used:
    +assert(3.daysBefore == "20170307")
    +// But this can be modified globally:
    +DateHelper.setFormat("ddMMMyy")
    +assert(3.daysBefore == "07Mar17")
    format

    the new default format
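A sketch of how this setter is typically meant to be used (the chosen format is an assumption; the behaviour of explicit-format calls is inferred from the signatures above): pick the project-wide format once at the start of the job, after which the format-less variants use it.

```scala
import com.spark_helper.DateHelper

DateHelper.setFormat("yyyy-MM-dd") // assumed project-wide choice

// Format-less calls now use it (values assume today is 2017-03-10):
DateHelper.today                                   // "2017-03-10"
DateHelper.daysBetween("2017-03-08", "2017-03-10") // List of 3 dates

// Calls given an explicit format presumably keep honouring it:
DateHelper.today("yyyyMMdd")                       // "20170310"
```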

  • @@ -705,6 +867,24 @@

    Definition Classes
    AnyRef → Any
    +

  • + + +

    + + + def + + + today: String + +

    + + Permalink + + +

    Returns today's date/time under the default format.

    Returns today's date/time under the default format.

    // If today's "20170310":
    +assert(DateHelper.today() == "20170310")
    returns

    today's date under the default format

  • @@ -714,7 +894,7 @@

    def - today(format: String = "yyyyMMdd"): String + today(format: String): String

    @@ -722,8 +902,25 @@

    Returns today's date/time under the requested format.

    Returns today's date/time under the requested format.

    // If today's "20170310":
    -assert(DateHelper.today() == "20170310")
    -assert(DateHelper.today("yyMMdd") == "170310")
    format

    (default = "yyyyMMdd") the format for the current date

    returns

    today's date under the requested format

    +assert(DateHelper.today("yyMMdd") == "170310")
    format

    the format for the current date

    returns

    today's date under the requested format

    +

  • + + +

    + + + def + + + twoDaysAgo(): String + +

    + + Permalink + + +

    Returns which date it was 2 days before today under the default format.

    Returns which date it was 2 days before today under the default format.

    // If today's "20170310":
    +assert(DateHelper.twoDaysAgo() == "20170308")
    returns

    the date of two days ago under the default format

  • @@ -733,7 +930,7 @@

    def - twoDaysAgo(format: String = "yyyyMMdd"): String + twoDaysAgo(format: String): String

    @@ -741,9 +938,7 @@

    Returns which date it was 2 days before today under the requested format.

    Returns which date it was 2 days before today under the requested format.

    // If today's "20170310":
    -assert(DateHelper.twoDaysAgo() == "20170308")
    -assert(DateHelper.twoDaysAgo("yyMMdd") == "170308")
    format

    (default = "yyyyMMdd") the format in which to output the -date of two days ago.

    returns

    the date of two days ago under the requested format

    +assert(DateHelper.twoDaysAgo("yyMMdd") == "170308")
    format

    the format in which to output the date of two days ago

    returns

    the date of two days ago under the requested format

  • @@ -813,6 +1008,24 @@

    ) +

  • + + +

    + + + def + + + yesterday: String + +

    + + Permalink + + +

    Returns yesterday's date/time under the default format.

    Returns yesterday's date/time under the default format.

    // If today's "20170310":
    +assert(DateHelper.yesterday() == "20170309")
    returns

    yesterday's date under the default format

  • @@ -822,7 +1035,7 @@

    def - yesterday(format: String = "yyyyMMdd"): String + yesterday(format: String): String

    @@ -830,9 +1043,7 @@

    Returns yesterday's date/time under the requested format.

    Returns yesterday's date/time under the requested format.

    // If today's "20170310":
    -assert(DateHelper.yesterday() == "20170309")
    -assert(DateHelper.yesterday("yyMMdd") == "170309")
    format

    (default = "yyyyMMdd") the format in which to output the -date of yesterday.

    returns

    yesterday's date under the requested format

    +assert(DateHelper.yesterday("yyMMdd") == "170309")
    format

    the format in which to output the date of yesterday

    returns

    yesterday's date under the requested format

  • diff --git a/docs/com/spark_helper/HdfsHelper$$SeqExtensions.html b/docs/com/spark_helper/HdfsHelper$$SeqExtensions.html new file mode 100644 index 0000000..b234a92 --- /dev/null +++ b/docs/com/spark_helper/HdfsHelper$$SeqExtensions.html @@ -0,0 +1,539 @@ + + + + SeqExtensions - com.spark_helper.HdfsHelper.SeqExtensions + + + + + + + + + + + + + + + +
    + Class +

    com.spark_helper.HdfsHelper

    +

    SeqExtensions

    Related Doc: + package HdfsHelper +

    + + Permalink + + +
    + +

    + + implicit + class + + + SeqExtensions[T <: Seq[String]] extends AnyRef + +

    + +
    + Linear Supertypes +
    AnyRef, Any
    +
    + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. SeqExtensions
    2. AnyRef
    3. Any
    4. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    + +
    +
    +
    +

    Instance Constructors

    +
    1. + + +

      + + + new + + + SeqExtensions(seq: T)(implicit arg0: ClassTag[T]) + +

      + + Permalink + + + +
    +
    + + + + + +
    +

    Value Members

    +
    1. + + +

      + + final + def + + + !=(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + +

      + + final + def + + + ##(): Int + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + +

      + + final + def + + + ==(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + +

      + + final + def + + + asInstanceOf[T0]: T0 + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    5. + + +

      + + + def + + + clone(): AnyRef + +

      + + Permalink + + +
      Attributes
      protected[java.lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    6. + + +

      + + final + def + + + eq(arg0: AnyRef): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    7. + + +

      + + + def + + + equals(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    8. + + +

      + + + def + + + finalize(): Unit + +

      + + Permalink + + +
      Attributes
      protected[java.lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    9. + + +

      + + final + def + + + getClass(): Class[_] + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    10. + + +

      + + + def + + + hashCode(): Int + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    11. + + +

      + + final + def + + + isInstanceOf[T0]: Boolean + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    12. + + +

      + + final + def + + + ne(arg0: AnyRef): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    13. + + +

      + + final + def + + + notify(): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    14. + + +

      + + final + def + + + notifyAll(): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    15. + + +

      + + + val + + + seq: T + +

      + + Permalink + + + +
    16. + + +

      + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    17. + + +

      + + + def + + + toString(): String + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    18. + + +

      + + final + def + + + wait(): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    19. + + +

      + + final + def + + + wait(arg0: Long, arg1: Int): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    20. + + +

      + + final + def + + + wait(arg0: Long): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    21. + + +

      + + + def + + + writeToHdfs(filePath: String): Unit + +

      + + Permalink + + +

      Saves list elements in a file on hdfs.

      Saves list elements in a file on hdfs.

      Please only consider this way of storing data when the data set is small +enough.

      Overwrites the file if it already exists.

      Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
      +List("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
      filePath

      the path of the file in which to write the content of +the List.
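For instance, a possible (made-up) use of this extension is persisting a tiny driver-side summary next to the job output; the path and keys are assumptions:

```scala
import com.spark_helper.HdfsHelper._
import com.spark_helper.DateHelper

val summary = List(
  s"run_date=${DateHelper.today}",
  "status=success"
)

// Small data only: this is written from the driver, not through Spark.
summary.writeToHdfs("/my/job/output/_summary.txt")
```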

      +
    +
    + + + + +
    + +
    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + + + + diff --git a/docs/com/spark_helper/HdfsHelper$$StringExtensions.html b/docs/com/spark_helper/HdfsHelper$$StringExtensions.html new file mode 100644 index 0000000..ca7faf5 --- /dev/null +++ b/docs/com/spark_helper/HdfsHelper$$StringExtensions.html @@ -0,0 +1,536 @@ + + + + StringExtensions - com.spark_helper.HdfsHelper.StringExtensions + + + + + + + + + + + + + + + +
    + Class +

    com.spark_helper.HdfsHelper

    +

    StringExtensions

    Related Doc: + package HdfsHelper +

    + + Permalink + + +
    + +

    + + implicit + class + + + StringExtensions extends AnyRef + +

    + +
    + Linear Supertypes +
    AnyRef, Any
    +
    + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. StringExtensions
    2. AnyRef
    3. Any
    4. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    + +
    +
    +
    +

    Instance Constructors

    +
    1. + + +

      + + + new + + + StringExtensions(string: String) + +

      + + Permalink + + + +
    +
    + + + + + +
    +

    Value Members

    +
    1. + + +

      + + final + def + + + !=(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + +

      + + final + def + + + ##(): Int + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + +

      + + final + def + + + ==(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + +

      + + final + def + + + asInstanceOf[T0]: T0 + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    5. + + +

      + + + def + + + clone(): AnyRef + +

      + + Permalink + + +
      Attributes
      protected[java.lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    6. + + +

      + + final + def + + + eq(arg0: AnyRef): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    7. + + +

      + + + def + + + equals(arg0: Any): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    8. + + +

      + + + def + + + finalize(): Unit + +

      + + Permalink + + +
      Attributes
      protected[java.lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    9. + + +

      + + final + def + + + getClass(): Class[_] + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    10. + + +

      + + + def + + + hashCode(): Int + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    11. + + +

      + + final + def + + + isInstanceOf[T0]: Boolean + +

      + + Permalink + + +
      Definition Classes
      Any
      +
    12. + + +

      + + final + def + + + ne(arg0: AnyRef): Boolean + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    13. + + +

      + + final + def + + + notify(): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    14. + + +

      + + final + def + + + notifyAll(): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    15. + + +

      + + + val + + + string: String + +

      + + Permalink + + + +
    16. + + +

      + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      +
    17. + + +

      + + + def + + + toString(): String + +

      + + Permalink + + +
      Definition Classes
      AnyRef → Any
      +
    18. + + +

      + + final + def + + + wait(): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    19. + + +

      + + final + def + + + wait(arg0: Long, arg1: Int): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    20. + + +

      + + final + def + + + wait(arg0: Long): Unit + +

      + + Permalink + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    21. + + +

      + + + def + + + writeToHdfs(filePath: String): Unit + +

      + + Permalink + + +

      Saves the String in a file on hdfs.

      Saves the String in a file on hdfs.

      Overwrites the file if it already exists.

      "some\nrelatively small\ntext".writeToHdfsFile("/some/hdfs/file/path.txt")
      filePath

      the path of the file in which to write the String
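A possible companion sketch (paths are made up): write a timestamp token at the end of a run, which downstream jobs can then test for.

```scala
import com.spark_helper.HdfsHelper
import com.spark_helper.HdfsHelper._
import com.spark_helper.DateHelper

// Hypothetical success token:
DateHelper.currentTimestamp.writeToHdfs("/my/job/tokens/last_success.token")

// A downstream job could then simply check its presence:
val outputReady = HdfsHelper.fileExists("/my/job/tokens/last_success.token")
```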

      +
    +
    + + + + +
    + +
    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + + + + diff --git a/docs/com/spark_helper/HdfsHelper$.html b/docs/com/spark_helper/HdfsHelper$.html index ab99d93..ee3a0ca 100644 --- a/docs/com/spark_helper/HdfsHelper$.html +++ b/docs/com/spark_helper/HdfsHelper$.html @@ -54,38 +54,39 @@

    spark job and replace it with methods fully tested whose name is self-explanatory/readable.

    For instance, one don't want to remove a file from hdfs using 3 lines of code and thus could instead just use -HdfsHelper.deleteFile("my/hdfs/file/path.csv").

    A few exemples:

    import com.spark_helper.HdfsHelper
    +HdfsHelper.deleteFile("my/hdfs/file/path.csv").

    A few examples:

    import com.spark_helper.HdfsHelper
     
     // A bunch of methods wrapping the FileSystem API, such as:
    -HdfsHelper.fileExists("my/hdfs/file/path.txt")
    -assert(HdfsHelper.listFileNamesInFolder("my/folder/path") == List("file_name_1.txt", "file_name_2.csv"))
    -assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
    -assert(HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") == 3)
    -HdfsHelper.deleteFile("my/hdfs/file/path.csv")
    -HdfsHelper.moveFolder("my/hdfs/folder")
    +HdfsHelper.fileExists("my/hdfs/file/path.txt") // HdfsHelper.folderExists("my/hdfs/folder")
    +HdfsHelper.listFileNamesInFolder("my/folder/path") // List("file_name_1.txt", "file_name_2.csv")
    +HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") // "20170306"
    +HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") // 3
    +HdfsHelper.deleteFile("my/hdfs/file/path.csv") // HdfsHelper.deleteFolder("my/hdfs/folder")
    +HdfsHelper.moveFolder("old/path", "new/path") // HdfsHelper.moveFile("old/path.txt", "new/path.txt")
    +HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token") // HdfsHelper.createFolder("my/hdfs/folder")
    +
    +// File content helpers:
     HdfsHelper.compressFile("hdfs/path/to/uncompressed_file.txt", classOf[GzipCodec])
     HdfsHelper.appendHeader("my/hdfs/file/path.csv", "colum0,column1")
     
     // Some Xml/Typesafe helpers for hadoop as well:
    -HdfsHelper.isHdfsXmlCompliantWithXsd(
    -  "my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd"))
    +HdfsHelper.isHdfsXmlCompliantWithXsd("my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd"))
     HdfsHelper.loadXmlFileFromHdfs("my/hdfs/file/path.xml")
     
    -// Very handy to load a config (typesafe format) stored on hdfs at the
    -// begining of a spark job:
    +// Very handy to load a config (typesafe format) stored on hdfs at the beginning of a spark job:
     HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config
     
    -// In order to write small amount of data in a file on hdfs without the
    -// whole spark stack:
    -HdfsHelper.writeToHdfsFile(
    -  Array("some", "relatively small", "text"),
    -  "/some/hdfs/file/path.txt")
    +// In order to write small amount of data in a file on hdfs without the whole spark stack:
    +HdfsHelper.writeToHdfsFile(Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
    +// or:
    +import com.spark_helper.HdfsHelper._
    +Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
    +"hello world".writeToHdfs("/some/hdfs/file/path.txt")
     
    -// Deletes all files/folders in "hdfs/path/to/folder" for which the
    -// timestamp is older than 10 days:
    +// Deletes all files/folders in "hdfs/path/to/folder" for which the timestamp is older than 10 days:
     HdfsHelper.purgeFolder("hdfs/path/to/folder", 10)

    Source HdfsHelper -

    Since

    2017-02

    +

    Since

    2017-02

    To do

    Create a touch method

    Linear Supertypes
    Serializable, Serializable, AnyRef, Any
    @@ -124,7 +125,44 @@

    - +
    +

    Type Members

    +
    1. + + +

      + + implicit + class + + + SeqExtensions[T <: Seq[String]] extends AnyRef + +

      + + Permalink + + + +
    2. + + +

      + + implicit + class + + + StringExtensions extends AnyRef + +

      + + Permalink + + + +
    +
    @@ -200,7 +238,7 @@

    Appends a footer to a file.

    Appends a footer to a file.

    If the workingFolderPath parameter is provided, then the processing is done in a working/tmp folder and then only, the final file is moved to its final real location. This way, in case of cluster instability, i.e. in -case the Spark job is interupted, this avoids having a temporary or +case the Spark job is interrupted, this avoids having a temporary or corrupted file in output.

    filePath

    the path of the file for which to add the footer

    footer

    the footer to add

    workingFolderPath

    the path where file manipulations will happen

  • @@ -219,11 +257,11 @@

    Permalink -

    Appends a header to a file.

    Appends a header to a file.

    Usefull when creating a csv file with spark and you need to add a header +

    Appends a header to a file.

    Appends a header to a file.

    Useful when creating a csv file with spark and you need to add a header describing the different fields.

    If the workingFolderPath parameter is provided, then the processing is done in a working/tmp folder and then only, the final file is moved to its final real location. This way, in case of cluster instability, i.e. in -case the Spark job is interupted, this avoids having a temporary or +case the Spark job is interrupted, this avoids having a temporary or corrupted file in output.

    filePath

    the path of the file for which to add the header

    header

    the header to add

    workingFolderPath

    the path where file manipulations will happen
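A usage sketch for this helper (paths are made up). The working-folder variant is the one to prefer on unstable clusters, since the header is added in a temporary location and the file is only then moved to its final path:

```scala
import com.spark_helper.HdfsHelper

// Plain variant, as in the README:
HdfsHelper.appendHeader("my/hdfs/output/report.csv", "column0,column1")

// Variant with a working folder (assumed three-parameter signature, per the
// workingFolderPath parameter documented above):
HdfsHelper.appendHeader(
  "my/hdfs/output/report.csv",
  "column0,column1",
  "my/hdfs/working/folder")
```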

  • @@ -242,11 +280,11 @@

    Permalink -

    Appends a header and a footer to a file.

    Appends a header and a footer to a file.

    Usefull when creating an xml file with spark and you need to add top level +

    Appends a header and a footer to a file.

    Appends a header and a footer to a file.

    Useful when creating an xml file with spark and you need to add top level tags.

    If the workingFolderPath parameter is provided, then the processing is done in a working/tmp folder and then only, the final file is moved to its final real location. This way, in case of cluster instability, i.e. in -case the Spark job is interupted, this avoids having a temporary or +case the Spark job is interrupted, this avoids having a temporary or corrupted file in output.

    filePath

    the path of the file for which to add the header and the footer.

    header

    the header to add

    footer

    the footer to add

    workingFolderPath

    the path where file manipulations will happen

    @@ -329,8 +367,8 @@

    Permalink -

    Creates an empty file on hdfs.

    Creates an empty file on hdfs.

    Might be usefull for token files. For instance a file which is only used -as a timestamp token of the last update of a processus, or a file which +

    Creates an empty file on hdfs.

    Creates an empty file on hdfs.

    Might be useful for token files. For instance a file which is only used +as a timestamp token of the last update of a process, or a file which blocks the execution of an other instance of the same job, ...

    Overwrites the file if it already exists.

    HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token")

    In case this is used as a timestamp container, you can then use the following methods to retrieve its timestamp:

    val fileAge = HdfsHelper.nbrOfDaysSinceFileWasLastModified("/some/hdfs/file/path.token")
     val lastModificationDate = HdfsHelper.folderModificationDate("/some/hdfs/file/path.token")
    filePath

    the path of the empty file to create

    @@ -456,9 +494,9 @@

    Permalink -

    Returns the stringified date of the last modification of the given file.

    Returns the stringified date of the last modification of the given file.

    assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
    hdfsPath

    the path of the file for which to get the last +

    Returns the formatted date of the last modification of the given file.

    Returns the formatted date of the last modification of the given file.

    assert(HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") == "20170306")
    hdfsPath

    the path of the file for which to get the last modification date.

    format

    (default = "yyyyMMdd") the format under which to get the -modification date.

    returns

    the stringified date of the last modification of the given file, +modification date.

    returns

    the formatted date of the last modification of the given file, under the provided format.

  • @@ -536,9 +574,9 @@

    Permalink -

    Returns the stringified date of the last modification of the given folder.

    Returns the stringified date of the last modification of the given folder.

    assert(HdfsHelper.folderModificationDate("my/hdfs/folder") == "20170306")
    hdfsPath

    the path of the folder for which to get the last +

    Returns the formatted date of the last modification of the given folder.

    Returns the formatted date of the last modification of the given folder.

    assert(HdfsHelper.folderModificationDate("my/hdfs/folder") == "20170306")
    hdfsPath

    the path of the folder for which to get the last modification date.

    format

    (default = "yyyyMMdd") the format under which to get the -modification date.

    returns

    the stringified date of the last modification of the given folder, +modification date.

    returns

    the formatted date of the last modification of the given folder, under the provided format.

  • @@ -683,7 +721,7 @@

    Permalink -

    Loads a typesafe config from Hdfs.

    Loads a typesafe config from Hdfs.

    The best way to load the configuration of your job from hdfs.

    Typesafe is a config format which looks like this:

    config {
    +      

    Loads a Typesafe config from Hdfs.

    Loads a Typesafe config from Hdfs.

    The best way to load the configuration of your job from hdfs.

    Typesafe is a config format which looks like this:

    config {
       airlines = [
         {
           code = QF
    @@ -702,8 +740,8 @@ 

    } } ] -}

    hdfsConfigPath

    the absolute path of the typesafe config file on -hdfs we want to load as a typesafe Config object.

    returns

    the com.typesafe.config.Config object which contains usable data

    +}
    hdfsConfigPath

    the absolute path of the Typesafe config file on +hdfs we want to load as a Typesafe Config object.

    returns

    the com.typesafe.config.Config object which contains usable data
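To make the round trip concrete, a hypothetical read of the airlines block shown above could look like this (the hdfs path and the key are assumptions; getConfigList is the standard Typesafe Config accessor):

```scala
import com.spark_helper.HdfsHelper
import com.typesafe.config.Config

val config: Config =
  HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf")

// One Config object per entry of the "config.airlines" list:
val airlines = config.getConfigList("config.airlines")
```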

  • @@ -854,6 +892,50 @@

    // timestamp is older than 10 days: HdfsHelper.purgeFolder("hdfs/path/to/folder", 10)

  • folderPath

    the path of the folder on hdfs to purge

    purgeAge

    the threshold (in nbr of days) above which a file is considered too old and thus deleted/purged.

    +
  • + + +

    + + + def + + + setConf(configuration: Configuration): Unit + +

    + + Permalink + + +

    Sets a specific Configuration +used by the underlying FileSystem +in case it requires some specificities.

    Sets a specific Configuration +used by the underlying FileSystem +in case it requires some specificities.

    If this setter is not used, the default Configuration is set with +new Configuration(). +

    configuration

    the specific Configuration to use

    +
  • + + +

    + + + def + + + setFileSystem(fileSystem: FileSystem): Unit + +

    + + Permalink + + +

    Sets a specific FileSystem +in case it requires some specificities.

    Sets a specific FileSystem +in case it requires some specificities.

    If this setter is not used, the default FileSystem is set with +FileSystem.get(new Configuration()). +

    fileSystem

    the specific FileSystem to use
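As a sketch of how these two setters could be wired (not part of this patch; the namenode address is an assumption), a job talking to a non-default HDFS could do:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import com.spark_helper.HdfsHelper

val conf = new Configuration()
conf.set("fs.defaultFS", "hdfs://assumed-namenode:8020") // made-up address

// Either hand over the Configuration...
HdfsHelper.setConf(conf)
// ...or directly the FileSystem built from it:
HdfsHelper.setFileSystem(FileSystem.get(conf))
```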

  • @@ -999,7 +1081,7 @@

    enough.

    Overwrites the file if it already exists.

    HdfsHelper.writeToHdfsFile(
       Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
     HdfsHelper.writeToHdfsFile(
    -  List("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
    content

    the array of strings to write in the file as one line per + List("some", "relatively small", "text"), "/some/hdfs/file/path.txt")

    content

    the seq of strings to write in the file as one line per string (this takes care of joining strings with "\n"s).

    filePath

    the path of the file in which to write the content

diff --git a/docs/com/spark_helper/Monitor$.html b/docs/com/spark_helper/Monitor$.html
index 955f476..15e0a20 100644
--- a/docs/com/spark_helper/Monitor$.html
+++ b/docs/com/spark_helper/Monitor$.html
@@ -49,11 +49,11 @@

    -

    A logger dedicated to Spak jobs.

    It's a simple logger/report which contains a report that one can update from +

    A logger dedicated to Spark jobs.

    It's a simple logger/report which contains a report that one can update from the driver and a success state. The idea is to persist job executions logs -and errors (and forget about grepping unreadable yarn logs).

    It's designed for perdiodic spark jobs (handles storage and purge of logs) +and errors (and forget about grepping unreadable yarn logs).

    It's designed for periodic spark jobs (handles storage and purge of logs) and provides a way to handle kpis validation.

    Logs are stored on the go which means one can have a direct real time access -of the job logs/status and it's current state (which can overwise be a pain +of the job logs/status and it's current state (which can otherwise be a pain if it means going through yarn logs, or even for certain production environments going through additional layers of software logs to get to yarn logs).

    One of the issues this logger aims at tackling is the handling of exceptions @@ -62,9 +62,9 @@

    want to perform a few actions before letting the job crash. The idea is thus to surround (driver side) a Spark pipeline within a try catch and redirect the exception to the logger for a clean logging.

    This is a "driver-only" logger and is not intended at logging concurrent -actions from executors.

    Produced reports can easily be inserted in a notification email whenerver +actions from executors.

    Produced reports can easily be inserted in a notification email whenever the job fails, which saves a lot of time to maintainers operating on heavy -production environements.

    The produced persisted report is also a way for downstream jobs to know the +production environments.

    The produced persisted report is also a way for downstream jobs to know the status of their input data.

    Let's go through a simple Spark job example monitored with this Monitor facility:

    Monitor.setTitle("My job title")
     Monitor.addDescription(
    @@ -83,7 +83,7 @@ 

    Test("Nbr of output records", processedData.count(), SUPERIOR_THAN, 10e6d, NBR), Test("Some pct of invalid output", your_complex_kpi, INFERIOR_THAN, 3, PCT) ), - "My pipeline descirption" + "My pipeline description" ) if (outputIsValid) @@ -91,9 +91,9 @@

    } catch { case iie: InvalidInputException => - Monitor.error(iie, "My pipeline descirption", diagnostic = "No input data!") + Monitor.error(iie, "My pipeline description", diagnostic = "No input data!") case e: Throwable => - Monitor.error(e, "My pipeline descirption") // whatever unexpected error + Monitor.error(e, "My pipeline description") // whatever unexpected error } if (Monitor.isSuccess()) { @@ -105,7 +105,7 @@

    // HDFS (this saves the logs in the folder set with Monitor.setLogFolder): Monitor.store() -// At the end of the job, if the job isn't successfull, you might want to +// At the end of the job, if the job isn't successful, you might want to // crash it (for instance to get a notification from your scheduler): if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ...

    At any time during the job, logs can be accessed from file path/to/log/folder/current.ongoing

    If we were to read the stored report after this simple pipeline, here are @@ -113,8 +113,8 @@

    My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining -[10:23-10:23] My pipeline descirption: failed +[10:23] Beginning +[10:23-10:23] My pipeline description: failed Diagnostic: No input data! org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285) @@ -124,18 +124,18 @@

    My job description (whatever you want); for instance: Documentation: https://github.com/xavierguihot/spark_helper -[10:23] Begining -[10:23-10:36] My pipeline descirption: failed +[10:23] Beginning +[10:23-10:36] My pipeline description: failed java.lang.NumberFormatException: For input string: "a" java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) java.lang.Integer.parseInt(Integer.java:492) ... -[10:36] Duration: 00:13:47

    Another scenario, successfull spark pipeline and KPIs are valid; all good!:

              My job title
    +[10:36] Duration: 00:13:47

    Another scenario, successful spark pipeline and KPIs are valid; all good!:

              My job title
     
     My job description (whatever you want); for instance:
     Documentation: https://github.com/xavierguihot/spark_helper
    -[10:23] Begining
    -[10:23-10:41] My pipeline descirption: success
    +[10:23] Beginning
    +[10:23-10:41] My pipeline description: success
       KPI: Nbr of output records
         Value: 14669071.0
         Must be superior than 10000000.0
    @@ -147,7 +147,7 @@ 

    [10:41-10:42] My second pipeline description: success [10:42] Duration: 00:19:23

    Source Monitor -

    Since

    2017-02

    +

    Since

    2017-02

    To do

    would a State monad be appropriate?

    Linear Supertypes
    AnyRef, Any
    @@ -262,7 +262,7 @@

    Sets the report's contact list.

    Sets the report's contact list.

    This will appear within the first lines of the report:

    // Using:
     Monitor.setReportTitle("My Simple Job")
     Monitor.addContacts(List("x.guihot@gmail.com", "smbdy@gmail.com"))
    -// Produces this at the begining of the report:
    +// Produces this at the beginning of the report:
     "          My Simple Job"
     ""
     "Point of contact: x.guihot@gmail.com, smbdy@gmail.com"
    contacts

    the list of points of contact

    @@ -285,7 +285,7 @@

    Sets the report's description.

    Sets the report's description.

    This will appear within the first lines of the report:

    // Using:
     Monitor.setReportTitle("My Simple Job")
     Monitor.addDescription("Documentation: https://github.com/xavierguihot/spark_helper")
    -// Produces this at the begining of the report:
    +// Produces this at the beginning of the report:
     "          My Simple Job"
     ""
     "Documentation: https://github.com/xavierguihot/spark_helper"
    description

    the description of the Spark job (or whatever)

    @@ -372,7 +372,7 @@

    def - error(exception: Throwable, taskDescription: String, diagnostic: String = ""): Boolean + error(exception: Throwable, taskDescription: String, diagnostic: String = ""): Boolean

    @@ -385,8 +385,8 @@

    catch whatever exception from executors and thus log the exact error while still being able to keep on with the job or end it properly.

    Catching an error like this:

    monitor.error(
       invalidInputException,
    -  "My pipeline descirption",
    -  diagnostic = "No input data!")

    will result in this to be appended to the report:

    [10:23-10:24] My pipeline descirption: failed
    +  "My pipeline description",
    +  diagnostic = "No input data!")

    will result in this to be appended to the report:

    [10:23-10:24] My pipeline description: failed
       Diagnostic: No input data!
         org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path
         at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285)
    @@ -410,7 +410,7 @@ 

    Updates the report with some text and a failure.

    Updates the report with some text and a failure.

    This sets the status of the monitoring to false. After that the status -will never be success again, even if you update the report with success().

    Using this method like this:

    monitor.error("Some text")

    will result in this to be appended to the report:

    "[10:35-10:37] Some text: failure\n"

    Once the monitoring is a failure, then whatever following successfull +will never be success again, even if you update the report with success().

    Using this method like this:

    monitor.error("Some text")

    will result in this to be appended to the report:

    "[10:35-10:37] Some text: failure\n"

    Once the monitoring is a failure, then whatever following successful action won't change the failed status of the monitoring.

    taskDescription

    the text to append to the report

    returns

    false since it's a failure

  • @@ -488,18 +488,18 @@

    Definition Classes
    Any

  • - - + +

    def - isSuccess(): Boolean + isSuccess: Boolean

    - + Permalink @@ -696,7 +696,7 @@

    Sets the report's title.

    Sets the report's title.

    This will be the first line of the report:

    // Using:
     Monitor.setReportTitle("My Simple Job")
    -// Produces this at the begining of the report:
    +// Produces this at the beginning of the report:
     "          My Simple Job"
     ""
    title

    the title of the report

diff --git a/docs/com/spark_helper/SparkHelper$$OptionRDDExtensions.html b/docs/com/spark_helper/SparkHelper$$OptionRDDExtensions.html
new file mode 100644
index 0000000..66a0d24
--- /dev/null
+++ b/docs/com/spark_helper/SparkHelper$$OptionRDDExtensions.html
@@ -0,0 +1,540 @@
OptionRDDExtensions - com.spark_helper.SparkHelper.OptionRDDExtensions
Class com.spark_helper.SparkHelper.OptionRDDExtensions

implicit class OptionRDDExtensions[T] extends AnyRef

    Instance Constructors

    +
    1. + + +

      + + + new + + + OptionRDDExtensions(rdd: RDD[Option[T]])(implicit arg0: ClassTag[T]) + +

      + + Permalink + + + +
    +
    + + + + + +
    +

    Value Members

    +
def flatten: RDD[T]

Flattens an RDD of Option[T] to RDD[T].

sc.parallelize(Array(Some(1), None, Some(2))).flatten == sc.parallelize(Array(1, 2))

returns

the flat RDD, as RDD.flatMap(x => x) or List.flatten would have produced.
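A hedged sketch of a typical use, assuming the SparkHelper implicits are imported and sc is an existing SparkContext:

    import scala.util.Try
    import org.apache.spark.rdd.RDD
    import com.spark_helper.SparkHelper._

    // Parse lines into Option[Int] and keep only the successfully parsed values:
    val parsed: RDD[Option[Int]] = sc.parallelize(Array("1", "oops", "2")).map(s => Try(s.toInt).toOption)
    val values: RDD[Int] = parsed.flatten // RDD(1, 2)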

      +
    16. + + +

      + + + val + + + rdd: RDD[Option[T]] + +

      + + Permalink + + + +
diff --git a/docs/com/spark_helper/SparkHelper$$PairRDDExtensions.html b/docs/com/spark_helper/SparkHelper$$PairRDDExtensions.html
new file mode 100644
index 0000000..b136443
--- /dev/null
+++ b/docs/com/spark_helper/SparkHelper$$PairRDDExtensions.html
@@ -0,0 +1,376 @@
PairRDDExtensions - com.spark_helper.SparkHelper.PairRDDExtensions
Class com.spark_helper.SparkHelper.PairRDDExtensions

implicit final class PairRDDExtensions extends AnyVal

    Instance Constructors

    +
    1. + + +

      + + + new + + + PairRDDExtensions(rdd: RDD[(String, String)]) + +

      + + Permalink + + + +
    +
    + + + + + +
    +

    Value Members

    +
    7. + + +

      + + + val + + + rdd: RDD[(String, String)] + +

      + + Permalink + + + +
    8. + + +

      + + + def + + + saveAsTextFileByKey(path: String, keyNbr: Int, codec: Class[_ <: CompressionCodec]): Unit + +

      + + Permalink + + +

      Saves and repartitions a key/value RDD on files whose name is the key.

      Saves and repartitions a key/value RDD on files whose name is the key.

      Within the provided path, there will be one file per key in the given +keyValueRDD. And within a file for a given key are only stored values +for this key.

      This is not scalable. This shouldn't be considered for any data flow +with normal or big volumes.

      rdd.saveAsTextFileByKey("/my/output/folder/path", 12, classOf[BZip2Codec])
      path

      the folder where will be stored key files

      keyNbr

      the nbr of expected keys (which is the nbr of output +files)

      codec

      the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))

      +
    9. + + +

      + + + def + + + saveAsTextFileByKey(path: String, codec: Class[_ <: CompressionCodec]): Unit + +

      + + Permalink + + +

      Saves and repartitions a key/value RDD on files whose name is the key.

      Saves and repartitions a key/value RDD on files whose name is the key.

      Within the provided path, there will be one file per key in the given +keyValueRDD. And within a file for a given key are only stored values +for this key.

      As this internally needs to know the nbr of keys, this will have to +compute it. If this nbr of keys is known beforehand, it would spare +resources to use +saveAsTextFileByKey(path: String, keyNbr: Int, codec: Class[_ <: CompressionCodec]) +instead.

      This is not scalable. This shouldn't be considered for any data flow +with normal or big volumes.

      rdd.saveAsTextFileByKey("/my/output/folder/path", classOf[BZip2Codec])
      path

      the folder where will be stored key files

      codec

      the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))

      +
    10. + + +

      + + + def + + + saveAsTextFileByKey(path: String, keyNbr: Int): Unit + +

      + + Permalink + + +

      Saves and repartitions a key/value RDD on files whose name is the key.

      Saves and repartitions a key/value RDD on files whose name is the key.

      Within the provided path, there will be one file per key in the given +keyValueRDD. And within a file for a given key are only stored values +for this key.

      This is not scalable. This shouldn't be considered for any data flow +with normal or big volumes.

      rdd.saveAsTextFileByKey("/my/output/folder/path", 12)
      path

      the folder where will be stored key files

      keyNbr

      the nbr of expected keys (which is the nbr of output +files)

      +
    11. + + +

      + + + def + + + saveAsTextFileByKey(path: String): Unit + +

      + + Permalink + + +

      Saves and repartitions a key/value RDD on files whose name is the key.

      Saves and repartitions a key/value RDD on files whose name is the key.

      Within the provided path, there will be one file per key in the given +keyValueRDD. And within a file for a given key are only stored values +for this key.

      As this internally needs to know the nbr of keys, this will have to +compute it. If this nbr of keys is known beforehand, it would spare +resources to use saveAsTextFileByKey(path: String, keyNbr: Int) +instead.

      This is not scalable. This shouldn't be considered for any data flow +with normal or big volumes.

      rdd.saveAsTextFileByKey("/my/output/folder/path")
      path

      the folder where will be stored key files
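      A hedged sketch (paths and keys are illustrative), assuming the SparkHelper implicits are imported:

      import com.spark_helper.SparkHelper._

      // One output file per key; each file only contains that key's values:
      sc.parallelize(Array(("fr", "paris"), ("fr", "lyon"), ("us", "nyc")))
        .saveAsTextFileByKey("/my/output/folder/path")
      // roughly: .../fr holds "paris" and "lyon", .../us holds "nyc"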

      +
diff --git a/docs/com/spark_helper/SparkHelper$$RDDExtensions.html b/docs/com/spark_helper/SparkHelper$$RDDExtensions.html
new file mode 100644
index 0000000..cfdc746
--- /dev/null
+++ b/docs/com/spark_helper/SparkHelper$$RDDExtensions.html
@@ -0,0 +1,547 @@
RDDExtensions - com.spark_helper.SparkHelper.RDDExtensions
Class com.spark_helper.SparkHelper.RDDExtensions

implicit class RDDExtensions[T] extends AnyRef

    Instance Constructors

    +
    1. + + +

      + + + new + + + RDDExtensions(rdd: RDD[T])(implicit arg0: ClassTag[T]) + +

      + + Permalink + + + +
    +
    + + + + + +
    +

    Value Members

    +
def partialMap(pf: PartialFunction[T, T]): RDD[T]

Maps an RDD to the same type by applying a partial function where it is defined, and the identity otherwise.

Avoids having to write the case x => x fallback. Similar in spirit to .collect, but instead of skipping non-matching items, it keeps them as-is.

sc.parallelize(Array(1, 3, 2, 7, 8)).partialMap { case a if a % 2 == 0 => 2 * a }
// is equivalent to:
sc.parallelize(Array(1, 3, 2, 7, 8)).map {
  case a if a % 2 == 0 => 2 * a
  case a               => a
}
// in order to map to:
sc.parallelize(Array(1, 3, 4, 7, 16))

pf

the partial function to apply

returns

an RDD of the same type, where each element is either the application of the partial function where defined, or the identity.
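A complementary hedged sketch, assuming the SparkHelper implicits are imported:

    import com.spark_helper.SparkHelper._

    // Trim only the records that need it; everything else passes through unchanged:
    val cleaned = sc.parallelize(Array("a", " b ", "c")).partialMap {
      case s if s != s.trim => s.trim
    }
    // RDD("a", "b", "c")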

      +
    16. + + +

      + + + val + + + rdd: RDD[T] + +

      + + Permalink + + + +
diff --git a/docs/com/spark_helper/SparkHelper$$SeqRDDExtensions.html b/docs/com/spark_helper/SparkHelper$$SeqRDDExtensions.html
new file mode 100644
index 0000000..190d93d
--- /dev/null
+++ b/docs/com/spark_helper/SparkHelper$$SeqRDDExtensions.html
@@ -0,0 +1,540 @@
SeqRDDExtensions - com.spark_helper.SparkHelper.SeqRDDExtensions
Class com.spark_helper.SparkHelper.SeqRDDExtensions

implicit class SeqRDDExtensions[T] extends AnyRef

    Instance Constructors

    +
    1. + + +

      + + + new + + + SeqRDDExtensions(rdd: RDD[Seq[T]])(implicit arg0: ClassTag[T]) + +

      + + Permalink + + + +
    +
    + + + + + +
    +

    Value Members

    +
def flatten: RDD[T]

Flattens an RDD of Seq[T] to RDD[T].

sc.parallelize(Array(Seq(1, 2, 3), Nil, Seq(4))).flatten == sc.parallelize(Array(1, 2, 3, 4))

returns

the flat RDD, as RDD.flatMap(identity) or List.flatten would have produced.
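A hedged sketch, assuming the SparkHelper implicits are imported:

    import org.apache.spark.rdd.RDD
    import com.spark_helper.SparkHelper._

    // Tokenize lines, then flatten the RDD[Seq[String]] into an RDD of words:
    val words: RDD[String] =
      sc.parallelize(Array("hello world", "spark helper")).map(_.split(" ").toSeq).flatten
    // RDD("hello", "world", "spark", "helper")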

      +
    16. + + +

      + + + val + + + rdd: RDD[Seq[T]] + +

      + + Permalink + + + +
diff --git a/docs/com/spark_helper/SparkHelper$$SparkContextExtensions.html b/docs/com/spark_helper/SparkHelper$$SparkContextExtensions.html
new file mode 100644
index 0000000..cda2b45
--- /dev/null
+++ b/docs/com/spark_helper/SparkHelper$$SparkContextExtensions.html
@@ -0,0 +1,476 @@
SparkContextExtensions - com.spark_helper.SparkHelper.SparkContextExtensions
Class com.spark_helper.SparkHelper.SparkContextExtensions

implicit final class SparkContextExtensions extends AnyVal

    Instance Constructors

    +
    1. + + +

      + + + new + + + SparkContextExtensions(sc: SparkContext) + +

      + + Permalink + + + +
    +
    + + + + + +
    +

    Value Members

    +
    5. + + +

      + + + def + + + decreaseCoalescence(highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, finalCoalesceLevel: Int, codec: Class[_ <: CompressionCodec]): Unit + +

      + + Permalink + + +

      Decreases the nbr of partitions of a folder.

      Decreases the nbr of partitions of a folder.

      This comes in handy when the last step of your job needs to run on +thousands of files, but you want to store your final output on let's say +only 30 files.

      It's like a FileUtil.copyMerge() +, but the merging produces more than one file.

      Be aware that this method deletes the provided input folder.

      sc.decreaseCoalescence(
      +  "/folder/path/with/2000/files",
      +  "/produced/folder/path/with/only/30/files",
      +  30,
      +  classOf[BZip2Codec]
      +)
      highCoalescenceLevelFolder

      the folder which contains 10000 files

      lowerCoalescenceLevelFolder

      the folder which will contain the same +data as highCoalescenceLevelFolder but spread on only 30 files (where 30 +is the finalCoalesceLevel parameter).

      finalCoalesceLevel

      the nbr of files within the folder at the end +of this method.

      codec

      the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))

      +
    6. + + +

      + + + def + + + decreaseCoalescence(highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, finalCoalesceLevel: Int): Unit + +

      + + Permalink + + +

      Decreases the nbr of partitions of a folder.

      Decreases the nbr of partitions of a folder.

      This comes in handy when the last step of your job needs to run on +thousands of files, but you want to store your final output on let's say +only 30 files.

      It's like a FileUtil.copyMerge() +, but the merging produces more than one file.

      Be aware that this method deletes the provided input folder.

      sc.decreaseCoalescence(
      +  "/folder/path/with/2000/files",
      +  "/produced/folder/path/with/only/30/files",
      +  30
      +)
      highCoalescenceLevelFolder

      the folder which contains 10000 files

      lowerCoalescenceLevelFolder

      the folder which will contain the same +data as highCoalescenceLevelFolder but spread on only 30 files (where 30 +is the finalCoalesceLevel parameter).

      finalCoalesceLevel

      the nbr of files within the folder at the end +of this method.
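      A hedged sketch using the compressed variant documented above (paths are the doc's own; BZip2Codec is one of the codecs it mentions), assuming the SparkHelper implicits are imported:

      import org.apache.hadoop.io.compress.BZip2Codec
      import com.spark_helper.SparkHelper._

      // Merge a folder of many part files down to 30 files; the input folder is deleted afterwards:
      sc.decreaseCoalescence(
        "/folder/path/with/2000/files",
        "/produced/folder/path/with/only/30/files",
        30,
        classOf[BZip2Codec]
      )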

      +
    9. + + +

      + + + val + + + sc: SparkContext + +

      + + Permalink + + + +
    10. + + +

      + + + def + + + textFile(paths: Seq[String], minPartitions: Int): RDD[String] + +

      + + Permalink + + +

      A replacement for sc.textFile() when files contain commas in their name.

      Since sc.textFile() accepts several files at once only as a single comma-joined string, it cannot be given files whose names themselves contain commas.

      This method bypasses that limitation by taking the paths as a sequence of strings.

      sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))
      paths

      the paths of the file(s)/folder(s) to read

      minPartitions

      the nbr of partitions in which to split the input

      +
    11. + + +

      + + + def + + + textFile(paths: Seq[String]): RDD[String] + +

      + + Permalink + + +

      A replacement for sc.textFile() when files contain commas in their name.

      Since sc.textFile() accepts several files at once only as a single comma-joined string, it cannot be given files whose names themselves contain commas.

      This method bypasses that limitation by taking the paths as a sequence of strings.

      sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))
      paths

      the paths of the file(s)/folder(s) to read
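      A hedged sketch, assuming the SparkHelper implicits are imported (the file names are the doc's own):

      import com.spark_helper.SparkHelper._

      // Reads both files, even though one has a comma in its name:
      val lines = sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))
      // and with an explicit minimum number of partitions:
      val lines8 = sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"), 8)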

      +
    12. + + +

      + + + def + + + textFile(path: String, delimiter: String, maxRecordLength: String = "1000000"): RDD[String] + +

      + + Permalink + + +

      Equivalent to sparkContext.textFile() +, but for a specific record delimiter.

      Equivalent to sparkContext.textFile() +, but for a specific record delimiter.

      By default, sparkContext.textFile() +will provide one record per line (per '\n'). +But what if the format to read considers that one record is stored in +more than one line (yml, custom format, ...)?

      For instance, in order to read a yml file, a format for which a record (a single entity) is spread over several lines, you can set the record delimiter to "---\n" instead of "\n". The same goes when reading an xml file, where a record might be spread over several lines, or worse, the whole xml file is one line.

      // Let's say data we want to use with Spark looks like this (one record
      +// is a customer, but it's spread over several lines):
      +<Customers>\n
      +<Customer>\n
      +<Address>34 thingy street, someplace, sometown</Address>\n
      +</Customer>\n
      +<Customer>\n
      +<Address>12 thingy street, someplace, sometown</Address>\n
      +</Customer>\n
      +</Customers>
      +//Then you can use it this way:
      +val computedRecords = sc.textFile("my/path/to/customers.xml", "<Customer>\n")
      +val expectedRecords = RDD(
      +  <Customers>\n,
      +  (
      +    <Address>34 thingy street, someplace, sometown</Address>\n +
      +    </Customer>\n
      +  ),
      +  (
      +    <Address>12 thingy street, someplace, sometown</Address>\n +
      +    </Customer>\n +
      +    </Customers>
      +  )
      +)
      +assert(computedRecords == expectedRecords)
      path

      the path of the file to read (folder or file, '*' works +as well).

      delimiter

      the specific record delimiter which replaces "\n"

      maxRecordLength

      the max length (not sure which unit) of a record +before considering the record too long to fit into memory.

      returns

      the RDD of records
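      A hedged sketch for the yml case mentioned above (the path is illustrative), assuming the SparkHelper implicits are imported:

      import com.spark_helper.SparkHelper._

      // One record per yml document (split on "---\n") instead of one record per line:
      val ymlRecords = sc.textFile("/my/input/folder/path", "---\n")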

      +
    13. + + +

      + + + def + + + textFileWithFileName(path: String): RDD[(String, String)] + +

      + + Permalink + + +

      Equivalent to sparkContext.textFile() +, but each record is associated with the file path it comes from.

      Equivalent to sparkContext.textFile() +, but each record is associated with the file path it comes from.

      Produces an RDD[(file_name, line)] +which provides a way to know from which file a given line comes from.

      // Considering this folder:
      +// folder/file_1.txt whose content is data1\ndata2\ndata3
      +// folder/file_2.txt whose content is data4\ndata4
      +// folder/folder_1/file_3.txt whose content is data6\ndata7
      +// then:
      +sc.textFileWithFileName("folder")
      +// will return:
      +RDD(
      +  ("file:/path/on/machine/folder/file_1.txt", "data1"),
      +  ("file:/path/on/machine/folder/file_1.txt", "data2"),
      +  ("file:/path/on/machine/folder/file_1.txt", "data3"),
      +  ("file:/path/on/machine/folder/file_2.txt", "data4"),
      +  ("file:/path/on/machine/folder/file_2.txt", "data5"),
      +  ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"),
      +  ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7")
      +)
      path

      the path of the folder (or structure of folders) to read

      returns

      the RDD of records where a record is a tuple containing the path +of the file the record comes from and the record itself.
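      A hedged sketch of a typical follow-up, assuming the SparkHelper implicits are imported:

      import com.spark_helper.SparkHelper._

      // Count how many records each input file contributed:
      val recordsPerFile = sc
        .textFileWithFileName("folder")
        .map { case (filePath, _) => (filePath, 1L) }
        .reduceByKey(_ + _)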

      +
diff --git a/docs/com/spark_helper/SparkHelper$$StringRDDExtensions.html b/docs/com/spark_helper/SparkHelper$$StringRDDExtensions.html
new file mode 100644
index 0000000..bf483ea
--- /dev/null
+++ b/docs/com/spark_helper/SparkHelper$$StringRDDExtensions.html
@@ -0,0 +1,422 @@
StringRDDExtensions - com.spark_helper.SparkHelper.StringRDDExtensions
Class com.spark_helper.SparkHelper.StringRDDExtensions

implicit final class StringRDDExtensions extends AnyVal

    Instance Constructors

    +
    1. + + +

      + + + new + + + StringRDDExtensions(rdd: RDD[String]) + +

      + + Permalink + + + +
    +
    + + + + + +
    +

    Value Members

    +
    7. + + +

      + + + val + + + rdd: RDD[String] + +

      + + Permalink + + + +
    8. + + +

      + + + def + + + saveAsSingleTextFile(path: String, workingFolder: String, codec: Class[_ <: CompressionCodec]): Unit + +

      + + Permalink + + +

      Saves an RDD in exactly one file.

      Saves an RDD in exactly one file.

      Allows one to save an RDD in one file, while keeping the processing +distributed.

      This variant of saveAsSingleTextFile +performs the storage in a temporary folder instead of directly in the +final output folder. This way the risks of having corrupted files in the +real output folder due to cluster interruptions is minimized.

      rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path", classOf[BZip2Codec])
      path

      the path of the produced file

      workingFolder

      the path where file manipulations will temporarily +happen.

      codec

      the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))

      +
    9. + + +

      + + + def + + + saveAsSingleTextFile(path: String, workingFolder: String): Unit + +

      + + Permalink + + +

      Saves an RDD in exactly one file.

      Saves an RDD in exactly one file.

      Allows one to save an RDD in one file, while keeping the processing +distributed.

      This variant of saveAsSingleTextFile +performs the storage in a temporary folder instead of directly in the +final output folder. This way the risks of having corrupted files in the +real output folder due to cluster interruptions is minimized.

      rdd.saveAsSingleTextFile("/my/file/path.txt", "/my/working/folder/path")
      path

      the path of the produced file

      workingFolder

      the path where file manipulations will temporarily +happen.

      +
    10. + + +

      + + + def + + + saveAsSingleTextFile(path: String, codec: Class[_ <: CompressionCodec]): Unit + +

      + + Permalink + + +

      Saves an RDD in exactly one file.

      Saves an RDD in exactly one file.

      Allows one to save an RDD in one file, while keeping the processing +distributed.

      rdd.saveAsSingleTextFile("/my/file/path.txt", classOf[BZip2Codec])
      path

      the path of the produced file

      codec

      the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))

      +
    11. + + +

      + + + def + + + saveAsSingleTextFile(path: String): Unit + +

      + + Permalink + + +

      Saves an RDD in exactly one file.

      Saves an RDD in exactly one file.

      Allows one to save an RDD in one file, while keeping the processing +distributed.

      rdd.saveAsSingleTextFile("/my/file/path.txt")
      path

      the path of the produced file
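      A consolidated hedged sketch of the variants above (paths are illustrative; GzipCodec is one of the codecs the doc mentions), assuming the SparkHelper implicits are imported:

      import org.apache.hadoop.io.compress.GzipCodec
      import com.spark_helper.SparkHelper._

      // Write the whole RDD as one gzipped file, staging the work in a temporary working folder:
      sc.parallelize(Array("data_a", "data_b", "data_c"))
        .saveAsSingleTextFile("/my/file/path.txt.gz", "/my/working/folder/path", classOf[GzipCodec])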

      +
    12. + + +

      + + + def + + + saveAsTextFileAndCoalesce(path: String, finalCoalesceLevel: Int, codec: Class[_ <: CompressionCodec]): Unit + +

      + + Permalink + + +

      Saves as text file, and decreases the nbr of output partitions.

      Saves as text file, and decreases the nbr of output partitions.

      Same as rdd.saveAsTextFile() +, but decreases the nbr of partitions in the output folder before doing +so.

      The result is equivalent to rdd.coalesce(x).saveAsTextFile(), but if x is very low, coalesce would make the processing time explode, whereas this method keeps the processing distributed, saves as a text file and then only merges the result into a lower nbr of partitions.

      rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30, classOf[BZip2Codec])
      path

      the folder where will finally be stored the RDD but spread +on only 30 files (where 30 is the value of the finalCoalesceLevel +parameter).

      finalCoalesceLevel

      the nbr of files within the folder at the end +of this method.

      codec

      the type of compression to use (for instance +classOf[BZip2Codec] or classOf[GzipCodec]))

      +
    13. + + +

      + + + def + + + saveAsTextFileAndCoalesce(path: String, finalCoalesceLevel: Int): Unit + +

      + + Permalink + + +

      Saves as text file, but by decreasing the nbr of partitions of the output.

      Saves as text file, but by decreasing the nbr of partitions of the output.

      Same as rdd.saveAsTextFile() +, but decreases the nbr of partitions in the output folder before doing +so.

      The result is equivalent to rdd.coalesce(x).saveAsTextFile(), but if x is very low, coalesce would make the processing time explode, whereas this method keeps the processing distributed, saves as a text file and then only merges the result into a lower nbr of partitions.

      rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30)
      path

      the folder where will finally be stored the RDD but spread +on only 30 files (where 30 is the value of the finalCoalesceLevel +parameter).

      finalCoalesceLevel

      the nbr of files within the folder at the end +of this method.
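      A brief hedged sketch of the trade-off this avoids (rdd is assumed to be an RDD[String] computed with many partitions), assuming the SparkHelper implicits are imported:

      import com.spark_helper.SparkHelper._

      // A plain rdd.coalesce(30).saveAsTextFile(...) would force the upstream computation onto 30 tasks;
      // this keeps the computation fully distributed and only merges the output down to 30 files:
      rdd.saveAsTextFileAndCoalesce("/produced/folder/path/with/only/30/files", 30)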

      +
diff --git a/docs/com/spark_helper/SparkHelper$.html b/docs/com/spark_helper/SparkHelper$.html
index f7024b1..25d8011 100644
--- a/docs/com/spark_helper/SparkHelper$.html
+++ b/docs/com/spark_helper/SparkHelper$.html
@@ -51,19 +51,39 @@

    A facility to deal with RDD/file manipulations based on the Spark API.

    The goal is to remove the maximum of highly used low-level code from your spark job and replace it with methods fully tested whose name is -self-explanatory/readable.

    A few examples:

    // Same as SparkContext.saveAsTextFile, but the result is a single file:
    -SparkHelper.saveAsSingleTextFile(myOutputRDD, "/my/output/file/path.txt")
    -// Same as SparkContext.textFile, but instead of reading one record per
    -// line, it reads records spread over several lines.
    -// This way, xml, json, yml or any multi-line record file format can be used
    -// with Spark:
    -SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n")
    -// Same as SparkContext.textFile, but instead of returning an RDD of
    -// records, it returns an RDD of tuples containing both the record and the
    -// path of the file it comes from:
    -SparkHelper.textFileWithFileName("folder", sparkContext)

    Source

+import com.spark_helper.SparkHelper._
+
+// Same as rdd.saveAsTextFile("path"), but the result is a single file (while
+// keeping the processing distributed):
+rdd.saveAsSingleTextFile("/my/output/file/path.txt")
+rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec])
+
+// Same as sc.textFile("path"), but instead of reading one record per line (by
+// splitting the input with \n), it splits the file in records based on a custom
+// delimiter. This way, xml, json, yml or any multi-line record file format can
+// be used with Spark:
+sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance
+
+// Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options:
+rdd.flatten
+
+// Equivalent to sc.textFile(), but each line is tupled with its file path:
+sc.textFileWithFileName("/my/input/folder/path")
+// which produces:
+// RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"),
+//     ("folder/file_2.txt", "record1fromfile2"), ...)
+
+// In the given folder, this generates one file per key in the given key/value
+// RDD. Within each file (named from the key) are all values for this key:
+rdd.saveAsTextFileByKey("/my/output/folder/path")
+
+// Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)):
+rdd.partialMap { case a if a % 2 == 0 => 2 * a }
+
+// For when input files contain commas and textFile can't handle it:
+sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))

    Source SparkHelper -

    Since

    2017-02

    +

    Since

    2017-02

    To do

    sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T])

    Linear Supertypes
    Serializable, Serializable, AnyRef, Any
    @@ -102,7 +122,112 @@

    - +
    +

    Type Members

    +
    1. + + +

      + + implicit + class + + + OptionRDDExtensions[T] extends AnyRef + +

      + + Permalink + + + +
    2. + + +

      + + implicit final + class + + + PairRDDExtensions extends AnyVal + +

      + + Permalink + + + +
    3. + + +

      + + implicit + class + + + RDDExtensions[T] extends AnyRef + +

      + + Permalink + + + +
    4. + + +

      + + implicit + class + + + SeqRDDExtensions[T] extends AnyRef + +

      + + Permalink + + + +
    5. + + +

      + + implicit final + class + + + SparkContextExtensions extends AnyVal + +

      + + Permalink + + + +
    6. + + +

      + + implicit final + class + + + StringRDDExtensions extends AnyVal + +

      + + Permalink + + + +
    +
    @@ -199,62 +324,6 @@

    )

    -

  • - - -

    - - - def - - - decreaseCoalescence(highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, finalCoalescenceLevel: Int, sparkContext: SparkContext, compressionCodec: Class[_ <: CompressionCodec]): Unit - -

    - - Permalink - - -

    Decreases the nbr of partitions of a folder.

    Decreases the nbr of partitions of a folder.

    This is often handy when the last step of your job needs to run on -thousands of files, but you want to store your final output on let's say -only 300 files.

    It's like a FileUtil.copyMerge, but the merging produces more than one -file.

    Be aware that this methods deletes the provided input folder.

    SparkHelper.decreaseCoalescence(
    -  "/folder/path/with/2000/files",
    -  "/produced/folder/path/with/only/300/files",
    -  300,
    -  sparkContext,
    -  classOf[BZip2Codec])
    highCoalescenceLevelFolder

    the folder which contains 10000 files

    lowerCoalescenceLevelFolder

    the folder which will contain the same -data as highCoalescenceLevelFolder but spread on only 300 files (where 300 -is the finalCoalescenceLevel parameter).

    finalCoalescenceLevel

    the nbr of files within the folder at the end -of this method.

    sparkContext

    the SparkContext

    compressionCodec

    the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))

    -
  • - - -

    - - - def - - - decreaseCoalescence(highCoalescenceLevelFolder: String, lowerCoalescenceLevelFolder: String, finalCoalescenceLevel: Int, sparkContext: SparkContext): Unit - -

    - - Permalink - - -

    Decreases the nbr of partitions of a folder.

    Decreases the nbr of partitions of a folder.

    This is often handy when the last step of your job needs to run on -thousands of files, but you want to store your final output on let's say -only 300 files.

    It's like a FileUtil.copyMerge, but the merging produces more than one -file.

    Be aware that this methods deletes the provided input folder.

    SparkHelper.decreaseCoalescence(
    -  "/folder/path/with/2000/files",
    -  "/produced/folder/path/with/only/300/files",
    -  300,
    -  sparkContext)
    highCoalescenceLevelFolder

    the folder which contains 10000 files

    lowerCoalescenceLevelFolder

    the folder which will contain the same -data as highCoalescenceLevelFolder but spread on only 300 files (where 300 -is the finalCoalescenceLevel parameter).

    finalCoalescenceLevel

    the nbr of files within the folder at the end -of this method.

    sparkContext

    the SparkContext

  • @@ -414,193 +483,6 @@

    Definition Classes
    AnyRef
    -

  • - - -

    - - - def - - - saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String, workingFolder: String, compressionCodec: Class[_ <: CompressionCodec]): Unit - -

    - - Permalink - - -

    Saves an RDD in exactly one file.

    Saves an RDD in exactly one file.

    Allows one to save an RDD in one file, while keeping the processing -parallelized.

    This variant of saveAsSingleTextFile performs the storage in a temporary -folder instead of directly in the final output folder. This way the risks -of having corrupted files in the real output folder due to cluster -interruptions is minimized.

    SparkHelper.saveAsSingleTextFile(
    -  myRddToStore,
    -  "/my/file/path.txt",
    -  "/my/working/folder/path",
    -  classOf[BZip2Codec])
    outputRDD

    the RDD of strings to store in one file

    outputFile

    the path of the produced file

    workingFolder

    the path where file manipulations will temporarily -happen.

    compressionCodec

    the type of compression to use (for instance -classOf[BZip2Codec] or classOf[GzipCodec]))

    -
  • def saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String, workingFolder: String): Unit

    Saves an RDD in exactly one file.

    Allows one to save an RDD in one file, while keeping the processing parallelized.

    This variant of saveAsSingleTextFile performs the storage in a temporary folder instead of directly in the final output folder. This way the risk of having corrupted files in the real output folder due to cluster interruptions is minimized.

    SparkHelper.saveAsSingleTextFile(
      myRddToStore, "/my/file/path.txt", "/my/working/folder/path")

    outputRDD: the RDD of strings to store in one file

    outputFile: the path of the produced file

    workingFolder: the path where file manipulations will temporarily happen
  • def saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String, compressionCodec: Class[_ <: CompressionCodec]): Unit

    Saves an RDD in exactly one file.

    Allows one to save an RDD in one file, while keeping the processing parallelized.

    SparkHelper.saveAsSingleTextFile(
      myRddToStore, "/my/file/path.txt", classOf[BZip2Codec])

    outputRDD: the RDD of strings to store in one file

    outputFile: the path of the produced file

    compressionCodec: the type of compression to use (for instance classOf[BZip2Codec] or classOf[GzipCodec])
  • def saveAsSingleTextFile(outputRDD: RDD[String], outputFile: String): Unit

    Saves an RDD in exactly one file.

    Allows one to save an RDD in one file, while keeping the processing parallelized.

    SparkHelper.saveAsSingleTextFile(myRddToStore, "/my/file/path.txt")

    outputRDD: the RDD of strings to store in one file

    outputFile: the path of the produced file
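    For illustration, here is a minimal sketch of the classic way to get a single text file with plain Spark and Hadoop APIs: write in parallel to a temporary folder, then merge the part files. This is a simplified assumption of what such a helper does, not the library's actual code, and it relies on FileUtil.copyMerge, which is available in Hadoop 2.x.

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
    import org.apache.spark.rdd.RDD

    // Writes the RDD in parallel to a temporary folder, then merges the part
    // files into the single target file (and deletes the temporary folder):
    def saveAsSingleTextFileSketch(rdd: RDD[String], outputFile: String): Unit = {
      val conf = new Configuration()
      val tmpFolder = outputFile + ".tmp" // hypothetical temporary location
      rdd.saveAsTextFile(tmpFolder)
      val fs = FileSystem.get(conf)
      FileUtil.copyMerge(fs, new Path(tmpFolder), fs, new Path(outputFile), true, conf, null)
    }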

  • def saveAsTextFileAndCoalesce(outputRDD: RDD[String], outputFolder: String, finalCoalescenceLevel: Int, compressionCodec: Class[_ <: CompressionCodec]): Unit

    Saves as text file, but by decreasing the nbr of partitions of the output.

    Same as decreaseCoalescence, but the storage of the RDD in an intermediate folder is included.

    This still makes the processing parallelized, but the output is coalesced.

    SparkHelper.saveAsTextFileAndCoalesce(
      myRddToStore,
      "/produced/folder/path/with/only/300/files",
      300,
      classOf[BZip2Codec])

    outputRDD: the RDD to store, processed for instance on 10000 tasks (which would thus be stored as 10000 files)

    outputFolder: the folder where the RDD will finally be stored, but spread over only 300 files (where 300 is the value of the finalCoalescenceLevel parameter)

    finalCoalescenceLevel: the nbr of files within the folder at the end of this method

    compressionCodec: the type of compression to use (for instance classOf[BZip2Codec] or classOf[GzipCodec])

  • def saveAsTextFileAndCoalesce(outputRDD: RDD[String], outputFolder: String, finalCoalescenceLevel: Int): Unit

    Saves as text file, but by decreasing the nbr of partitions of the output.

    Same as decreaseCoalescence, but the storage of the RDD in an intermediate folder is included.

    This still makes the processing parallelized, but the output is coalesced.

    SparkHelper.saveAsTextFileAndCoalesce(
      myRddToStore, "/produced/folder/path/with/only/300/files", 300)

    outputRDD: the RDD to store, processed for instance on 10000 tasks (which would thus be stored as 10000 files)

    outputFolder: the folder where the RDD will finally be stored, but spread over only 300 files (where 300 is the value of the finalCoalescenceLevel parameter)

    finalCoalescenceLevel: the nbr of files within the folder at the end of this method
  • def saveAsTextFileByKey(keyValueRDD: RDD[(String, String)], outputFolder: String, keyNbr: Int, compressionCodec: Class[_ <: CompressionCodec]): Unit

    Saves and repartitions a key/value RDD on files whose name is the key.

    Within the provided outputFolder, there will be one file per key in your keyValueRDD. And within a file for a given key are only values for this key.

    You need to know the nbr of keys beforehand (in general you use this to split your dataset in subsets, or to output one file per client, so you know how many keys you have). So you need to put as keyNbr the exact nbr of keys you'll have.

    This is not scalable. This shouldn't be considered for any data flow with normal or big volumes.

    SparkHelper.saveAsTextFileByKey(
      myKeyValueRddToStore, "/my/output/folder/path", 12, classOf[BZip2Codec])

    keyValueRDD: the key/value RDD

    outputFolder: the folder where the key files will be stored

    keyNbr: the nbr of expected keys (which is the nbr of output files)

    compressionCodec: the type of compression to use (for instance classOf[BZip2Codec] or classOf[GzipCodec])
  • def saveAsTextFileByKey(keyValueRDD: RDD[(String, String)], outputFolder: String, keyNbr: Int): Unit

    Saves and repartitions a key/value RDD on files whose name is the key.

    Within the provided outputFolder, there will be one file per key in your keyValueRDD. And within a file for a given key are only values for this key.

    You need to know the nbr of keys beforehand (in general you use this to split your dataset in subsets, or to output one file per client, so you know how many keys you have). So you need to put as keyNbr the exact nbr of keys you'll have.

    This is not scalable. This shouldn't be considered for any data flow with normal or big volumes.

    SparkHelper.saveAsTextFileByKey(
      myKeyValueRddToStore, "/my/output/folder/path", 12)

    keyValueRDD: the key/value RDD

    outputFolder: the folder where the key files will be stored

    keyNbr: the nbr of expected keys (which is the nbr of output files)
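    For illustration, here is a minimal sketch of the usual Hadoop-based technique for writing one file per key (partition by key, then use MultipleTextOutputFormat). It is an assumption about the general approach, not the library's actual implementation; names like KeyBasedOutput are made up.

    import org.apache.hadoop.io.NullWritable
    import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
    import org.apache.spark.HashPartitioner
    import org.apache.spark.rdd.RDD

    // Names each output file after its key:
    class KeyBasedOutput extends MultipleTextOutputFormat[Any, Any] {
      override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
        key.toString
      // Don't repeat the key inside the file; keep only the values:
      override def generateActualKey(key: Any, value: Any): Any =
        NullWritable.get()
    }

    def saveByKeySketch(rdd: RDD[(String, String)], outputFolder: String, keyNbr: Int): Unit =
      rdd
        .partitionBy(new HashPartitioner(keyNbr)) // groups each key's values together
        .saveAsHadoopFile(outputFolder, classOf[String], classOf[String], classOf[KeyBasedOutput])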


  • def textFileWithDelimiter(hdfsPath: String, sparkContext: SparkContext, delimiter: String, maxRecordLength: String = "1000000"): RDD[String]

    Equivalent to sparkContext.textFile(), but for a specific record delimiter.

    By default, sparkContext.textFile() will provide one record per line. But what if the format you want to read considers that one record (one entity) is stored in more than one line (yml, xml, ...)?

    For instance in order to read a yml file, which is a format for which a record (a single entity) is spread over several lines, you can modify the record delimiter with "---\n" instead of "\n". Same goes when reading an xml file where a record might be spread over several lines or, worse, the whole xml file is one line.

    // Let's say data we want to use with Spark looks like this (one record is
    // a customer, but it's spread over several lines):
    <Customers>\n
    <Customer>\n
    <Address>34 thingy street, someplace, sometown</Address>\n
    </Customer>\n
    <Customer>\n
    <Address>12 thingy street, someplace, sometown</Address>\n
    </Customer>\n
    </Customers>

    // Then you can use it this way:
    val computedRecords = SparkHelper.textFileWithDelimiter(
      "my/path/to/customers.xml", sparkContext, "<Customer>\n"
    ).collect()
    val expectedRecords = Array(
      "<Customers>\n",
      (
        "<Address>34 thingy street, someplace, sometown</Address>\n" +
        "</Customer>\n"
      ),
      (
        "<Address>12 thingy street, someplace, sometown</Address>\n" +
        "</Customer>\n" +
        "</Customers>"
      )
    )
    assert(computedRecords.sameElements(expectedRecords))

    hdfsPath: the path of the file to read (folder or file, '*' works as well)

    sparkContext: the SparkContext

    delimiter: the specific record delimiter which replaces "\n"

    maxRecordLength: the max length (not sure which unit) of a record before considering the record too long to fit into memory

    returns: the RDD of records
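    For illustration, a minimal sketch of the standard Hadoop-based way to split records on a custom delimiter (a common technique; the helper above may or may not be implemented exactly this way):

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.io.{LongWritable, Text}
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.RDD

    def textFileWithDelimiterSketch(path: String, sc: SparkContext, delimiter: String): RDD[String] = {
      val conf = new Configuration(sc.hadoopConfiguration)
      // TextInputFormat splits records on this delimiter instead of "\n":
      conf.set("textinputformat.record.delimiter", delimiter)
      sc.newAPIHadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], conf)
        .map { case (_, text) => text.toString }
    }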

  • def textFileWithFileName(hdfsPath: String, sparkContext: SparkContext): RDD[(String, String)]

    Equivalent to sparkContext.textFile(), but each line is associated with its file path.

    Produces an RDD[(file_name, line)] which provides a way to know which file a given line comes from.

    // Considering this folder:
    // folder/file_1.txt whose content is data1\ndata2\ndata3
    // folder/file_2.txt whose content is data4\ndata5
    // folder/folder_1/file_3.txt whose content is data6\ndata7
    // then:
    SparkHelper.textFileWithFileName("folder", sparkContext)
    // will return:
    RDD(
      ("file:/path/on/machine/folder/file_1.txt", "data1"),
      ("file:/path/on/machine/folder/file_1.txt", "data2"),
      ("file:/path/on/machine/folder/file_1.txt", "data3"),
      ("file:/path/on/machine/folder/file_2.txt", "data4"),
      ("file:/path/on/machine/folder/file_2.txt", "data5"),
      ("file:/path/on/machine/folder/folder_1/file_3.txt", "data6"),
      ("file:/path/on/machine/folder/folder_1/file_3.txt", "data7")
    )

    hdfsPath: the path of the folder (or structure of folders) to read

    sparkContext: the SparkContext

    returns: the RDD of records where a record is a tuple containing the path of the file the record comes from and the record itself
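    For illustration, a minimal sketch producing the same shape of output with plain Spark. wholeTextFiles loads each file entirely, so this assumes reasonably small files, and it is not necessarily how the helper is implemented:

    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.RDD

    def textFileWithFileNameSketch(folder: String, sc: SparkContext): RDD[(String, String)] =
      sc.wholeTextFiles(folder + "/*") // (filePath, fileContent) pairs
        .flatMap { case (path, content) => content.split("\n").map(line => (path, line)) }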

  • diff --git a/docs/com/spark_helper/monitoring/Test.html b/docs/com/spark_helper/monitoring/Test.html index f1cd8b7..4246185 100644 --- a/docs/com/spark_helper/monitoring/Test.html +++ b/docs/com/spark_helper/monitoring/Test.html @@ -50,7 +50,7 @@

    A class which represents a KPI to validate.

    This is intended to be used as a parameter of the Monitor.updateByKpiValidation and Monitor.updateByKpisValidation methods.

    Some examples of Test objects:

    Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
    Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT)
    Test("someNbr", 1235d, EQUAL_TO, 1235d, NBR)
    description

    the name/description of the KPI which will appear on the validation report.

    kpiValue

    the value for this KPI

    thresholdType

    the type of threshold (SUPERIOR_THAN, INFERIOR_THAN or EQUAL_TO)

    Creates a Test object.

    Some examples of Test objects:

    Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
    Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT)
    Test("someNbr", 1235d, EQUAL_TO, 1235d, NBR)
    description

    the name/description of the KPI which will appear on the validation report.

    kpiValue

    the value for this KPI

    thresholdType

    the type of threshold (SUPERIOR_THAN, INFERIOR_THAN or EQUAL_TO)

  • diff --git a/docs/com/spark_helper/monitoring/ThresholdType.html b/docs/com/spark_helper/monitoring/ThresholdType.html

    An enumeration which represents the type of threshold to use (EQUAL_TO, SUPERIOR_THAN or INFERIOR_THAN)

    diff --git a/docs/com/spark_helper/monitoring/package.html b/docs/com/spark_helper/monitoring/package.html index f89cf5e..c7c36f9 100644 --- a/docs/com/spark_helper/monitoring/package.html +++ b/docs/com/spark_helper/monitoring/package.html @@ -100,7 +100,7 @@

    A class which represents a KPI to validate.

    This is intended to be used as a parameter of the Monitor.updateByKpiValidation and Monitor.updateByKpisValidation methods.

    Some examples of Test objects:

    Test("pctOfWhatever", 0.06d, INFERIOR_THAN, 0.1d, PCT)
    Test("pctOfSomethingElse", 0.27d, SUPERIOR_THAN, 0.3d, PCT)
    Test("someNbr", 1235d, EQUAL_TO, 1235d, NBR)
    description

    the name/description of the KPI which will appear on the validation report.

    kpiValue

    the value for this KPI

    thresholdType

    the type of threshold (SUPERIOR_THAN, INFERIOR_THAN or EQUAL_TO)

    An enumeration which represents the type of threshold to use (EQUAL_TO, SUPERIOR_THAN or INFERIOR_THAN)

  • diff --git a/docs/com/spark_helper/package.html b/docs/com/spark_helper/package.html index 0912242..60ce9ba 100644 --- a/docs/com/spark_helper/package.html +++ b/docs/com/spark_helper/package.html @@ -90,14 +90,31 @@

    A facility which deals with usual date needs (wrapper around joda-time).

    The goal is to remove the maximum of highly used low-level code from your spark job and replace it with methods fully tested whose name is self-explanatory/readable.

    A few examples:

    import com.spark_helper.DateHelper

    DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101")
    DateHelper.today // "20170310"
    DateHelper.yesterday // "20170309"
    DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327"
    DateHelper.now("HH:mm") // "10:24"
    DateHelper.currentTimestamp // "1493105229736"
    DateHelper.nDaysBefore(3) // "20170307"
    DateHelper.nDaysAfterDate(3, "20170307") // "20170310"
    DateHelper.nextDay("20170310") // "20170311"
    DateHelper.nbrOfDaysSince("20170302") // 8
    DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5
    DateHelper.dayOfWeek("20160614") // 2

    import com.spark_helper.DateHelper._

    2.daysAgo // "20170308"
    "20161230" to "20170101" // List("20161230", "20161231", "20170101")
    3.daysBefore("20170310") // "20170307"
    5.daysAfter // "20170315"
    4.daysAfter("20170310") // "20170314"
    "20170302".isCompliantWith("yyyyMMdd")
    "20170310".nextDay // "20170311"
    "20170310".previousDay // "20170309"
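    For illustration, a small sketch of how these helpers typically combine in a daily job; the folder layout below is made up:

    import com.spark_helper.DateHelper

    // Builds the dated input folders of the last 7 days:
    val days = DateHelper.daysBetween(DateHelper.nDaysBefore(7), DateHelper.yesterday)
    val inputPaths = days.map(day => s"hdfs://my/input/folder/$day")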

    Source DateHelper

    Since

    2017-02

  • The goal is to remove the maximum of highly used low-level code from your spark job and replace it with methods fully tested whose name is self-explanatory/readable.

    For instance, one doesn't want to remove a file from hdfs using 3 lines of code and thus could instead just use HdfsHelper.deleteFile("my/hdfs/file/path.csv").

    A few examples:

    import com.spark_helper.HdfsHelper

    // A bunch of methods wrapping the FileSystem API, such as:
    HdfsHelper.fileExists("my/hdfs/file/path.txt") // HdfsHelper.folderExists("my/hdfs/folder")
    HdfsHelper.listFileNamesInFolder("my/folder/path") // List("file_name_1.txt", "file_name_2.csv")
    HdfsHelper.fileModificationDate("my/hdfs/file/path.txt") // "20170306"
    HdfsHelper.nbrOfDaysSinceFileWasLastModified("my/hdfs/file/path.txt") // 3
    HdfsHelper.deleteFile("my/hdfs/file/path.csv") // HdfsHelper.deleteFolder("my/hdfs/folder")
    HdfsHelper.moveFolder("old/path", "new/path") // HdfsHelper.moveFile("old/path.txt", "new/path.txt")
    HdfsHelper.createEmptyHdfsFile("/some/hdfs/file/path.token") // HdfsHelper.createFolder("my/hdfs/folder")

    // File content helpers:
    HdfsHelper.compressFile("hdfs/path/to/uncompressed_file.txt", classOf[GzipCodec])
    HdfsHelper.appendHeader("my/hdfs/file/path.csv", "column0,column1")

    // Some Xml/Typesafe helpers for hadoop as well:
    HdfsHelper.isHdfsXmlCompliantWithXsd("my/hdfs/file/path.xml", getClass.getResource("/some_xml.xsd"))
    HdfsHelper.loadXmlFileFromHdfs("my/hdfs/file/path.xml")

    // Very handy to load a config (typesafe format) stored on hdfs at the beginning of a spark job:
    HdfsHelper.loadTypesafeConfigFromHdfs("my/hdfs/file/path.conf"): Config

    // In order to write a small amount of data in a file on hdfs without the whole spark stack:
    HdfsHelper.writeToHdfsFile(Array("some", "relatively small", "text"), "/some/hdfs/file/path.txt")
    // or:
    import com.spark_helper.HdfsHelper._
    Array("some", "relatively small", "text").writeToHdfs("/some/hdfs/file/path.txt")
    "hello world".writeToHdfs("/some/hdfs/file/path.txt")

    // Deletes all files/folders in "hdfs/path/to/folder" for which the timestamp is older than 10 days:
    HdfsHelper.purgeFolder("hdfs/path/to/folder", 10)
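    For illustration, the kind of raw FileSystem boilerplate such a one-liner replaces (a minimal sketch, not the helper's actual code):

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}

    // The "3 lines of code" equivalent of HdfsHelper.deleteFile("my/hdfs/file/path.csv"):
    val fs = FileSystem.get(new Configuration())
    fs.delete(new Path("my/hdfs/file/path.csv"), false) // non-recursive: it's a single file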

    Source HdfsHelper

    Since

    2017-02

    To do

    Create a touch method

  • A logger dedicated to Spark jobs.

    It's a simple logger/report which contains a report that one can update from the driver and a success state. The idea is to persist job execution logs and errors (and forget about grepping unreadable yarn logs).

    It's designed for periodic spark jobs (handles storage and purge of logs) and provides a way to handle kpi validation.

    Logs are stored on the go, which means one can have direct real-time access to the job logs/status and its current state (which can otherwise be a pain if it means going through yarn logs, or even, for certain production environments, going through additional layers of software logs to get to yarn logs).

    One of the issues this logger aims at tackling is the handling of exceptions thrown during the pipeline, for which one might still want to perform a few actions before letting the job crash. The idea is thus to surround (driver side) a Spark pipeline within a try catch and redirect the exception to the logger for a clean logging.

    This is a "driver-only" logger and is not intended at logging concurrent -actions from executors.

    Produced reports can easily be inserted in a notification email whenerver +actions from executors.

    Produced reports can easily be inserted in a notification email whenever the job fails, which saves a lot of time to maintainers operating on heavy -production environements.

    The produced persisted report is also a way for downstream jobs to know the +production environments.

    The produced persisted report is also a way for downstream jobs to know the status of their input data.

    Let's go through a simple Spark job example monitored with this Monitor facility:

    Monitor.setTitle("My job title")
     Monitor.addDescription(
    ...
        Test("Nbr of output records", processedData.count(), SUPERIOR_THAN, 10e6d, NBR),
        Test("Some pct of invalid output", your_complex_kpi, INFERIOR_THAN, 3, PCT)
      ),
      "My pipeline description"
    )

    if (outputIsValid)
    ...

    } catch {
      case iie: InvalidInputException =>
        Monitor.error(iie, "My pipeline description", diagnostic = "No input data!")
      case e: Throwable =>
        Monitor.error(e, "My pipeline description") // whatever unexpected error
    }

    if (Monitor.isSuccess()) {
    ...
    }

    // HDFS (this saves the logs in the folder set with Monitor.setLogFolder):
    Monitor.store()

    // At the end of the job, if the job isn't successful, you might want to
    // crash it (for instance to get a notification from your scheduler):
    if (!Monitor.isSuccess()) throw new Exception() // or send an email, or ...

    At any time during the job, logs can be accessed from file path/to/log/folder/current.ongoing

    If we were to read the stored report after this simple pipeline, here are ...

    My job description (whatever you want); for instance:
    Documentation: https://github.com/xavierguihot/spark_helper
    [10:23] Beginning
    [10:23-10:23] My pipeline description: failed
      Diagnostic: No input data!
        org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://my/hdfs/input/path
        at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285)
    ...

    My job description (whatever you want); for instance:
    Documentation: https://github.com/xavierguihot/spark_helper
    [10:23] Beginning
    [10:23-10:36] My pipeline description: failed
        java.lang.NumberFormatException: For input string: "a"
        java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
        java.lang.Integer.parseInt(Integer.java:492)
        ...

    [10:36] Duration: 00:13:47

    Another scenario, successful spark pipeline and KPIs are valid; all good!:

              My job title

    My job description (whatever you want); for instance:
    Documentation: https://github.com/xavierguihot/spark_helper
    [10:23] Beginning
    [10:23-10:41] My pipeline description: success
       KPI: Nbr of output records
         Value: 14669071.0
         Must be superior than 10000000.0
    @@ -268,7 +286,7 @@ 

    [10:41-10:42] My second pipeline description: success [10:42] Duration: 00:19:23

    Source Monitor

    Since

    2017-02

    To do

    would a State monad be appropriate?

  • A facility to deal with RDD/file manipulations based on the Spark API.

    The goal is to remove the maximum of highly used low-level code from your spark job and replace it with methods fully tested whose name is self-explanatory/readable.

    A few examples:

    import com.spark_helper.SparkHelper._

    // Same as rdd.saveAsTextFile("path"), but the result is a single file (while
    // keeping the processing distributed):
    rdd.saveAsSingleTextFile("/my/output/file/path.txt")
    rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec])

    // Same as sc.textFile("path"), but instead of reading one record per line (by
    // splitting the input with \n), it splits the file in records based on a custom
    // delimiter. This way, xml, json, yml or any multi-line record file format can
    // be used with Spark:
    sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance

    // Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options:
    rdd.flatten

    // Equivalent to sc.textFile(), but each line is tupled with its file path:
    sc.textFileWithFileName("/my/input/folder/path")
    // which produces:
    // RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"),
    //     ("folder/file_2.txt", "record1fromfile2"), ...)

    // In the given folder, this generates one file per key in the given key/value
    // RDD. Within each file (named from the key) are all values for this key:
    rdd.saveAsTextFileByKey("/my/output/folder/path")

    // Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)):
    rdd.partialMap { case a if a % 2 == 0 => 2 * a }

    // For when input files contain commas and textFile can't handle it:
    sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt"))
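    For illustration, one way such an extension can be written as an implicit class; this is a sketch, not the library's actual code, and the names PartialMapSketch/partialMapSketch are made up:

    import scala.reflect.ClassTag
    import org.apache.spark.rdd.RDD

    object PartialMapSketch {
      implicit class PartialMapRDD[T](val rdd: RDD[T]) extends AnyVal {
        // Applies the partial function where it is defined, leaves other elements unchanged:
        def partialMapSketch(pf: PartialFunction[T, T])(implicit ct: ClassTag[T]): RDD[T] =
          rdd.map(x => if (pf.isDefinedAt(x)) pf(x) else x)
      }
    }
    // usage: rdd.partialMapSketch { case a if a % 2 == 0 => 2 * a }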

    Source SparkHelper

    Since

    2017-02

    To do

    sc.parallelize[T](elmts: T*) instead of sc.parallelize[T](elmts: Array[T])

  • diff --git a/docs/org/apache/spark/TextFileOverwrite$.html (new file)

    object TextFileOverwrite (package org.apache.spark)
    def textFile(paths: Seq[String], minPartitions: Int, sc: SparkContext): RDD[String]
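    For illustration, a sketch of calling this helper directly; the SparkContext extension shown in the package overview presumably delegates to it, and using sc.defaultMinPartitions here is an assumption:

    import org.apache.spark.{SparkContext, TextFileOverwrite}
    import org.apache.spark.rdd.RDD

    // Reads several explicitly listed files (even ones whose names contain commas)
    // into a single RDD of lines:
    def readFiles(sc: SparkContext): RDD[String] =
      TextFileOverwrite.textFile(
        Seq("path/hello,world.txt", "path/hello_world.txt"),
        minPartitions = sc.defaultMinPartitions,
        sc
      )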

        + +
        +
        + +
        + +
        + + + + + + diff --git a/docs/package.html b/docs/package.html index b941069..a6a120c 100644 --- a/docs/package.html +++ b/docs/package.html @@ -87,6 +87,23 @@

        +
      3. + + +

        + + + package + + + org + +

        + + Permalink + + +
      4. diff --git a/src/main/scala/com/spark_helper/DateHelper.scala b/src/main/scala/com/spark_helper/DateHelper.scala index d393718..229a648 100644 --- a/src/main/scala/com/spark_helper/DateHelper.scala +++ b/src/main/scala/com/spark_helper/DateHelper.scala @@ -15,14 +15,31 @@ import scala.util.Try * A few examples: * * {{{ - * assert(DateHelper.daysBetween("20161230", "20170101") == List("20161230", "20161231", "20170101")) - * assert(DateHelper.today() == "20170310") // If today's "20170310" - * assert(DateHelper.yesterday() == "20170309") // If today's "20170310" - * assert(DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") == "170327") - * assert(DateHelper.now("HH:mm") == "10:24") - * assert(DateHelper.currentTimestamp() == "1493105229736") - * assert(DateHelper.nDaysBefore(3) == "20170307") // If today's "20170310" - * assert(DateHelper.nDaysAfterDate(3, "20170307") == "20170310") + * import com.spark_helper.DateHelper + * + * DateHelper.daysBetween("20161230", "20170101") // List("20161230", "20161231", "20170101") + * DateHelper.today // "20170310" + * DateHelper.yesterday // "20170309" + * DateHelper.reformatDate("20170327", "yyyyMMdd", "yyMMdd") // "170327" + * DateHelper.now("HH:mm") // "10:24" + * DateHelper.currentTimestamp // "1493105229736" + * DateHelper.nDaysBefore(3) // "20170307" + * DateHelper.nDaysAfterDate(3, "20170307") // "20170310" + * DateHelper.nextDay("20170310") // "20170311" + * DateHelper.nbrOfDaysSince("20170302") // 8 + * DateHelper.nbrOfDaysBetween("20170327", "20170401") // 5 + * DateHelper.dayOfWeek("20160614") // 2 + * + * import com.spark_helper.DateHelper._ + * + * 2.daysAgo // "20170308" + * "20161230" to "20170101" // List("20161230", "20161231", "20170101") + * 3.daysBefore("20170310") // "20170307" + * 5.daysAfter // "20170315" + * 4.daysAfter("20170310") // "20170314" + * "20170302".isCompliantWith("yyyyMMdd") + * "20170310".nextDay // "20170311" + * "20170310".previousDay // "20170309" * }}} * * Source = 0, - "the purgeAge provided \"" + purgeAge.toString + "\" must be superior to 0.") + "the purgeAge provided \"" + purgeAge.toString + "\" must be superior to 0." + ) hdfs .listStatus(new Path(folderPath)) diff --git a/src/main/scala/com/spark_helper/Monitor.scala b/src/main/scala/com/spark_helper/Monitor.scala index f5eeeca..060787c 100644 --- a/src/main/scala/com/spark_helper/Monitor.scala +++ b/src/main/scala/com/spark_helper/Monitor.scala @@ -534,7 +534,8 @@ object Monitor { require( logDirectory.nonEmpty, "to save the report, please specify the log folder using " + - "Monitor.setLogFolder(\"hdfs/path/to/log/folder\")") + "Monitor.setLogFolder(\"hdfs/path/to/log/folder\")" + ) } } diff --git a/src/main/scala/com/spark_helper/SparkHelper.scala b/src/main/scala/com/spark_helper/SparkHelper.scala index f87edf9..9d05d96 100644 --- a/src/main/scala/com/spark_helper/SparkHelper.scala +++ b/src/main/scala/com/spark_helper/SparkHelper.scala @@ -24,17 +24,37 @@ import scala.util.Random * A few examples: * * {{{ - * // Same as sc.saveAsTextFile(path), but the result is a single file: + * import com.spark_helper.SparkHelper._ + * + * // Same as rdd.saveAsTextFile("path"), but the result is a single file (while + * // keeping the processing distributed): * rdd.saveAsSingleTextFile("/my/output/file/path.txt") - * // Same as SparkContext.textFile, but instead of reading one record per - * // line, it reads records spread over several lines. 
- * // This way, xml, json, yml or any multi-line record file format can be used - * // with Spark: - * SparkHelper.textFileWithDelimiter("/my/input/folder/path", sparkContext, "---\n") - * // Same as SparkContext.textFile, but instead of returning an RDD of - * // records, it returns an RDD of tuples containing both the record and the - * // path of the file it comes from: - * SparkHelper.textFileWithFileName("folder", sparkContext) + * rdd.saveAsSingleTextFile("/my/output/file/path.txt", classOf[BZip2Codec]) + * + * // Same as sc.textFile("path"), but instead of reading one record per line (by + * // splitting the input with \n), it splits the file in records based on a custom + * // delimiter. This way, xml, json, yml or any multi-line record file format can + * // be used with Spark: + * sc.textFile("/my/input/folder/path", "---\n") // for a yml file for instance + * + * // Equivalent to rdd.flatMap(identity) for RDDs of Seqs or Options: + * rdd.flatten + * + * // Equivalent to sc.textFile(), but for each line is tupled with its file path: + * sc.textFileWithFileName("/my/input/folder/path") + * // which produces: + * // RDD(("folder/file_1.txt", "record1fromfile1"), ("folder/file_1.txt", "record2fromfile1"), + * // ("folder/file_2.txt", "record1fromfile2"), ...) + * + * // In the given folder, this generates one file per key in the given key/value + * // RDD. Within each file (named from the key) are all values for this key: + * rdd.saveAsTextFileByKey("/my/output/folder/path") + * + * // Concept mapper (the following example transforms RDD(1, 3, 2, 7, 8) into RDD(1, 3, 4, 7, 16)): + * rdd.partialMap { case a if a % 2 == 0 => 2 * a } + * + * // For when input files contain commas and textFile can't handle it: + * sc.textFile(Seq("path/hello,world.txt", "path/hello_world.txt")) * }}} * * Source com.github.xavierguihot spark_helper - v1.1.1 + 2.0.0 ``` @@ -330,7 +328,7 @@ allprojects { } dependencies { - compile 'com.github.xavierguihot:spark_helper:v1.1.1' + compile 'com.github.xavierguihot:spark_helper:2.0.0' } ``` diff --git a/build.sbt b/build.sbt index bc2ebbc..15d0838 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "spark_helper" -version := "1.1.1" +version := "2.0.0" scalaVersion := "2.11.12"