Skip to content

Commit f625e9d

Browse files
committed
Using dataframe to write JSON
1 parent 3d38428 commit f625e9d

File tree

12 files changed

+342
-43
lines changed

12 files changed

+342
-43
lines changed

Diff for: build.sbt

+3-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ lazy val core = (project in file(".")).
3030
mainClass in assembly := Some("edu.vanderbilt.accre.stackex.StackExApp"),
3131
libraryDependencies ++= Seq(
3232
"org.apache.spark" %% "spark-core" % "1.6.2" % "provided",
33-
"org.apache.spark" %% "spark-sql" % "1.6.2" % "provided"
33+
"org.apache.spark" %% "spark-sql" % "1.6.2" % "provided",
34+
"net.sourceforge.htmlcleaner" % "htmlcleaner" % "2.18",
35+
"com.databricks" %% "spark-xml" % "0.4.1"
3436
)
3537
).
3638
dependsOn(util)

Diff for: run_spark.sh

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
#!/bin/bash
22

33
if [ $# -ne 0 ]; then
4-
echo $0: "usage: ./run_spark.sh input"
4+
echo $0: "usage: ./run_spark.sh"
55
exit 1
66
fi
77

88
echo $SPARK_HOME
99

10-
input1=src/main/resources/Posts.xml
11-
output=output
10+
input1=src/test/resources/Posts.xml
11+
output=output/$(date "+%Y-%m-%d_%H.%M.%S")
1212

1313
echo Reading input from $input1
1414
echo Writing output to $output
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
package edu.vanderbilt.accre.stackex

import scala.xml.{NodeSeq, XML}
import scala.util.Try

/**
 * A single Stack Exchange post parsed from one XML `<row/>` element of a
 * Posts.xml data dump.
 *
 * Created by arnold-jr on 12/22/16.
 *
 * @param id         post id ("Id" attribute); Int.MinValue when missing or unparsable
 * @param postTypeId post type ("PostTypeId" attribute); Int.MinValue when unparsable
 * @param body       plain text extracted from the HTML "Body" attribute
 * @param score      post score ("Score" attribute); Int.MinValue when unparsable
 * @param tags       tag names extracted from the "Tags" attribute
 */
case class Post(id: Int,
                postTypeId: Int,
                body: String,
                score: Int,
                tags: List[String])

object Post {

  // Column names used when converting an RDD[Post] into a DataFrame
  // (order must match the constructor parameter order above).
  val fieldNames = List("Id", "PostTypeId", "Body", "Score", "Tags")

  /**
   * Parses a possibly malformed XML string and returns an XML element.
   *
   * Malformed input yields NodeSeq.Empty instead of throwing, so callers
   * never see a parse exception.
   *
   * @param line one line of Posts.xml, expected to hold a single <row/>
   * @return parsed XML node, or NodeSeq.Empty on parse failure
   */
  private def loadString(line: String): NodeSeq =
    Try(XML.loadString(line)) getOrElse NodeSeq.Empty

  /**
   * Gets attribute values from an XML NodeSeq.
   *
   * @param elem XML NodeSeq constructed from a "row" tag
   * @param attr attribute identifier (without the leading '@')
   * @return attribute text payload, or "" when the attribute is absent
   */
  def getAttribute(elem: NodeSeq)(attr: String): String = (elem \ ("@" + attr)).text

  /**
   * Builds a Post from one raw XML line.
   *
   * Was `apply(line: String)()`: the trailing empty parameter list served no
   * purpose and made `Post(line)` depend on (deprecated) auto-application;
   * it has been removed, which is backward compatible for `Post(line)` callers.
   *
   * Unparsable numeric attributes map to Int.MinValue (see getInt in the
   * package object), which downstream code uses to filter malformed rows.
   */
  def apply(line: String): Post = {
    val elem = loadString(line)
    val getAttr = getAttribute(elem)(_)

    Post(getInt(getAttr("Id")),
      getInt(getAttr("PostTypeId")),
      getTextFromHtml(getAttr("Body")),
      getInt(getAttr("Score")),
      getTags(getAttr("Tags"))
    )
  }

}

Diff for: src/main/scala-2.10/edu/vanderbilt/accre/stackex/StackExApp.scala

+45-13
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
1-
/** StackExApp.scala
2-
* Created by arnold-jr on 11/8/16.
3-
*/
41
package edu.vanderbilt.accre.stackex
52

63
import edu.vanderbilt.accre.xmltojson.XMLToJSONConverter
4+
import org.apache.spark.sql.types.{StringType, StructField, StructType}
75
import org.apache.spark.{SparkConf, SparkContext}
6+
import org.apache.spark.sql.SQLContext
87

98

9+
/** StackExApp.scala
10+
* Created by arnold-jr on 11/8/16.
11+
*/
12+
1013
object StackExApp {
1114

1215
def parseArgs(args: Array[String]) = {
@@ -25,24 +28,53 @@ object StackExApp {
2528

2629
val conf = new SparkConf()
2730
.setAppName("Stack-Ex Application")
31+
val sc = new SparkContext(conf)
2832

2933

30-
val sc = new SparkContext(conf)
34+
def xmlToCustomJson() = {
35+
// Creates a new RDD with one XML element per line
36+
val postsXML = sc.textFile(postsFile)
37+
38+
// Specifies how to convert the data
39+
val attributeMapper = Map(
40+
"Body" -> getTextFromHtml,
41+
"Tags" -> getTags
42+
)
3143

32-
val postsXML = sc.textFile(postsFile)
44+
val converter = XMLToJSONConverter(attributeMapper)
3345

34-
val fString = (s: String) => s
35-
val converter = XMLToJSONConverter(Map("Body" -> fString))
46+
val postsJSON = postsXML
47+
.map(line => converter.xmlToJson(line))
3648

37-
val postsJSON = postsXML
38-
.map(line => converter.xmlToJson(line))
49+
if (true) {
50+
(postsJSON take 10) foreach println
51+
}
3952

40-
if (true) {
41-
(postsJSON take 10) foreach println
53+
postsJSON.saveAsTextFile(outputFile)
54+
55+
}
56+
57+
58+
def writeXMLToJSON() = {
59+
val sqlContext = new SQLContext(sc)
60+
61+
import sqlContext.implicits._
62+
63+
// Creates a new DataFrame with one XML element per line
64+
val df = sc.textFile(postsFile)
65+
.map(line => Post(line))
66+
.filter(p => p.id != Int.MinValue)
67+
.toDF(Post.fieldNames: _*)
68+
69+
val postsJSON = df.toJSON
70+
71+
postsJSON take 5 foreach println
72+
73+
postsJSON.saveAsTextFile(outputFile)
4274
}
4375

44-
// Writes output to file
45-
postsJSON.saveAsTextFile(outputFile)
76+
writeXMLToJSON()
77+
4678

4779
sc.stop()
4880

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
package edu.vanderbilt.accre

import org.htmlcleaner.{HtmlCleaner, HtmlNode, TagNode, TagNodeVisitor}
import org.apache.commons.lang.StringEscapeUtils.escapeHtml

import scala.util.Try

/**
 * Shared helpers for the stackex application: HTML-to-text extraction,
 * lenient integer parsing, and tag-list extraction.
 *
 * Created by arnold-jr on 12/22/16.
 */
package object stackex {

  // Visitor that prunes every element whose tag is NOT in includedTags.
  // NOTE(review): removeFromTree() mutates the tree while traverse() walks it;
  // this relies on htmlcleaner tolerating removal during traversal — confirm
  // against the htmlcleaner docs before restructuring.
  val tagNodeVisitor = new TagNodeVisitor {

    // Formatting-level tags whose (text) content is kept; everything else
    // (e.g. <a>, <code>) is removed together with its content.
    val includedTags = List("b", "blockquote", "dl", "dt", "em", "h1", "h2",
      "h3", "i", "li", "ol", "p", "strong", "ul")
    override def visit(tagNode: TagNode, htmlNode: HtmlNode): Boolean = {
      htmlNode match {
        case t: TagNode =>
          if (!(includedTags contains t.getName)) {
            t.removeFromTree()
          }
        case _ => // text / comment nodes are left untouched
      }
      // returning true continues the traversal over remaining nodes
      true
    }
  }

  // Lenient integer parse: Int.MinValue is the sentinel for "unparsable",
  // which downstream code (e.g. StackExApp's filter) relies on.
  def getInt(s: String): Int = Try(s.toInt) getOrElse Int.MinValue


  // Extracts visible text from an HTML fragment, dropping tags outside
  // includedTags.
  val getTextFromHtml: String => String = (html: String) => {
    val cleaner = new HtmlCleaner
    // presumably clean() always wraps the fragment so a "body" element exists;
    // NOTE(review): the bare (0) index would throw if it ever did not — confirm.
    val rootNode = cleaner.clean(html).getElementsByName("body",false)(0)

    // Prunes the tree
    rootNode.traverse(tagNodeVisitor)

    rootNode.getText.toString
  }

  // Escapes HTML entities in raw text (commons-lang escapeHtml).
  val getFullText = (text: String) => escapeHtml(text)

  // Splits a Tags payload such as "<machine-learning><deep-learning>" into a
  // list of tag names: the text is HTML-escaped first so each tag appears as
  // &lt;name&gt;, then the lookaround pattern captures the name between them.
  // NOTE(review): if the input is ALREADY escaped ("&lt;...&gt;"), escapeHtml
  // double-escapes the '&' (-> "&amp;lt;") and the pattern matches nothing —
  // verify all callers pass unescaped text (Post.apply does, since XML
  // attribute values are decoded by the parser).
  val getTags: String => List[String] = (text: String) => {
    val tagPattern = "(?<=&lt;)\\S+?(?=&gt;)".r
    (tagPattern findAllIn escapeHtml(text)) toList
  }

}

Diff for: src/main/scala-2.10/scratch00.sc

-5
This file was deleted.

Diff for: src/test/scala-2.10/StringEscapeUtils.sc

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
import org.apache.commons.lang.StringEscapeUtils.{escapeHtml, unescapeHtml}

// Scratch worksheet: explore how commons-lang HTML (un)escaping interacts
// with the tag-extraction regex used by stackex.getTags.

// Raw angle-bracket tags become entity-encoded.
escapeHtml("<neural-networks><definitions>")

// Already-encoded input gets double-escaped ('&' -> "&amp;").
escapeHtml("&lt;neural&gt;")

val sample = "&lt;neural-networks&gt;&lt;definitions&gt;"

val escaped = escapeHtml(sample)
val unescaped = unescapeHtml(sample)

// Lookaround pattern: captures the token between &lt; and &gt;.
val pattern = "(?<=&lt;)\\S+?(?=&gt;)".r
pattern.findAllIn(sample).toList

pattern.findAllIn(escaped).toList

pattern.findAllIn(unescaped).toList

Diff for: src/test/scala-2.10/TestStackExApp.scala

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
import edu.vanderbilt.accre.stackex._
import org.scalatest.WordSpec

/**
 * Unit tests for the stackex package-object helpers (getTags, getTextFromHtml).
 *
 * Created by arnold-jr on 12/21/16.
 */

class TestStackExApp extends WordSpec {

  // Sample Posts.xml row captured from a Stack Exchange dump.
  // NOTE(review): this fixture is not referenced by any test below — either
  // use it (e.g. to test Post.apply) or remove it.
  val line =
    """<row Id="2435" PostTypeId="1" CreationDate="2016-12-06T19:50:13.853"
      |Score="-1" ViewCount="12" Body="&lt;p&gt;If I have a dataset of images,
      |and I extract all cnn feature vectors from them.&#xA;After that I generate
      |the pca model of these features by doing:&lt;/p&gt; &#xA; &#xA; &lt;pre&gt;
      |&lt;code&gt;pca.fit(ALL_features)&#xA; &lt;/code&gt; &lt;/pre&gt; &#xA;
      |&#xA; &lt;p&gt;IF I have a new image and I need to check the similarity
      |between this image and the whole dataset, what I have to do?&lt;/p&gt;
      |&#xA; &#xA; &lt;ol&gt; &#xA; &lt;li&gt;Extract cnn features from this
      |image.&lt;/li&gt; &#xA; &lt;li&gt;How to use the previous pca
      |model?&lt;/li&gt; &#xA; &lt;li&gt;How to check the similarity between
      |the dataset features and the new image features?&lt;/li&gt; &#xA;
      |&lt;/ol&gt; &#xA; &#xA; &lt;p&gt;Is by doing this? or how?&lt;/p&gt;
      |&#xA; &#xA; &lt;pre&gt; &lt;code&gt;self.pca.transform(self.db_feats)&#xA;
      |&lt;/code&gt; &lt;/pre&gt; &#xA;" OwnerUserId="1644"
      |LastActivityDate="2016-12-06T19:50:13.853" Title="PCA pca.fit VS
      |pca.transform" Tags="&lt;machine-learning&gt; &lt;deep-learning&gt;
      |&lt;image-recognition&gt; &lt;conv-neural-network&gt;" AnswerCount="0"
      |CommentCount="0"/>""".stripMargin

  // NOTE(review): the quadruple quotes make this string BEGIN and END with a
  // literal '"' character, and the payload is already entity-encoded.
  // getTags escapes its input again (escapeHtml), turning "&lt;" into
  // "&amp;lt;", so the pattern is unlikely to match — this expectation looks
  // inconsistent with the getTags implementation; confirm which is intended.
  val tagText = """"&lt;machine-learning&gt;&lt;deep-learning&gt;
    |&lt;image-recognition&gt;&lt;conv-neural-network&gt;""""

  "getTags" when {
    "applied to Tag text" should {
      "return a list of tags as String" in {
        assert(getTags(tagText) ==
          List("machine-learning", "deep-learning", "image-recognition",
            "conv-neural-network")
        )
      }
    }
  }

  "getTextFromHtml" when {
    // Excluded tags (<a>) should be dropped together with their content.
    "passed some nested html" should {
      "return the body text in the correct order" in {
        assert(
          getTextFromHtml("<p><em>Emphatic</em><a>excluded </a> parallel</p>") ==
            "Emphatic parallel"
        )
      }
    }
    "passed a valid html snippet" should {
      "return the body text" in {
        assert(getTextFromHtml("<p>some text.</p>") == "some text.")
      }
    }
    // NOTE(review): clause description duplicates the first one above; the
    // full registered test names still differ, but consider renaming for
    // clearer reports (e.g. "passed identically nested html").
    "passed some nested html" should {
      "return the body text" in {
        assert(getTextFromHtml("<p><p>some text.</p></p>") == "some text.")
      }
    }
    "passed some heterogeneous nested html" should {
      "return the body text" in {
        assert(getTextFromHtml("<i><p>some text.</p></i>") == "some text.")
      }
    }
    "passed some heterogeneous 3-level-nested html" should {
      "return the body text" in {
        assert(getTextFromHtml("<p><em><i>some text.</i></em></p>") ==
          "some text.")
      }
    }
    "passed some excluded html" should {
      "return the body text" in {
        assert(
          getTextFromHtml("<a>excluded text</a><p>some text.</p>") ==
            "some text."
        )
      }
    }
  }


}

Diff for: src/test/scala-2.10/databricks.sc

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Scratch worksheet: read an XML file through spark-xml.
// All fields are declared as nullable strings; spark-xml prefixes XML
// attributes with '_' by default, hence the leading underscores.
val customSchema = StructType(Array(
  StructField("_Id", StringType, nullable = true),
  StructField("_PostTypeId", StringType, nullable = true),
  // TODO(review): Stack Exchange dumps spell this attribute "ParentId"
  // ("_ParentID" would never match) — confirm against the dump schema.
  StructField("_ParentID", StringType, nullable = true),
  StructField("_AcceptedAnswerId", StringType, nullable = true),
  StructField("_CreationDate", StringType, nullable = true),
  StructField("_Score", StringType, nullable = true),
  StructField("_ViewCount", StringType, nullable = true),
  StructField("_Body", StringType, nullable = true),
  StructField("_OwnerUserId", StringType, nullable = true),
  StructField("_LastEditorUserId", StringType, nullable = true),
  StructField("_LastEditorDisplayName", StringType, nullable = true),
  StructField("_LastEditDate", StringType, nullable = true),
  StructField("_LastActivityDate", StringType, nullable = true),
  StructField("_CommunityOwnedDate", StringType, nullable = true),
  StructField("_ClosedDate", StringType, nullable = true),
  StructField("_Title", StringType, nullable = true),
  StructField("_Tags", StringType, nullable = true),
  StructField("_AnswerCount", StringType, nullable = true),
  StructField("_CommentCount", StringType, nullable = true),
  StructField("_FavoriteCount", StringType, nullable = true)
))

// FIX: was setMaster("local(1)") — not a valid Spark master URL; Spark
// requires the bracket form "local[N]" and fails at startup otherwise.
val sc = new SparkContext(new SparkConf().setMaster("local[1]").setAppName("foo"))
val sqlContext = new SQLContext(sc)
// NOTE(review): customSchema is defined but never applied; chain
// .schema(customSchema) here if the read should use it (and then rowTag
// should presumably be "row", not "book").
val df = sqlContext.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")
  .load("/Users/joshuaarnold/books.xml")

sc.stop()

0 commit comments

Comments
 (0)