From 746bc61a04bf031cc8841899218ac59d51fe826e Mon Sep 17 00:00:00 2001 From: David Smith Date: Fri, 29 Mar 2019 05:44:22 -0400 Subject: [PATCH 01/39] Initial switch to label propagation clustering. --- src/main/scala/PassimApp.scala | 36 ++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index c4160fb..318b9dd 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -1230,36 +1230,46 @@ transform($pageCol, if ( !hdfsExists(spark, clusterFname) ) { val pass = spark.read.parquet(passFname) - spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism) + // spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism) val passGraph = GraphFrame( pass.select('nid as "id", 'uid, 'gid, 'begin, 'end), pass.select('nid, explode('edges) as "eid") .groupBy("eid").agg(min("nid") as "src", max("nid") as "dst")) - passGraph.cache() + // passGraph.cache() - spark.sparkContext.setCheckpointDir(config.outputPath + "/tmp") - val cc = passGraph.connectedComponents.run() + val lp = passGraph.labelPropagation.maxIter(11).run() - val merge_spans = udf { (spans: Seq[Row]) => - PassFun.mergeSpansLR(0, spans.map { s => (Span(s.getInt(0), s.getInt(1)), 0L) }) - .map { _._1 } - } + // spark.sparkContext.setCheckpointDir(config.outputPath + "/tmp") + // val cc = passGraph.connectedComponents.run() + + // val merge_spans = udf { (spans: Seq[Row]) => + // PassFun.mergeSpansLR(0, spans.map { s => (Span(s.getInt(0), s.getInt(1)), 0L) }) + // .map { _._1 } + // } val clusters = - cc.groupBy("component", "uid") + lp.groupBy("label", "uid") .agg(merge_spans(collect_list(struct("begin", "end"))) as "spans") - .select('component as "cluster", 'uid, explode('spans) as "span") + .select('label as "cluster", 'uid, explode('spans) as "span") .select('cluster, 'uid, $"span.*") clusters.cache() + // val clusters = + // cc.groupBy("component", "uid") + // .agg(merge_spans(collect_list(struct("begin", "end"))) as "spans") + // .select('component as "cluster", 'uid, explode('spans) as "span") + // .select('cluster, 'uid, $"span.*") + // clusters.cache() + clusters.join(clusters.groupBy("cluster").agg(count("uid") as "size"), "cluster") .select('uid, 'cluster, 'size, 'begin, 'end) .write.parquet(clusterFname) - passGraph.unpersist() - cc.unpersist() - spark.conf.set("spark.sql.shuffle.partitions", corpus.rdd.getNumPartitions * 3) + // clusters.unpersist() + // passGraph.unpersist() + // cc.unpersist() + // spark.conf.set("spark.sql.shuffle.partitions", corpus.rdd.getNumPartitions * 3) } if ( !hdfsExists(spark, outFname) ) { From b330e73dc4abd0dd7d22a2cf4c87fdd2c88013c0 Mon Sep 17 00:00:00 2001 From: David Smith Date: Mon, 15 Apr 2019 08:07:46 -0400 Subject: [PATCH 02/39] Upgrade to spark 2.4.1 --- build.sbt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index c3b8427..17de0a8 100644 --- a/build.sbt +++ b/build.sbt @@ -6,9 +6,9 @@ scalaVersion := "2.11.8" resolvers += Resolver.mavenLocal -libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.0" -libraryDependencies += "org.apache.spark" %% "spark-graphx" % "2.4.0" -libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.0" +libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.1" +libraryDependencies += "org.apache.spark" %% "spark-graphx" % "2.4.1" +libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.1" resolvers 
+= "Spark Packages Repo" at "http://dl.bintray.com/spark-packages/maven" From fa2f49836c375d1f78fb01294c8b96c689de4f42 Mon Sep 17 00:00:00 2001 From: David Smith Date: Sat, 27 Apr 2019 10:48:06 -0400 Subject: [PATCH 03/39] Use --labelPropagation for label propagation clustering. --- src/main/scala/PassimApp.scala | 45 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 318b9dd..9c583c4 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -21,6 +21,7 @@ import org.graphframes._ case class Config(version: String = BuildInfo.version, boilerplate: Boolean = false, + labelPropagation: Boolean = false, n: Int = 5, minDF: Int = 2, maxDF: Int = 100, minRep: Int = 5, minAlg: Int = 20, gap: Int = 100, relOver: Double = 0.8, mergeDiverge: Double = 0.3, maxRep: Int = 10, context: Int = 0, @@ -957,6 +958,8 @@ transform($pageCol, val parser = new scopt.OptionParser[Config]("passim") { opt[Unit]("boilerplate") action { (_, c) => c.copy(boilerplate = true) } text("Detect boilerplate within groups.") + opt[Unit]("labelPropagation") action { (_, c) => + c.copy(labelPropagation = true) } text("Cluster with label propagation.") opt[Int]('n', "n") action { (x, c) => c.copy(n = x) } validate { x => if ( x > 0 ) success else failure("n-gram order must be > 0") } text("index n-gram features; default=5") @@ -1230,46 +1233,42 @@ transform($pageCol, if ( !hdfsExists(spark, clusterFname) ) { val pass = spark.read.parquet(passFname) - // spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism) + if ( !config.labelPropagation ) { + spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism) + } val passGraph = GraphFrame( pass.select('nid as "id", 'uid, 'gid, 'begin, 'end), pass.select('nid, explode('edges) as "eid") .groupBy("eid").agg(min("nid") as "src", max("nid") as "dst")) - // passGraph.cache() - - val lp = passGraph.labelPropagation.maxIter(11).run() - // spark.sparkContext.setCheckpointDir(config.outputPath + "/tmp") - // val cc = passGraph.connectedComponents.run() + val groups = if ( config.labelPropagation ) { + passGraph.labelPropagation.maxIter(11).run().withColumnRenamed("label", "cluster") + } else { + spark.sparkContext.setCheckpointDir(config.outputPath + "/tmp") + passGraph.connectedComponents.run().withColumnRenamed("component", "cluster") + } - // val merge_spans = udf { (spans: Seq[Row]) => - // PassFun.mergeSpansLR(0, spans.map { s => (Span(s.getInt(0), s.getInt(1)), 0L) }) - // .map { _._1 } - // } + val merge_spans = udf { (spans: Seq[Row]) => + PassFun.mergeSpansLR(0, spans.map { s => (Span(s.getInt(0), s.getInt(1)), 0L) }) + .map { _._1 } + } val clusters = - lp.groupBy("label", "uid") + groups.groupBy("cluster", "uid") .agg(merge_spans(collect_list(struct("begin", "end"))) as "spans") - .select('label as "cluster", 'uid, explode('spans) as "span") + .select('cluster, 'uid, explode('spans) as "span") .select('cluster, 'uid, $"span.*") clusters.cache() - // val clusters = - // cc.groupBy("component", "uid") - // .agg(merge_spans(collect_list(struct("begin", "end"))) as "spans") - // .select('component as "cluster", 'uid, explode('spans) as "span") - // .select('cluster, 'uid, $"span.*") - // clusters.cache() - clusters.join(clusters.groupBy("cluster").agg(count("uid") as "size"), "cluster") .select('uid, 'cluster, 'size, 'begin, 'end) .write.parquet(clusterFname) - // clusters.unpersist() - // 
passGraph.unpersist() - // cc.unpersist() - // spark.conf.set("spark.sql.shuffle.partitions", corpus.rdd.getNumPartitions * 3) + clusters.unpersist() + if ( !config.labelPropagation ) { + spark.conf.set("spark.sql.shuffle.partitions", corpus.rdd.getNumPartitions * 3) + } } if ( !hdfsExists(spark, outFname) ) { From 6da4bba93982c477ec3cfb8f7defa6c8a25b918a Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 21 May 2019 14:46:31 +0200 Subject: [PATCH 04/39] Pick a single source for boilerplate. --- src/main/scala/PassimApp.scala | 190 ++++++++++++++++++++++----------- 1 file changed, 129 insertions(+), 61 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index c4160fb..322541f 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -76,24 +76,24 @@ case class Span(val begin: Int, val end: Int) { case class Post(feat: Long, tf: Int, post: Int) -case class IdSeries(id: Long, series: Long) - case class PassAlign(id1: String, id2: String, s1: String, s2: String, b1: Int, e1: Int, n1: Int, b2: Int, e2: Int, n2: Int, matches: Int, score: Float) case class AlignedStrings(s1: String, s2: String, matches: Int, score: Float) -case class NewDoc(id: String, text: String, pages: Seq[Page], aligned: Boolean) - case class LinkedSpan(span: Span, links: ArrayBuffer[Long]) case class ExtentPair(seq1: Int, seq2: Int, begin1: Int, begin2: Int, end1: Int, end2: Int, tok1: Int, tok2: Int) case class WitInfo(start: Int, length: Int, begin: Int, text: String) +case class SpanPair(b1: Int, e1: Int, b2: Int, e2: Int) + case class LineInfo(start: Int, text: String) +case class NewDoc(id: String, text: String, pages: Seq[Page], span: SpanPair) + object PassFun { def increasingMatches(matches: Iterable[(Int,Int,Int)]): Array[(Int,Int,Int)] = { val in = matches.toArray.sorted @@ -387,6 +387,7 @@ transform($pageCol, } } + // TODO: Make minLines a parameter val alignedPassages = udf { (s1: String, s2: String) => var start = 0 var b1 = 0 @@ -459,43 +460,46 @@ transform($pageCol, p } - val splitDoc = udf { (id: String, text: String, pages: Seq[Row], - begin: Seq[Int], end: Seq[Int]) => + val splitDoc = udf { (id: String, text: String, pages: Seq[Row], spans: Seq[Row]) => val pp = if ( pages == null ) Array[Page]() // Try doesn't catch nulls else Try(pages.map(PassimApp.rowToPage).toArray).getOrElse(Array[Page]()) val reg = if ( pp.size == 0 ) Array[Region]() else pp(0).regions val docs = new ArrayBuffer[NewDoc] - if ( begin == null || begin.size <= 0 ) { - docs += NewDoc(id, text, pp, false) + if ( spans == null || spans.size <= 0 ) { + docs += NewDoc(id, text, pp, null) } else { var start = 0 var breg = 0 var ereg = 0 - for ( i <- 0 until begin.size ) { - if ( (begin(i) - start) >= 2 ) { - // Should check that this document is more than just a few whitespace characters - while ( ereg < reg.size && reg(ereg).start < begin(i) ) ereg += 1 - docs += NewDoc(id + "_" + start + "_" + begin(i), - text.substring(start, begin(i)), - subpage(pp, reg.slice(breg, ereg).map(_.offset(-start))), - false) - breg = ereg + for ( span <- spans ) { + span match { + case Row(b1: Int, e1: Int, b2: Int, e2: Int) => + if ( (b1 - start) >= 2 ) { + // Should check that this document is more than just a few whitespace characters + while ( ereg < reg.size && reg(ereg).start < b1 ) ereg += 1 + docs += NewDoc(id + "_" + start + "_" + b1, + text.substring(start, b1), + subpage(pp, reg.slice(breg, ereg).map(_.offset(-start))), + null) + breg = ereg + } + while ( ereg < reg.size && 
reg(ereg).start < e1 ) ereg += 1 + docs += NewDoc(id + "_" + b1 + "_" + e1, + text.substring(b1, e1), + subpage(pp, reg.slice(breg, ereg).map(_.offset(-b1))), + SpanPair(b1, e1, b2, e2)) + breg = ereg + start = e1 } - while ( ereg < reg.size && reg(ereg).start < end(i) ) ereg += 1 - docs += NewDoc(id + "_" + begin(i) + "_" + end(i), - text.substring(begin(i), end(i)), - subpage(pp, reg.slice(breg, ereg).map(_.offset(-begin(i)))), - true) - breg = ereg - start = end(i) } - if ( (text.size - end.last) >= 2 ) { + val lastend = spans.last.getInt(1) + if ( (text.size - lastend) >= 2 ) { if ( ereg < reg.size ) ereg = reg.size - docs += NewDoc(id + "_" + end.last + "_" + text.size, - text.substring(end.last, text.size), - subpage(pp, reg.slice(breg, ereg).map(_.offset(-end.last))), - false) + docs += NewDoc(id + "_" + lastend + "_" + text.size, + text.substring(lastend, text.size), + subpage(pp, reg.slice(breg, ereg).map(_.offset(-lastend))), + null) } } docs.toArray @@ -503,20 +507,62 @@ transform($pageCol, def boilerSplit(passages: DataFrame, raw: DataFrame): DataFrame = { import passages.sparkSession.implicits._ val pageField = if ( raw.columns.contains("pages") ) "pages" else "null" + val srcSpans = udf { (lines: Seq[Row]) => + val res = ListBuffer[SpanPair]() + var curb1 = -1 + var cure1 = -1 + var curb2 = -1 + var cure2 = -1 + for ( cur <- lines ) { + cur match { + case Row(b1: Int, len1: Int, b2: Int, len2: Int) => + val e1 = b1 + len1 + val e2 = b2 + len2 + if ( b1 > cure1 || b2 > cure2 ) { + if ( curb1 > -1 ) { + res += SpanPair(curb1, cure1, curb2, cure2) + } + curb1 = b1 + cure1 = e1 + curb2 = b2 + cure2 = e2 + } else { + cure1 = e1 + cure2 = e2 + } + } + } + if ( curb1 > -1 ) { + res += SpanPair(curb1, cure1, curb2, cure2) + } + res.toSeq + } + // TODO: Just pick one and split the destination document. BUT! If + // we point to a span in the source document, but the source + // document itself gets split, what do we do? We don't even need + // to assume errors in the alignment: If A copies two ads from + // different sources and then puts them side by side, and then B + // copies both ads, then B will be segmented, and point to two + // segments of A. Do we have to resolve this at the line level? 
passages - .select('id2 as "id", 'b2 as "begin", 'e2 as "end") + .groupBy("id", "start", "length") + .agg(max("wit") as "src") + .groupBy($"id", $"src.id" as "src") + .agg(sort_array(collect_list(struct($"start", $"length", + $"src.begin" as "sstart", length($"src.text") as "slength"))) as "lines") .groupBy("id") - .agg(mergeAligned(collect_list("begin"), collect_list("end")) as "spans") - .select('id, $"spans._1" as "begin", $"spans._2" as "end") + .agg(max(struct(size($"lines") as "count", $"src" as "id", $"lines")) as "src") + .select($"id", $"src.id" as "src", srcSpans($"src.lines") as "spans") .join(raw, Seq("id"), "right_outer") - .withColumn("subdoc", explode(splitDoc('id, 'text, expr(pageField), 'begin, 'end))) - .drop("begin", "end") - .withColumnRenamed("id", "docid") + .withColumn("subdoc", explode(splitDoc('id, 'text, expr(pageField), 'spans))) + .withColumn("src", when($"subdoc.span".isNull, null).otherwise(struct('src as "id", + $"subdoc.span.b2" as "start", $"subdoc.span.e2" - $"subdoc.span.b2" as "length"))) + .withColumn("doc", when($"subdoc.span".isNull, null).otherwise(struct('id, + $"subdoc.span.b1" as "start", $"subdoc.span.e1" - $"subdoc.span.b1" as "length"))) .withColumn("id", $"subdoc.id") .withColumn("text", $"subdoc.text") .withColumn("pages", $"subdoc.pages") - .withColumn("aligned", $"subdoc.aligned") - .drop("subdoc") + .drop("subdoc", "spans") } def clusterJoin(config: Config, clusters: DataFrame, corpus: DataFrame): DataFrame = { import clusters.sparkSession.implicits._ @@ -695,6 +741,18 @@ transform($pageCol, } def boilerPassages(config: Config, corpus: DataFrame): DataFrame = { import align.sparkSession.implicits._ + val lineRecord = udf { + (b1: Int, b2: Int, pairs: Seq[Row]) => + var off1 = b1 + var off2 = b2 + pairs.map { (p: Row) => + val s1 = p.getString(0) + val s2 = p.getString(1) + off2 += s2.length + off1 += s1.length + WitInfo(off2 - s2.length, s2.length, off1 - s1.length, s1) + } + } val alignStrings = makeStringAligner(config, openGap = 1) val metaFields = ListBuffer[String]() if ( corpus.columns.contains("date") ) metaFields += "date" @@ -703,9 +761,10 @@ transform($pageCol, .join(corpus.select('uid, col(config.id) as "id", col(config.text) as "text", struct(metaFields.toList.map(expr):_*) as "meta", 'termCharBegin, 'termCharEnd), "uid") - .withColumn("begin", 'termCharBegin('begin)) + .withColumn("begin", lineStart('text, 'termCharBegin('begin))) .withColumn("end", - when('end < size('termCharBegin), 'termCharBegin('end)).otherwise(length('text))) + lineStop('text, + when('end < size('termCharBegin), 'termCharBegin('end)).otherwise(length('text)))) .select('mid, struct('first, 'id, 'meta, 'begin, 'end, getPassage('text, 'begin, 'end) as "text") as "info") .groupBy("mid") @@ -719,6 +778,10 @@ transform($pageCol, "info[0].begin + pass._1.end as e1", "info[1].begin + pass._2.begin as b2", "info[1].begin + pass._2.end as e2") + .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", + explode(lineRecord('b1, 'b2, 'pairs)) as "wit") + .select('id, $"wit.start", $"wit.length", + struct('meta, 'src as "id", $"wit.begin", $"wit.text") as "wit") } def aggregateAlignments(config: Config, corpus: DataFrame, extents: DataFrame): DataFrame = { import align.sparkSession.implicits._ @@ -931,6 +994,26 @@ transform($pageCol, Math.max(0, Math.min(terms.size, end))).mkString(" ") } val getPassage = udf { (text: String, begin: Int, end: Int) => text.substring(begin, end) } + val lineStart = udf { (text: String, begin: Int) => + var start = begin + while ( start > 
0 && (begin - start) < 20 && text.charAt(start - 1) != '\n' ) { + start -= 1 + } + if ( start > 0 && text.charAt(start - 1) != '\n' ) + begin + else + start + } + val lineStop = udf { (text: String, end: Int) => + var stop = end + while ( stop < text.length && (stop - end) < 20 && text.charAt(stop - 1) != '\n' ) { + stop += 1 + } + if ( stop < text.length && text.charAt(stop - 1) != '\n' ) + end + else + stop + } def hdfsExists(spark: SparkSession, path: String) = { val hdfsPath = new Path(path) val fs = hdfsPath.getFileSystem(spark.sparkContext.hadoopConfiguration) @@ -944,7 +1027,7 @@ transform($pageCol, .set("spark.driver.maxResultSize", "4g") .registerKryoClasses(Array(classOf[Coords], classOf[Region], classOf[Span], classOf[Post], classOf[PassAlign], - classOf[TokText], classOf[IdSeries],classOf[ExtentPair])) + classOf[TokText], classOf[ExtentPair])) val spark = SparkSession .builder() @@ -1176,20 +1259,11 @@ transform($pageCol, } if ( config.boilerplate || config.docwise ) { - val pass = extents.boilerPassages(config, corpus) + if ( !hdfsExists(spark, passFname) ) { + extents.boilerPassages(config, corpus).write.parquet(passFname) + } + val pass = spark.read.parquet(passFname) if ( config.docwise ) { - val lineRecord = udf { - (b1: Int, b2: Int, pairs: Seq[Row]) => - var off1 = b1 - var off2 = b2 - pairs.map { (p: Row) => - val s1 = p.getString(0) - val s2 = p.getString(1) - off2 += s2.length - off1 += s1.length - WitInfo(off2 - s2.length, s2.length, off1 - s1.length, s1) - } - } val textLines = udf { (text: String) => val res = ListBuffer[LineInfo]() var off = 0 @@ -1199,11 +1273,7 @@ transform($pageCol, } res.toSeq } - pass - .select('id2 as "id", 'id1 as "id1", 'meta1 as "meta", - explode(lineRecord('b1, 'b2, 'pairs)) as "wit") - .select('id, $"wit.start", $"wit.length", - struct('meta, 'id1 as "id", $"wit.begin", $"wit.text") as "wit") + pass // should include target text offset to support later correction .groupBy("id", "start", "length") .agg(sort_array(collect_list("wit")) as "wits") .groupBy("id") @@ -1216,9 +1286,7 @@ transform($pageCol, .drop("tlines", "mvars", "variants") .write.format(config.outputFormat).save(outFname) } else { - pass.drop("pairs").write.parquet(passFname) - boilerSplit(spark.read.parquet(passFname), raw) - .write.format(config.outputFormat).save(outFname) + boilerSplit(pass, raw).write.format(config.outputFormat).save(outFname) } sys.exit(0) } From 6ede718fd5fb0f7060f709367213d1cc84833834 Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 21 May 2019 08:48:58 -0400 Subject: [PATCH 05/39] Upgrade to spark 2.4.3. --- build.sbt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index 17de0a8..822d08f 100644 --- a/build.sbt +++ b/build.sbt @@ -6,9 +6,9 @@ scalaVersion := "2.11.8" resolvers += Resolver.mavenLocal -libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.1" -libraryDependencies += "org.apache.spark" %% "spark-graphx" % "2.4.1" -libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.1" +libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.3" +libraryDependencies += "org.apache.spark" %% "spark-graphx" % "2.4.3" +libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.3" resolvers += "Spark Packages Repo" at "http://dl.bintray.com/spark-packages/maven" From e02765fc78311949471d2f86b8bc6719a1020fce Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 21 May 2019 15:48:36 +0200 Subject: [PATCH 06/39] Fix references to full doc. 
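Not part of the patch: a minimal, self-contained sketch of the bookkeeping the change below introduces, where every split piece records docspan (its character range in the full target document) and the reused piece also records srcspan (the matching range in the boilerplate source; null for unaligned pieces). Field names follow the patch; the ids, offsets, and the simplified NewDoc (no pages field) are illustrative assumptions only.

    case class Span(begin: Int, end: Int)
    case class NewDoc(id: String, text: String, docspan: Span, srcspan: Span)

    // Split `text` around one reused passage [b1, e1) copied from source
    // offsets [b2, e2). Ids follow splitDoc's id_begin_end convention; only
    // the middle piece points back at the source. Null (rather than Option)
    // mirrors the patch, where the struct must map onto a nullable column.
    def splitAround(id: String, text: String,
                    b1: Int, e1: Int, b2: Int, e2: Int): Seq[NewDoc] =
      Seq(
        NewDoc(s"${id}_0_${b1}", text.substring(0, b1), Span(0, b1), null),
        NewDoc(s"${id}_${b1}_${e1}", text.substring(b1, e1), Span(b1, e1), Span(b2, e2)),
        NewDoc(s"${id}_${e1}_${text.length}", text.substring(e1), Span(e1, text.length), null))

    // e.g. splitAround("d1", text, 120, 480, 88, 450) yields d1_0_120 (no src),
    // d1_120_480 (srcspan 88..450), and d1_480_<text.length> (no src).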
--- src/main/scala/PassimApp.scala | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 322541f..689413b 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -92,7 +92,7 @@ case class SpanPair(b1: Int, e1: Int, b2: Int, e2: Int) case class LineInfo(start: Int, text: String) -case class NewDoc(id: String, text: String, pages: Seq[Page], span: SpanPair) +case class NewDoc(id: String, text: String, pages: Seq[Page], docspan: Span, srcspan: Span) object PassFun { def increasingMatches(matches: Iterable[(Int,Int,Int)]): Array[(Int,Int,Int)] = { @@ -467,7 +467,7 @@ transform($pageCol, val reg = if ( pp.size == 0 ) Array[Region]() else pp(0).regions val docs = new ArrayBuffer[NewDoc] if ( spans == null || spans.size <= 0 ) { - docs += NewDoc(id, text, pp, null) + docs += NewDoc(id, text, pp, null, null) } else { var start = 0 var breg = 0 @@ -481,14 +481,14 @@ transform($pageCol, docs += NewDoc(id + "_" + start + "_" + b1, text.substring(start, b1), subpage(pp, reg.slice(breg, ereg).map(_.offset(-start))), - null) + Span(start, b1), null) breg = ereg } while ( ereg < reg.size && reg(ereg).start < e1 ) ereg += 1 docs += NewDoc(id + "_" + b1 + "_" + e1, text.substring(b1, e1), subpage(pp, reg.slice(breg, ereg).map(_.offset(-b1))), - SpanPair(b1, e1, b2, e2)) + Span(b1, e1), Span(b2, e2)) breg = ereg start = e1 } @@ -499,7 +499,7 @@ transform($pageCol, docs += NewDoc(id + "_" + lastend + "_" + text.size, text.substring(lastend, text.size), subpage(pp, reg.slice(breg, ereg).map(_.offset(-lastend))), - null) + Span(lastend, text.size), null) } } docs.toArray @@ -555,10 +555,12 @@ transform($pageCol, .select($"id", $"src.id" as "src", srcSpans($"src.lines") as "spans") .join(raw, Seq("id"), "right_outer") .withColumn("subdoc", explode(splitDoc('id, 'text, expr(pageField), 'spans))) - .withColumn("src", when($"subdoc.span".isNull, null).otherwise(struct('src as "id", - $"subdoc.span.b2" as "start", $"subdoc.span.e2" - $"subdoc.span.b2" as "length"))) - .withColumn("doc", when($"subdoc.span".isNull, null).otherwise(struct('id, - $"subdoc.span.b1" as "start", $"subdoc.span.e1" - $"subdoc.span.b1" as "length"))) + .withColumn("src", when($"subdoc.srcspan".isNull, null).otherwise(struct('src as "id", + $"subdoc.srcspan.begin" as "start", + $"subdoc.srcspan.end" - $"subdoc.srcspan.begin" as "length"))) + .withColumn("doc", when($"subdoc.docspan".isNull, null).otherwise(struct('id, + $"subdoc.docspan.begin" as "start", + $"subdoc.docspan.end" - $"subdoc.docspan.begin" as "length"))) .withColumn("id", $"subdoc.id") .withColumn("text", $"subdoc.text") .withColumn("pages", $"subdoc.pages") From 8c2dca3615418406aac0cf72bc14e6d0286a4842 Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 21 May 2019 11:47:06 -0400 Subject: [PATCH 07/39] Check for out-of-order boilerplate splits. 
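A note on the guard this one-line change tightens (not part of the patch): srcSpans merges consecutive aligned lines into one span pair, and the new e2 < cure2 disjunct refuses to merge a line whose source offsets run backwards, since the merged source span would no longer be contiguous. Below is a standalone restatement of the condition with invented offsets; Line holds a target line's start and length (b1/len1) and its matching source start and length (b2/len2).

    case class Line(b1: Int, len1: Int, b2: Int, len2: Int)

    // Can `next` extend a span whose running target end is cure1 and running
    // source end is cure2? This is the (negated) test from srcSpans after
    // this patch; before it, the third disjunct was missing.
    def extendsSpan(next: Line, cure1: Int, cure2: Int): Boolean =
      !(next.b1 > cure1 || next.b2 > cure2 || (next.b2 + next.len2) < cure2)

    // extendsSpan(Line(40, 38, 230, 28), cure1 = 40, cure2 = 230) == true
    // extendsSpan(Line(40, 38, 100, 20), cure1 = 40, cure2 = 230) == false
    //   (source offsets jump back to 100..120, so a new span is started)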
--- src/main/scala/PassimApp.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 689413b..fb1ebf7 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -518,7 +518,7 @@ transform($pageCol, case Row(b1: Int, len1: Int, b2: Int, len2: Int) => val e1 = b1 + len1 val e2 = b2 + len2 - if ( b1 > cure1 || b2 > cure2 ) { + if ( b1 > cure1 || b2 > cure2 || e2 < cure2 ) { if ( curb1 > -1 ) { res += SpanPair(curb1, cure1, curb2, cure2) } From 8ff92161608c723e565852af16b9340ef5d53166 Mon Sep 17 00:00:00 2001 From: David Smith Date: Wed, 22 May 2019 05:53:36 -0400 Subject: [PATCH 08/39] Make minLines a parameter. --- src/main/scala/PassimApp.scala | 126 +++++++++++++++------------------ 1 file changed, 59 insertions(+), 67 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index fb1ebf7..a0032fa 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -23,6 +23,7 @@ case class Config(version: String = BuildInfo.version, boilerplate: Boolean = false, n: Int = 5, minDF: Int = 2, maxDF: Int = 100, minRep: Int = 5, minAlg: Int = 20, gap: Int = 100, relOver: Double = 0.8, mergeDiverge: Double = 0.3, maxRep: Int = 10, + minLines: Int = 5, context: Int = 0, wordLength: Double = 2, pairwise: Boolean = false, @@ -387,72 +388,6 @@ transform($pageCol, } } - // TODO: Make minLines a parameter - val alignedPassages = udf { (s1: String, s2: String) => - var start = 0 - var b1 = 0 - var b2 = 0 - val buf = ArrayBuffer[(Int, Double, Int, Int, String, String)]() - for ( end <- 1 until s2.size ) { - if ( s2(end) == '\n' ) { - val alg1 = s1.substring(start, end+1) - val alg2 = s2.substring(start, end+1) - val t1 = alg1.replaceAll("-", "").replaceAll("\u2010", "-") - val t2 = alg2.replaceAll("-", "").replaceAll("\u2010", "-") - - val matches = alg1.zip(alg2).count(x => x._1 == x._2) - buf += ((t2.size - t1.size, matches * 1.0 / t2.size, b1, b2, t1, t2)) - start = end + 1 - b1 += t1.size - b2 += t2.size - } - } - val lines = buf.toArray - - val minLines = 5 - - val pass = ArrayBuffer[(Span, Span, Array[(String, String)])]() - val pairs = ArrayBuffer[(String, String)]() - var i = 0 - start = 0 - while ( i < lines.size ) { - if ( lines(i)._1.abs > 20 || lines(i)._2 < 0.1 ) { - if ( start < i - && (i + 2) < lines.size - && lines(i+1)._1.abs <= 20 && lines(i+1)._2 >= 0.1 - && (lines(i+1)._3 - lines(i)._3) <= 20 - && (lines(i+1)._4 - lines(i)._4) <= 20 ) { - // continue passage - pairs += ((lines(i)._5, lines(i)._6)) - } else { - if ( (i - start) >= minLines ) { - pass += ((Span(lines(start)._3, lines(i)._3), - Span(lines(start)._4, lines(i)._4), - pairs.toArray)) - } - start = i + 1 - pairs.clear - } - } else { - pairs += ((lines(i)._5, lines(i)._6)) - } - i += 1 - } - if ( (i - start) >= minLines ) { - pass += ((Span(lines(start)._3, lines(lines.size - 1)._3), - Span(lines(start)._4, lines(lines.size - 1)._4), - pairs.toArray)) - } - pass.toSeq - } - - val mergeAligned = udf { (begins: Seq[Int], ends: Seq[Int]) => - val spans = PassFun.mergeSpansLR(0, begins.zip(ends).map(x => Span(x._1, x._2)) - .zip(Range(0, begins.size).map(_.toLong))) - .map(_._1) // TODO? merge nearly adjacent? 
- (spans.map(_.begin), spans.map(_.end)) // unzip - } - def subpage(p: Seq[Page], r: Array[Region]) = { if ( p.size > 0 ) Seq(p(0).copy(regions = r)) @@ -743,6 +678,61 @@ transform($pageCol, } def boilerPassages(config: Config, corpus: DataFrame): DataFrame = { import align.sparkSession.implicits._ + val alignedPassages = udf { (s1: String, s2: String) => + var start = 0 + var b1 = 0 + var b2 = 0 + val buf = ArrayBuffer[(Int, Double, Int, Int, String, String)]() + for ( end <- 1 until s2.size ) { + if ( s2(end) == '\n' ) { + val alg1 = s1.substring(start, end+1) + val alg2 = s2.substring(start, end+1) + val t1 = alg1.replaceAll("-", "").replaceAll("\u2010", "-") + val t2 = alg2.replaceAll("-", "").replaceAll("\u2010", "-") + + val matches = alg1.zip(alg2).count(x => x._1 == x._2) + buf += ((t2.size - t1.size, matches * 1.0 / t2.size, b1, b2, t1, t2)) + start = end + 1 + b1 += t1.size + b2 += t2.size + } + } + val lines = buf.toArray + + val pass = ArrayBuffer[(Span, Span, Array[(String, String)])]() + val pairs = ArrayBuffer[(String, String)]() + var i = 0 + start = 0 + while ( i < lines.size ) { + if ( lines(i)._1.abs > 20 || lines(i)._2 < 0.1 ) { + if ( start < i + && (i + 2) < lines.size + && lines(i+1)._1.abs <= 20 && lines(i+1)._2 >= 0.1 + && (lines(i+1)._3 - lines(i)._3) <= 20 + && (lines(i+1)._4 - lines(i)._4) <= 20 ) { + // continue passage + pairs += ((lines(i)._5, lines(i)._6)) + } else { + if ( (i - start) >= config.minLines ) { + pass += ((Span(lines(start)._3, lines(i)._3), + Span(lines(start)._4, lines(i)._4), + pairs.toArray)) + } + start = i + 1 + pairs.clear + } + } else { + pairs += ((lines(i)._5, lines(i)._6)) + } + i += 1 + } + if ( (i - start) >= config.minLines ) { + pass += ((Span(lines(start)._3, lines(lines.size - 1)._3), + Span(lines(start)._4, lines(lines.size - 1)._4), + pairs.toArray)) + } + pass.toSeq + } val lineRecord = udf { (b1: Int, b2: Int, pairs: Seq[Row]) => var off1 = b1 @@ -881,7 +871,7 @@ transform($pageCol, } val collectTexts = udf { (texts: Seq[String], seqs: Seq[Int]) => - val textDict = seqs zip texts toMap + val textDict = seqs.zip(texts).toMap var allTexts = "" for (seq <- seqs.sorted) { @@ -1053,6 +1043,8 @@ transform($pageCol, c.copy(minRep = x) } text("Minimum number of n-gram matches between documents; default=5") opt[Int]('a', "min-align") action { (x, c) => c.copy(minAlg = x) } text("Minimum length of alignment; default=20") + opt[Int]('L', "min-lines") action { (x, c) => + c.copy(minLines = x) } text("Minimum number of lines in boilerplate and docwise alignments; default=5") opt[Int]('g', "gap") action { (x, c) => c.copy(gap = x) } text("Minimum size of the gap that separates passages; default=100") opt[Int]('c', "context") action { (x, c) => From 60e0b54c00cf8ac90c0a4ca4de546c06126367ec Mon Sep 17 00:00:00 2001 From: David Smith Date: Wed, 22 May 2019 07:25:49 -0400 Subject: [PATCH 09/39] Enforce minLines in boilerSplit. 
--- src/main/scala/PassimApp.scala | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index a0032fa..dc11a52 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -439,7 +439,7 @@ transform($pageCol, } docs.toArray } - def boilerSplit(passages: DataFrame, raw: DataFrame): DataFrame = { + def boilerSplit(config: Config, passages: DataFrame, raw: DataFrame): DataFrame = { import passages.sparkSession.implicits._ val pageField = if ( raw.columns.contains("pages") ) "pages" else "null" val srcSpans = udf { (lines: Seq[Row]) => @@ -448,26 +448,29 @@ transform($pageCol, var cure1 = -1 var curb2 = -1 var cure2 = -1 + var lineCount = 0 for ( cur <- lines ) { cur match { case Row(b1: Int, len1: Int, b2: Int, len2: Int) => val e1 = b1 + len1 val e2 = b2 + len2 if ( b1 > cure1 || b2 > cure2 || e2 < cure2 ) { - if ( curb1 > -1 ) { + if ( curb1 > -1 && lineCount > config.minLines ) { res += SpanPair(curb1, cure1, curb2, cure2) } curb1 = b1 cure1 = e1 curb2 = b2 cure2 = e2 + lineCount = 1 } else { cure1 = e1 cure2 = e2 + lineCount += 1 } } } - if ( curb1 > -1 ) { + if ( curb1 > -1 && lineCount > config.minLines ) { res += SpanPair(curb1, cure1, curb2, cure2) } res.toSeq @@ -1280,7 +1283,7 @@ transform($pageCol, .drop("tlines", "mvars", "variants") .write.format(config.outputFormat).save(outFname) } else { - boilerSplit(pass, raw).write.format(config.outputFormat).save(outFname) + boilerSplit(config, pass, raw).write.format(config.outputFormat).save(outFname) } sys.exit(0) } From 795e600acb8941401eb0a8c5ed2ad52edc43add9 Mon Sep 17 00:00:00 2001 From: David Smith Date: Mon, 1 Jul 2019 14:58:20 +0200 Subject: [PATCH 10/39] Include alignments in witness information. 
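Not part of the patch: a short sketch of the alignment-string convention that the new alg1/alg2 fields carry through to the witness records. In an aligned string, '-' marks a gap inserted by the aligner and a literal hyphen in the underlying text is escaped as '\u2010', so stripping gaps and unescaping recovers the plain line, as alignedPassages already does with replaceAll. The sample strings below are invented.

    def ungap(alg: String): String =
      alg.replaceAll("-", "").replaceAll("\u2010", "-")

    val alg1 = "fi-rst\u2010rate go-ods"   // witness side: two gap characters
    val alg2 = "fiIrst\u2010rate gooods"   // target side: same length, no gaps
    // ungap(alg1) == "first-rate goods"
    // ungap(alg2) == "fiIrst-rate gooods"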
--- src/main/scala/PassimApp.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index dc11a52..fc1260e 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -87,7 +87,7 @@ case class LinkedSpan(span: Span, links: ArrayBuffer[Long]) case class ExtentPair(seq1: Int, seq2: Int, begin1: Int, begin2: Int, end1: Int, end2: Int, tok1: Int, tok2: Int) -case class WitInfo(start: Int, length: Int, begin: Int, text: String) +case class WitInfo(start: Int, length: Int, begin: Int, text: String, alg1: String, alg2: String) case class SpanPair(b1: Int, e1: Int, b2: Int, e2: Int) @@ -685,7 +685,7 @@ transform($pageCol, var start = 0 var b1 = 0 var b2 = 0 - val buf = ArrayBuffer[(Int, Double, Int, Int, String, String)]() + val buf = ArrayBuffer[(Int, Double, Int, Int, String, String, String, String)]() for ( end <- 1 until s2.size ) { if ( s2(end) == '\n' ) { val alg1 = s1.substring(start, end+1) @@ -694,7 +694,7 @@ transform($pageCol, val t2 = alg2.replaceAll("-", "").replaceAll("\u2010", "-") val matches = alg1.zip(alg2).count(x => x._1 == x._2) - buf += ((t2.size - t1.size, matches * 1.0 / t2.size, b1, b2, t1, t2)) + buf += ((t2.size - t1.size, matches * 1.0 / t2.size, b1, b2, t1, t2, alg1, alg2)) start = end + 1 b1 += t1.size b2 += t2.size @@ -702,8 +702,8 @@ transform($pageCol, } val lines = buf.toArray - val pass = ArrayBuffer[(Span, Span, Array[(String, String)])]() - val pairs = ArrayBuffer[(String, String)]() + val pass = ArrayBuffer[(Span, Span, Array[(String, String, String, String)])]() + val pairs = ArrayBuffer[(String, String, String, String)]() var i = 0 start = 0 while ( i < lines.size ) { @@ -714,7 +714,7 @@ transform($pageCol, && (lines(i+1)._3 - lines(i)._3) <= 20 && (lines(i+1)._4 - lines(i)._4) <= 20 ) { // continue passage - pairs += ((lines(i)._5, lines(i)._6)) + pairs += ((lines(i)._5, lines(i)._6, lines(i)._7, lines(i)._8)) } else { if ( (i - start) >= config.minLines ) { pass += ((Span(lines(start)._3, lines(i)._3), @@ -725,7 +725,7 @@ transform($pageCol, pairs.clear } } else { - pairs += ((lines(i)._5, lines(i)._6)) + pairs += ((lines(i)._5, lines(i)._6, lines(i)._7, lines(i)._8)) } i += 1 } @@ -745,7 +745,7 @@ transform($pageCol, val s2 = p.getString(1) off2 += s2.length off1 += s1.length - WitInfo(off2 - s2.length, s2.length, off1 - s1.length, s1) + WitInfo(off2 - s2.length, s2.length, off1 - s1.length, s1, p.getString(2), p.getString(3)) } } val alignStrings = makeStringAligner(config, openGap = 1) @@ -776,7 +776,7 @@ transform($pageCol, .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", explode(lineRecord('b1, 'b2, 'pairs)) as "wit") .select('id, $"wit.start", $"wit.length", - struct('meta, 'src as "id", $"wit.begin", $"wit.text") as "wit") + struct('meta, 'src as "id", $"wit.begin", $"wit.text", $"wit.alg1", $"wit.alg2") as "wit") } def aggregateAlignments(config: Config, corpus: DataFrame, extents: DataFrame): DataFrame = { import align.sparkSession.implicits._ From cbd055519f02897cffbd758f89fb46e2ad2f78cd Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 2 Jul 2019 11:45:56 +0200 Subject: [PATCH 11/39] Include page regions in docwise output. 
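Not part of the patch: the filter() expressions added below keep only the page regions that overlap the passage or witness line being emitted. A minimal standalone restatement of that half-open interval test; Region is trimmed to the two fields the predicate uses, and the offsets in the examples are made up.

    case class Region(start: Int, length: Int)

    // Keep a region iff [start, start + length) intersects [begin, end),
    // i.e. the "r.start < end AND (r.start + r.length) > begin" predicate
    // used in the SQL filter() expressions.
    def overlaps(r: Region, begin: Int, end: Int): Boolean =
      r.start < end && (r.start + r.length) > begin

    // overlaps(Region(100, 50), 120, 200) == true   (partial overlap)
    // overlaps(Region(100, 50), 150, 200) == false  (intervals only touch)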
--- src/main/scala/PassimApp.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index fc1260e..11988f3 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -755,12 +755,13 @@ transform($pageCol, align.drop("gid") .join(corpus.select('uid, col(config.id) as "id", col(config.text) as "text", struct(metaFields.toList.map(expr):_*) as "meta", - 'termCharBegin, 'termCharEnd), "uid") + 'termCharBegin, 'termCharEnd, 'pages), "uid") .withColumn("begin", lineStart('text, 'termCharBegin('begin))) .withColumn("end", lineStop('text, when('end < size('termCharBegin), 'termCharBegin('end)).otherwise(length('text)))) - .select('mid, struct('first, 'id, 'meta, 'begin, 'end, + .withColumn("regions", expr(s"filter(pages[0].regions, r -> r.start < end AND (r.start + r.length) > begin)")) + .select('mid, struct('first, 'id, 'meta, 'regions, 'begin, 'end, getPassage('text, 'begin, 'end) as "text") as "info") .groupBy("mid") .agg(sort_array(collect_list("info"), false) as "info") // "first" == true sorts first @@ -768,15 +769,18 @@ transform($pageCol, .select('info, explode(alignedPassages($"alg.s1", $"alg.s2")) as "pass") .selectExpr("info[0].id as id1", "info[1].id as id2", "info[0].meta as meta1","info[1].meta as meta2", + "info[0].regions as regions1", "pass._3 as pairs", "info[0].begin + pass._1.begin as b1", "info[0].begin + pass._1.end as e1", "info[1].begin + pass._2.begin as b2", "info[1].begin + pass._2.end as e2") - .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", + .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", 'regions1 as "regions", explode(lineRecord('b1, 'b2, 'pairs)) as "wit") .select('id, $"wit.start", $"wit.length", - struct('meta, 'src as "id", $"wit.begin", $"wit.text", $"wit.alg1", $"wit.alg2") as "wit") + struct('meta, + expr("filter(regions, r -> r.start < (wit.begin + length(wit.text)) AND (r.start + r.length) > wit.begin)") as "regions", + 'src as "id", $"wit.begin", $"wit.text", $"wit.alg1", $"wit.alg2") as "wit") } def aggregateAlignments(config: Config, corpus: DataFrame, extents: DataFrame): DataFrame = { import align.sparkSession.implicits._ From d61c981c0d20f23f9d557433ecc11591695c1139 Mon Sep 17 00:00:00 2001 From: David Smith Date: Fri, 26 Jul 2019 17:56:37 -0400 Subject: [PATCH 12/39] Add linewise option for OCR training production. 
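Not part of the patch: besides the --linewise flag, the change below adds a pageBox helper that collapses each page's token regions into one region whose coordinates bound them all, using a SQL aggregate() seeded with regions[0]. Here is a plain-Scala sketch of that reduction; Box stands in for the full Coords/Region structs and the coordinates are invented.

    case class Box(x: Int, y: Int, w: Int, h: Int)

    // Fold many token boxes into the single box covering them all: take the
    // minimum top-left corner and the maximum bottom-right corner. Assumes a
    // non-empty list, as pageBox does by seeding the aggregate with regions[0].
    def boundingBox(rs: Seq[Box]): Box = rs.reduceLeft { (a, r) =>
      val x = math.min(a.x, r.x)
      val y = math.min(a.y, r.y)
      Box(x, y,
          math.max(a.x + a.w, r.x + r.w) - x,
          math.max(a.y + a.h, r.y + r.h) - y)
    }

    // boundingBox(Seq(Box(10, 10, 5, 5), Box(40, 20, 10, 10))) == Box(10, 10, 40, 20)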
--- src/main/scala/PassimApp.scala | 74 ++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 7 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 11988f3..7096246 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -28,7 +28,8 @@ case class Config(version: String = BuildInfo.version, wordLength: Double = 2, pairwise: Boolean = false, aggregate: Boolean = false, - docwise: Boolean = false, names: Boolean = false, postings: Boolean = false, + docwise: Boolean = false, linewise: Boolean = false, + names: Boolean = false, postings: Boolean = false, id: String = "id", group: String = "series", text: String = "text", fields: String = "", filterpairs: String = "gid < gid2", inputFormat: String = "json", outputFormat: String = "json", @@ -332,6 +333,27 @@ object PassimApp { .drop("_tokens") } } + def pageBox(pageCol: String): DataFrame = { + val pageFields = df.select(expr(s"inline($pageCol)")).columns + .filter { _ != "regions" }.map { f => s"p.$f as $f" }.mkString(", ") + df.withColumn(pageCol, + expr(s""" +transform($pageCol, + p -> struct($pageFields, + array(aggregate(p.regions, + struct(p.regions[0].start as start, + p.regions[0].length as length, + struct(p.regions[0].coords.x as x, + p.regions[0].coords.y as y, + p.regions[0].coords.w as w, + p.regions[0].coords.h as h) as coords), + (acc, r) -> struct(least(acc.start, r.start) as start, + greatest(acc.start + acc.length, r.start + r.length) - least(acc.start, r.start) as length, + struct(least(acc.coords.x, r.coords.x) as x, + least(acc.coords.y, r.coords.y) as y, + greatest(acc.coords.x + acc.coords.w, r.coords.x + r.coords.w) - least(acc.coords.x, r.coords.x) as w, + greatest(acc.coords.y + acc.coords.h, r.coords.y + r.coords.h) - least(acc.coords.y, r.coords.y) as h) as coords))) as regions))""")) + } def selectRegions(pageCol: String): DataFrame = { if ( df.columns.contains(pageCol) ) { // Do these transformations in SQL to avoid Java's persnicketiness about int/long casting @@ -749,6 +771,8 @@ transform($pageCol, } } val alignStrings = makeStringAligner(config, openGap = 1) + val pageFields = corpus.select(expr(s"inline(pages)")).columns + .filter { _ != "regions" }.map { f => s"p.$f as $f" }.mkString(", ") val metaFields = ListBuffer[String]() if ( corpus.columns.contains("date") ) metaFields += "date" metaFields += (if ( corpus.columns.contains("gold") ) "gold" else "0 as gold") @@ -760,8 +784,9 @@ transform($pageCol, .withColumn("end", lineStop('text, when('end < size('termCharBegin), 'termCharBegin('end)).otherwise(length('text)))) - .withColumn("regions", expr(s"filter(pages[0].regions, r -> r.start < end AND (r.start + r.length) > begin)")) - .select('mid, struct('first, 'id, 'meta, 'regions, 'begin, 'end, + .withColumn("pages", + expr(s"filter(transform(pages, p -> struct($pageFields, filter(p.regions, r -> r.start < end AND (r.start + r.length) > begin) as regions)), p -> size(p.regions) > 0)")) + .select('mid, struct('first, 'id, 'meta, 'pages, 'begin, 'end, getPassage('text, 'begin, 'end) as "text") as "info") .groupBy("mid") .agg(sort_array(collect_list("info"), false) as "info") // "first" == true sorts first @@ -769,17 +794,17 @@ transform($pageCol, .select('info, explode(alignedPassages($"alg.s1", $"alg.s2")) as "pass") .selectExpr("info[0].id as id1", "info[1].id as id2", "info[0].meta as meta1","info[1].meta as meta2", - "info[0].regions as regions1", + "info[0].pages as pages1", "pass._3 as pairs", "info[0].begin + 
pass._1.begin as b1", "info[0].begin + pass._1.end as e1", "info[1].begin + pass._2.begin as b2", "info[1].begin + pass._2.end as e2") - .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", 'regions1 as "regions", + .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", 'pages1 as "pages", explode(lineRecord('b1, 'b2, 'pairs)) as "wit") .select('id, $"wit.start", $"wit.length", struct('meta, - expr("filter(regions, r -> r.start < (wit.begin + length(wit.text)) AND (r.start + r.length) > wit.begin)") as "regions", + expr(s"filter(transform(pages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wit.begin + length(wit.text)) AND (r.start + r.length) > wit.begin) as regions)), p -> size(p.regions) > 0)") as "pages", 'src as "id", $"wit.begin", $"wit.text", $"wit.alg1", $"wit.alg2") as "wit") } def aggregateAlignments(config: Config, corpus: DataFrame, extents: DataFrame): DataFrame = { @@ -1066,6 +1091,8 @@ transform($pageCol, c.copy(pairwise = true) } text("Output pairwise alignments") opt[Unit]('d', "docwise") action { (_, c) => c.copy(docwise = true) } text("Output docwise alignments") + opt[Unit]("linewise") action { (_, c) => + c.copy(linewise = true) } text("Output linewise alignments") opt[Unit]('N', "names") action { (_, c) => c.copy(names = true) } text("Output names and exit") opt[Unit]('P', "postings") action { (_, c) => @@ -1259,11 +1286,13 @@ transform($pageCol, } } - if ( config.boilerplate || config.docwise ) { + if ( config.boilerplate || config.docwise || config.linewise) { if ( !hdfsExists(spark, passFname) ) { extents.boilerPassages(config, corpus).write.parquet(passFname) } val pass = spark.read.parquet(passFname) + val pageFields = pass.select(expr(s"inline(wit.pages)")).columns + .filter { _ != "regions" }.map { f => s"p.$f as $f" }.mkString(", ") if ( config.docwise ) { val textLines = udf { (text: String) => val res = ListBuffer[LineInfo]() @@ -1286,6 +1315,37 @@ transform($pageCol, expr("transform(tlines, r -> struct(r.text as text, mvars[r.start] as wits))")) .drop("tlines", "mvars", "variants") .write.format(config.outputFormat).save(outFname) + } else if ( config.linewise ) { + val gap = 4 + val coreAlignment = udf { (alg1: String, alg2: String) => + val re = s"\\-{$gap,}\\s*".r + (List((0,0)) ++ + (re.findAllMatchIn(alg1).map { m => (m.start, m.toString.length) }.toList ++ + re.findAllMatchIn(alg2).map { m => (m.start, m.toString.length) }.toList).sorted ++ + List((alg1.length, 0))) + .sliding(2) + .map { p => + val begin = p(0)._1 + p(0)._2 + val end = p(1)._1 + (alg1.substring(0, begin).replaceAll("-", "").length, + alg1.substring(begin, end).replaceAll("-", "").replaceAll("\u2010", "-"), + alg2.substring(0, begin).replaceAll("-", "").length, + alg2.substring(begin, end).replaceAll("-", "").replaceAll("\u2010", "-")) + } + .filter { p => p._2.length > gap && p._4.length > gap } + .toSeq + + } + pass + .withColumn("core", coreAlignment($"wit.alg1", $"wit.alg2")) + .select('id, 'start + 'core(0)("_3") as "begin", + 'core(0)("_4") as "text", + $"wit.id" as "wid", $"wit.begin" + 'core(0)("_1") as "wbegin", + 'core(0)("_2") as "wtext", + $"wit.pages" as "wpages") + .withColumn("wpages", expr(s"filter(transform(wpages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wbegin + length(wtext)) AND (r.start + r.length) > wbegin) as regions)), p -> size(p.regions) > 0)")) + .pageBox("wpages") + .write.format(config.outputFormat).save(outFname) } else { boilerSplit(config, pass, raw).write.format(config.outputFormat).save(outFname) } From 
c25081bc05da49873e600a262ccdafd3d23a3885 Mon Sep 17 00:00:00 2001 From: David Smith Date: Wed, 7 Aug 2019 08:48:30 -0400 Subject: [PATCH 13/39] Output both token and line bounding boxes; don't strip gaps. --- src/main/scala/PassimApp.scala | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 7096246..c86840a 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -1337,13 +1337,18 @@ transform($pageCol, } pass - .withColumn("core", coreAlignment($"wit.alg1", $"wit.alg2")) - .select('id, 'start + 'core(0)("_3") as "begin", - 'core(0)("_4") as "text", - $"wit.id" as "wid", $"wit.begin" + 'core(0)("_1") as "wbegin", - 'core(0)("_2") as "wtext", - $"wit.pages" as "wpages") - .withColumn("wpages", expr(s"filter(transform(wpages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wbegin + length(wtext)) AND (r.start + r.length) > wbegin) as regions)), p -> size(p.regions) > 0)")) + // .withColumn("core", coreAlignment($"wit.alg1", $"wit.alg2")) + // .select('id, 'start + 'core(0)("_3") as "begin", + // 'core(0)("_4") as "text", + // $"wit.id" as "wid", $"wit.begin" + 'core(0)("_1") as "wbegin", + // 'core(0)("_2") as "wtext", + // $"wit.pages" as "wpages") + // .withColumn("wpages", expr(s"filter(transform(wpages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wbegin + length(wtext)) AND (r.start + r.length) > wbegin) as regions)), p -> size(p.regions) > 0)")) + .select('id, 'start as "begin", + translate($"wit.alg2", "\u2010-", "-") as "text", + $"wit.id" as "wid", $"wit.begin" as "wbegin", $"wit.text" as "wtext", + $"wit.alg2" as "talg", $"wit.alg1" as "walg", + $"wit.pages" as "wpages", $"wit.pages" as "wpagesTokens") .pageBox("wpages") .write.format(config.outputFormat).save(outFname) } else { From 89b31b68e3d47848abd358b7586f3f15fe217347 Mon Sep 17 00:00:00 2001 From: David Smith Date: Fri, 16 Aug 2019 12:36:27 -0400 Subject: [PATCH 14/39] Output page information for target texts. 
--- src/main/scala/PassimApp.scala | 36 ++++++---------------------------- 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index c86840a..6052422 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -794,17 +794,18 @@ transform($pageCol, .select('info, explode(alignedPassages($"alg.s1", $"alg.s2")) as "pass") .selectExpr("info[0].id as id1", "info[1].id as id2", "info[0].meta as meta1","info[1].meta as meta2", - "info[0].pages as pages1", + "info[0].pages as pages1", "info[1].pages as pages2", "pass._3 as pairs", "info[0].begin + pass._1.begin as b1", "info[0].begin + pass._1.end as e1", "info[1].begin + pass._2.begin as b2", "info[1].begin + pass._2.end as e2") - .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", 'pages1 as "pages", + .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", 'pages1, 'pages2, explode(lineRecord('b1, 'b2, 'pairs)) as "wit") .select('id, $"wit.start", $"wit.length", + expr(s"filter(transform(pages2, p -> struct($pageFields, filter(p.regions, r -> r.start < (wit.start + wit.length) AND (r.start + r.length) > wit.start) as regions)), p -> size(p.regions) > 0)") as "pages", struct('meta, - expr(s"filter(transform(pages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wit.begin + length(wit.text)) AND (r.start + r.length) > wit.begin) as regions)), p -> size(p.regions) > 0)") as "pages", + expr(s"filter(transform(pages1, p -> struct($pageFields, filter(p.regions, r -> r.start < (wit.begin + length(wit.text)) AND (r.start + r.length) > wit.begin) as regions)), p -> size(p.regions) > 0)") as "pages", 'src as "id", $"wit.begin", $"wit.text", $"wit.alg1", $"wit.alg2") as "wit") } def aggregateAlignments(config: Config, corpus: DataFrame, extents: DataFrame): DataFrame = { @@ -1316,39 +1317,14 @@ transform($pageCol, .drop("tlines", "mvars", "variants") .write.format(config.outputFormat).save(outFname) } else if ( config.linewise ) { - val gap = 4 - val coreAlignment = udf { (alg1: String, alg2: String) => - val re = s"\\-{$gap,}\\s*".r - (List((0,0)) ++ - (re.findAllMatchIn(alg1).map { m => (m.start, m.toString.length) }.toList ++ - re.findAllMatchIn(alg2).map { m => (m.start, m.toString.length) }.toList).sorted ++ - List((alg1.length, 0))) - .sliding(2) - .map { p => - val begin = p(0)._1 + p(0)._2 - val end = p(1)._1 - (alg1.substring(0, begin).replaceAll("-", "").length, - alg1.substring(begin, end).replaceAll("-", "").replaceAll("\u2010", "-"), - alg2.substring(0, begin).replaceAll("-", "").length, - alg2.substring(begin, end).replaceAll("-", "").replaceAll("\u2010", "-")) - } - .filter { p => p._2.length > gap && p._4.length > gap } - .toSeq - - } pass - // .withColumn("core", coreAlignment($"wit.alg1", $"wit.alg2")) - // .select('id, 'start + 'core(0)("_3") as "begin", - // 'core(0)("_4") as "text", - // $"wit.id" as "wid", $"wit.begin" + 'core(0)("_1") as "wbegin", - // 'core(0)("_2") as "wtext", - // $"wit.pages" as "wpages") - // .withColumn("wpages", expr(s"filter(transform(wpages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wbegin + length(wtext)) AND (r.start + r.length) > wbegin) as regions)), p -> size(p.regions) > 0)")) .select('id, 'start as "begin", translate($"wit.alg2", "\u2010-", "-") as "text", $"wit.id" as "wid", $"wit.begin" as "wbegin", $"wit.text" as "wtext", $"wit.alg2" as "talg", $"wit.alg1" as "walg", + 'pages as "tpages", 'pages as "tpagesTokens", $"wit.pages" as "wpages", $"wit.pages" as "wpagesTokens") 
+ .pageBox("tpages") .pageBox("wpages") .write.format(config.outputFormat).save(outFname) } else { From 5e17bc48d9646babe7472c432756f4a779ca60f1 Mon Sep 17 00:00:00 2001 From: David Smith Date: Fri, 16 Aug 2019 13:20:41 -0400 Subject: [PATCH 15/39] Default to plain-text, non-XML tokenization. --- README.md | 7 +- src/main/java/PlainTokenizer.java | 742 ++++++++++++++++++++++++++++++ src/main/scala/PassimApp.scala | 2 +- 3 files changed, 745 insertions(+), 6 deletions(-) create mode 100644 src/main/java/PlainTokenizer.java diff --git a/README.md b/README.md index cd67042..8b05efd 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ of files containing JSON records. The record for a single document with the required `id` and `text` fields, as well as a `series` field, would look like: ``` -{"id": "d1", "series": "abc", "text": "This is text that's interpreted as XML; the tags are ignored by default."} +{"id": "d1", "series": "abc", "text": "This is text."} ``` Note that this is must be a single line in the file. This JSON record @@ -75,7 +75,7 @@ included in the record for each document will be passed through into the output. In particular, a `date` field, if present, will be used to sort passages within each cluster. -Natural language text is redundant, and adding XML markup and JSON +Natural language text is redundant, and adding markup and JSON field names increases the redundancy. Spark and passim support several compression schemes. For relatively small files, gzip is adequate; however, when the input files are large enough that the do @@ -155,9 +155,6 @@ for further configuration options. ## Marking Locations inside Documents -As mentioned above, the `text` field is interpreted as XML. The -parser expands character entities and ignores tags. - Documents may document their extent on physical pages with the `pages` field. This field is an array of `Page` regions with the following schema (here written in Scala): ``` case class Coords(x: Int, y: Int, w: Int, h: Int, b: Int) diff --git a/src/main/java/PlainTokenizer.java b/src/main/java/PlainTokenizer.java new file mode 100644 index 0000000..465c306 --- /dev/null +++ b/src/main/java/PlainTokenizer.java @@ -0,0 +1,742 @@ +// BSD License (http://lemurproject.org/galago-license) +package passim; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Pattern; + +import java.io.UnsupportedEncodingException; + +/** + *
This class processes document text into tokens that can be indexed. + * + * The text is assumed to contain some HTML/XML tags. The tokenizer tries + * to extract as much data as possible from each document, even if it is not + * well formed (e.g. there are start tags with no ending tags). The resulting + * document object contains an array of terms and an array of tags.
+ * + * @author trevor + */ +public class PlainTokenizer { + + protected static final boolean[] splits; + protected static HashSet ignoredTags; + protected String ignoreUntil; + protected List whitelist; + + static { + splits = buildSplits(); + ignoredTags = buildIgnoredTags(); + } + protected String text; + protected int position; + protected int lastSplit; + ArrayList tokens; + HashMap> openTags; + ArrayList closedTags; + ArrayList tokenPositions; + private boolean tokenizeTagContent = true; + + public static class Pair { + + public Pair(int start, int end) { + this.start = start; + this.end = end; + } + public int start; + public int end; + + @Override + public String toString() { + return String.format("%d,%d", start, end); + } + } + + protected enum StringStatus { + + Clean, + NeedsSimpleFix, + NeedsComplexFix, + NeedsAcronymProcessing + } + + public PlainTokenizer() { + text = null; + position = 0; + lastSplit = -1; + + tokens = new ArrayList(); + openTags = new HashMap>(); + closedTags = new ArrayList(); + tokenPositions = new ArrayList(); + whitelist = new ArrayList(); + } + + protected static boolean[] buildSplits() { + boolean[] localSplits = new boolean[257]; + + for (int i = 0; i < localSplits.length; i++) { + localSplits[i] = false; + } + char[] splitChars = {' ', '\t', '\n', '\r', // spaces + ';', '\"', '&', '/', ':', '!', '#', + '?', '$', '%', '(', ')', '@', '^', + '*', '+', '-', ',', '=', '>', '<', '[', + ']', '{', '}', '|', '`', '~', '_' + }; + + for (char c : splitChars) { + localSplits[(byte) c] = true; + } + + for (byte c = 0; c <= 32; c++) { + localSplits[c] = true; + } + + return localSplits; + } + + public void addField(String f) { + whitelist.add(Pattern.compile(f)); + } + + protected static HashSet buildIgnoredTags() { + HashSet tags = new HashSet<>(); + tags.add("style"); + tags.add("script"); + return tags; + } + + static class ClosedTag { + + public ClosedTag(BeginTag begin, int start, int end) { + this.name = begin.name; + this.attributes = begin.attributes; + + this.byteStart = begin.bytePosition; + this.termStart = begin.termPosition; + + this.byteEnd = start; + this.termEnd = end; + } + String name; + Map attributes; + int byteStart; + int termStart; + int byteEnd; + int termEnd; + } + + static class BeginTag { + + public BeginTag(String name, Map attributes, int bytePosition, int end) { + this.name = name; + this.attributes = attributes; + + this.bytePosition = bytePosition; + this.termPosition = end; + } + String name; + Map attributes; + int bytePosition; + int termPosition; + } + + /** + * Resets parsing in preparation for the next document. + */ + public void reset() { + ignoreUntil = null; + text = null; + position = 0; + lastSplit = -1; + + tokens.clear(); + openTags.clear(); + closedTags.clear(); + + if (tokenPositions != null) { + tokenPositions.clear(); + } + } + + protected void skipComment() { + if (text.substring(position).startsWith("", position + 1); + + if (position >= 0) { + position += 2; + } + } else { + position = text.indexOf(">", position + 1); + } + + if (position < 0) { + position = text.length(); + } + } + + protected void skipProcessingInstruction() { + position = text.indexOf("?>", position + 1); + + if (position < 0) { + position = text.length(); + } + } + + protected void parseEndTag() { + // 1. 
read name (skipping the ') { + break; + } + } + + String tagName = text.substring(position + 2, i).toLowerCase(); + + if (ignoreUntil != null && ignoreUntil.equals(tagName)) { + ignoreUntil = null; + } + if (ignoreUntil == null) { + closeTag(tagName); // advance to end '>' + } + while (i < text.length() && text.charAt(i) != '>') { + i++; + } + position = i; + } + + protected void closeTag(final String tagName) { + if (!openTags.containsKey(tagName)) { + return; + } + ArrayList tagList = openTags.get(tagName); + + if (tagList.size() > 0) { + int last = tagList.size() - 1; + + BeginTag openTag = tagList.get(last); + ClosedTag closedTag = new ClosedTag(openTag, position, tokens.size()); + closedTags.add(closedTag); + + tagList.remove(last); + + // switch out of Do not tokenize mode. + if (!tokenizeTagContent) { + tokenizeTagContent = true; + } + } + + } + + protected int indexOfNonSpace(int start) { + if (start < 0) { + return Integer.MIN_VALUE; + } + for (int i = start; i < text.length(); i++) { + char c = text.charAt(i); + if (!Character.isSpaceChar(c)) { + return i; + } + } + + return Integer.MIN_VALUE; + } + + protected int indexOfEndAttribute(int start, int tagEnd) { + if (start < 0) { + return Integer.MIN_VALUE; // attribute ends at the first non-quoted space, or + // the first '>'. + } + boolean inQuote = false; + boolean lastEscape = false; + + for (int i = start; i <= tagEnd; i++) { + char c = text.charAt(i); + + if ((c == '\"' || c == '\'') && !lastEscape) { + inQuote = !inQuote; + if (!inQuote) { + return i; + } + } else if (!inQuote && (Character.isSpaceChar(c) || c == '>')) { + return i; + } else if (c == '\\' && !lastEscape) { + lastEscape = true; + } else { + lastEscape = false; + } + } + + return Integer.MIN_VALUE; + } + + protected int indexOfSpace(int start) { + if (start < 0) { + return Integer.MIN_VALUE; + } + for (int i = start; i < text.length(); i++) { + char c = text.charAt(i); + if (Character.isSpaceChar(c)) { + return i; + } + } + + return Integer.MIN_VALUE; + } + + protected int indexOfEquals(int start, int end) { + if (start < 0) { + return Integer.MIN_VALUE; + } + for (int i = start; i < end; i++) { + char c = text.charAt(i); + if (c == '=') { + return i; + } + } + + return Integer.MIN_VALUE; + } + + protected void parseBeginTag() { + // 1. read the name, skipping the '<' + int i; + + for (i = position + 1; i < text.length(); i++) { + char c = text.charAt(i); + if (Character.isSpaceChar(c) || c == '>') { + break; + } + } + + String tagName = text.substring(position + 1, i).toLowerCase(); + + // 2. 
read attr pairs + i = indexOfNonSpace(i); + int tagEnd = text.indexOf(">", i + 1); + boolean closeIt = false; + + HashMap attributes = new HashMap(); + while (i < tagEnd && i >= 0 && tagEnd >= 0) { + // scan ahead for non space + int start = indexOfNonSpace(i); + + if (start > 0) { + if (text.charAt(start) == '>') { + i = start; + break; + } else if (text.charAt(start) == '/' + && text.length() > start + 1 + && text.charAt(start + 1) == '>') { + i = start + 1; + closeIt = true; + break; + } + } + + int end = indexOfEndAttribute(start, tagEnd); + int equals = indexOfEquals(start, end); + + // try to find an equals sign + if (equals < 0 || equals == start || end == equals) { + // if there's no equals, try to move to the next thing + if (end < 0) { + i = tagEnd; + break; + } else { + i = end; + continue; + } + } + + // there is an equals, so try to parse the value + int startKey = start; + int endKey = equals; + + int startValue = equals + 1; + int endValue = end; + + if (text.charAt(startValue) == '\"' || text.charAt(startValue) == '\'') { + startValue++; + } + if (startValue >= endValue || startKey >= endKey) { + i = end; + continue; + } + + String key = text.substring(startKey, endKey); + String value = text.substring(startValue, endValue); + + attributes.put(key.toLowerCase(), value); + + if (end >= text.length()) { + endParsing(); + break; + } + + if (text.charAt(end) == '\"' || text.charAt(end) == '\'') { + end++; + } + + i = end; + } + + position = i; + + if (!ignoredTags.contains(tagName)) { + BeginTag tag = new BeginTag(tagName, attributes, position + 1, tokens.size()); + + if (!openTags.containsKey(tagName)) { + ArrayList tagList = new ArrayList<>(); + tagList.add(tag); + openTags.put(tagName, tagList); + } else { + openTags.get(tagName).add(tag); + } + + if (attributes.containsKey("tokenizetagcontent") && !closeIt) { + String parseAttr = attributes.get("tokenizetagcontent"); + try { + boolean tokenize = Boolean.parseBoolean(parseAttr); + tokenizeTagContent = tokenize; + } catch (Exception e) { + } + } + + if (closeIt) { + closeTag(tagName); + } + } else if (!closeIt) { + ignoreUntil = tagName; + } + + } + + protected void endParsing() { + position = text.length(); + } + + public static String processToken(String t) { + return tokenComplexFix(t); + } + + protected void onSplit() { + if (position - lastSplit > 1) { + int start = lastSplit + 1; + String token = text.substring(start, position); + StringStatus status = checkTokenStatus(token); + + switch (status) { + case NeedsSimpleFix: + token = tokenSimpleFix(token); + break; + + case NeedsComplexFix: + token = tokenComplexFix(token); + break; + + case NeedsAcronymProcessing: + tokenAcronymProcessing(token, start, position); + break; + + case Clean: + // do nothing + break; + } + + if (status != StringStatus.NeedsAcronymProcessing) { + addToken(token, start, position); + } + } + + lastSplit = position; + } + + public static byte[] StringToBytes(String word) { + try { + return word.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("UTF-8 is not supported by your Java Virtual Machine."); + } + } + + /** + * Adds a token to the document object. This method currently drops tokens + * longer than 100 bytes long right now. + * + * @param token The token to add. + * @param start The starting byte offset of the token in the document text. + * @param end The ending byte offset of the token in the document text. 
+ */ + protected void addToken(final String token, int start, int end) { + final int maxTokenLength = 100; + // zero length tokens aren't interesting + if (token.length() <= 0) { + return; + } + // we want to make sure the token is short enough that someone + // might actually type it. UTF-8 can expand one character to 6 bytes. + if (token.length() > maxTokenLength / 6 + && StringToBytes(token).length >= maxTokenLength) { + return; + } + tokens.add(token); + tokenPositions.add(new Pair(start, end)); + } + + protected static String tokenComplexFix(String token) { + token = tokenSimpleFix(token); + token = token.toLowerCase(); + + return token; + } + + /** + * This method does three kinds of processing: + *
* <ul>
+ *  <li>If the token contains periods at the beginning or the end,
+ *      they are removed.</li>
+ *  <li>If the token contains single letters followed by periods, such
+ *      as I.B.M., C.I.A., or U.S.A., the periods are removed.</li>
+ *  <li>If, instead, the token contains longer strings of text with
+ *      periods in the middle, the token is split into
+ *      smaller tokens ("umass.edu" becomes {"umass", "edu"}). Notice
+ *      that this means ("ph.d." becomes {"ph", "d"}).</li>
+ * </ul>
+ * + * @param token + * @param start + * @param end + */ + protected void tokenAcronymProcessing(String token, int start, int end) { + token = tokenComplexFix(token); + + // remove start and ending periods + while (token.startsWith(".")) { + token = token.substring(1); + start = start + 1; + } + + while (token.endsWith(".")) { + token = token.substring(0, token.length() - 1); + end -= 1; + } + + // does the token have any periods left? + if (token.indexOf('.') >= 0) { + // is this an acronym? then there will be periods + // at odd positions: + boolean isAcronym = token.length() > 0; + for (int pos = 1; pos < token.length(); pos += 2) { + if (token.charAt(pos) != '.') { + isAcronym = false; + } + } + + if (isAcronym) { + token = token.replace(".", ""); + addToken(token, start, end); + } else { + int s = 0; + for (int e = 0; e < token.length(); e++) { + if (token.charAt(e) == '.') { + if (e - s > 1) { + String subtoken = token.substring(s, e); + addToken(subtoken, start + s, start + e); + } + s = e + 1; + } + } + + if (token.length() - s > 1) { + String subtoken = token.substring(s); + addToken(subtoken, start + s, end); + } + } + } else { + addToken(token, start, end); + } + } + + /** + * Scans through the token, removing apostrophes and converting + * uppercase to lowercase letters. + * + * @param token + * @return + */ + protected static String tokenSimpleFix(String token) { + char[] chars = token.toCharArray(); + int j = 0; + + for (int i = 0; i < chars.length; i++) { + char c = chars[i]; + boolean isAsciiUppercase = (c >= 'A' && c <= 'Z'); + boolean isApostrophe = (c == '\''); + + if (isAsciiUppercase) { + chars[j] = (char) (chars[i] + 'a' - 'A'); + } else if (isApostrophe) { + // it's an apostrophe, skip it + j--; + } else { + chars[j] = chars[i]; + } + + j++; + } + + token = new String(chars, 0, j); + return token; + } + + /** + * This method scans the token, looking for uppercase characters and + * special characters. If the token contains only numbers and lowercase + * letters, it needs no further processing, and it returns Clean. + * If it also contains uppercase letters or apostrophes, it returns + * NeedsSimpleFix. If it contains special characters (especially Unicode + * characters), it returns NeedsComplexFix. Finally, if any periods are + * present, this returns NeedsAcronymProcessing. 
+ * + * @param token + * @return + */ + protected StringStatus checkTokenStatus(final String token) { + StringStatus status = StringStatus.Clean; + char[] chars = token.toCharArray(); + + for (int i = 0; i < chars.length; i++) { + char c = chars[i]; + boolean isAsciiLowercase = (c >= 'a' && c <= 'z'); + boolean isAsciiNumber = (c >= '0' && c <= '9'); + + if (isAsciiLowercase || isAsciiNumber) { + continue; + } + boolean isAsciiUppercase = (c >= 'A' && c <= 'Z'); + boolean isPeriod = (c == '.'); + boolean isApostrophe = (c == '\''); + + if ((isAsciiUppercase || isApostrophe) && status == StringStatus.Clean) { + status = StringStatus.NeedsSimpleFix; + } else if (!isPeriod) { + status = StringStatus.NeedsComplexFix; + } else { + status = StringStatus.NeedsAcronymProcessing; + break; + } + } + + return status; + } + + protected void onStartBracket() { + if (position + 1 < text.length()) { + char c = text.charAt(position + 1); + + if (c == '/') { + parseEndTag(); + } else if (c == '!') { + skipComment(); + } else if (c == '?') { + skipProcessingInstruction(); + } else { + parseBeginTag(); + } + } else { + endParsing(); + } + + lastSplit = position; + } + + /** + * Translates tags from the internal ClosedTag format to the + * Tag type. Uses the whitelist in the tokenizer to omit tags + * that are not matched by any patterns in the whitelist + */ + protected ArrayList coalesceTags() { + ArrayList result = new ArrayList(); + + // close all open tags + for (ArrayList tagList : openTags.values()) { + for (BeginTag tag : tagList) { + for (Pattern p : whitelist) { + if (p.matcher(tag.name).matches()) { + result.add(new Tag(tag.name, tag.attributes, tag.termPosition, tag.termPosition, tag.bytePosition, tag.bytePosition)); + break; + } + } + } + } + + for (ClosedTag tag : closedTags) { + for (Pattern p : whitelist) { + if (p.matcher(tag.name).matches()) { + result.add(new Tag(tag.name, tag.attributes, tag.termStart, tag.termEnd, tag.byteStart, tag.byteEnd)); + break; + } + } + } + + Collections.sort(result); + return result; + } + + /** + * Parses the text in the document.text attribute and fills in the + * document.terms and document.tags arrays. + * + * @param document + */ + public void tokenize(Document document) { + reset(); + text = document.text; + + try { + // this loop is looking for tags, split characters, and XML escapes, + // which start with ampersands. All other characters are assumed to + // be word characters. The onSplit() method takes care of extracting + // word text and storing it in the terms array. The onStartBracket + // method parses tags. ignoreUntil is used to ignore comments and + // script data. 
+ for (; position >= 0 && position < text.length(); position++) { + char c = text.charAt(position); + + if (ignoreUntil != null) { + continue; + } else if (c < 256 && splits[c] && tokenizeTagContent) { + onSplit(); + } + } + } catch (Exception e) { + Logger.getLogger(getClass().toString()).log(Level.WARNING, + "Parse failure: " + document.name); + } + + if (ignoreUntil == null) { + onSplit(); + } + document.terms = new ArrayList(this.tokens); + for (Pair p : this.tokenPositions) { + document.termCharBegin.add(p.start); + document.termCharEnd.add(p.end); + } + document.tags = coalesceTags(); + } + + public ArrayList getTokenPositions() { + return this.tokenPositions; + } +} diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 9c583c4..4a05847 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -312,7 +312,7 @@ object PassimApp { } implicit class TextTokenizer(df: DataFrame) { val tokenizeCol = udf {(text: String) => - val tok = new passim.TagTokenizer() + val tok = new passim.PlainTokenizer() var d = new passim.Document("raw", text) tok.tokenize(d) From 010ad85a38947052b13454965fa66708dad932f4 Mon Sep 17 00:00:00 2001 From: David Smith Date: Mon, 26 Aug 2019 17:09:42 -0400 Subject: [PATCH 16/39] More conservative token offsets in passage edge alignment. --- src/main/scala/PassimApp.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 4a05847..9931a2d 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -155,8 +155,8 @@ object PassFun { var (res1, res2) = (idx1, idx2) val pad = " this text is long and should match " val ps = pad count { _ == ' ' } - val t1 = if ( anchor == "L" ) (pad + text1) else (text1 + pad) - val t2 = if ( anchor == "L" ) (pad + text2) else (text2 + pad) + val t1 = if ( anchor == "L" ) (pad + text1 + " ") else (" " + text1 + pad) + val t2 = if ( anchor == "L" ) (pad + text2 + " ") else (" " + text2 + pad) val alg = jaligner.SmithWatermanGotoh.align(new Sequence(t1), new Sequence(t2), matchMatrix, 5.0f, 0.5f) val s1 = alg.getSequence1() @@ -167,13 +167,13 @@ object PassFun { if ( s1.size > 0 && s2.size > 0 && extra > 2 ) { if ( anchor == "L" ) { if ( alg.getStart1() == 0 && alg.getStart2() == 0 ) { - res1 += s1.count(_ == ' ') - (if (s1(s1.size - 1) == ' ') 1 else 0) - ps + 1 - res2 += s2.count(_ == ' ') - (if (s2(s2.size - 1) == ' ') 1 else 0) - ps + 1 + res1 += s1.count(_ == ' ') - ps + res2 += s2.count(_ == ' ') - ps } } else if ( anchor == "R" ) { if ( alg.getStart1() + len1 >= t1.size && alg.getStart2() + len2 >= t2.size ) { - res1 -= s1.count(_ == ' ') - (if (s1(0) == ' ') 1 else 0) - ps + 1 - res2 -= s2.count(_ == ' ') - (if (s2(0) == ' ') 1 else 0) - ps + 1 + res1 -= s1.count(_ == ' ') - ps + res2 -= s2.count(_ == ' ') - ps } } } From 160b70667597fd6109675e85dcc1881359b1ecb6 Mon Sep 17 00:00:00 2001 From: David Smith Date: Mon, 26 Aug 2019 17:13:22 -0400 Subject: [PATCH 17/39] Include token offsets in cluster output. 
--- src/main/scala/PassimApp.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 9931a2d..4778734 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -526,9 +526,11 @@ transform($pageCol, val joint = clusters .join(corpus.drop("terms"), "uid") - .withColumn("begin", 'termCharBegin('begin)) + .withColumnRenamed("begin", "bw") + .withColumnRenamed("end", "ew") + .withColumn("begin", 'termCharBegin('bw)) .withColumn("end", - when('end < size('termCharBegin), 'termCharBegin('end)).otherwise(length('text))) + when('ew < size('termCharBegin), 'termCharBegin('ew)).otherwise(length('text))) .drop("termCharBegin", "termCharEnd") .withColumn(config.text, getPassage(col(config.text), 'begin, 'end)) .selectRegions("pages") From 8e705932ffe0960308c3495f4f151a594495d696 Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 21 May 2019 14:46:31 +0200 Subject: [PATCH 18/39] Pick a single source for boilerplate. --- src/main/scala/PassimApp.scala | 190 ++++++++++++++++++++++----------- 1 file changed, 129 insertions(+), 61 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 4778734..f3b7281 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -77,24 +77,24 @@ case class Span(val begin: Int, val end: Int) { case class Post(feat: Long, tf: Int, post: Int) -case class IdSeries(id: Long, series: Long) - case class PassAlign(id1: String, id2: String, s1: String, s2: String, b1: Int, e1: Int, n1: Int, b2: Int, e2: Int, n2: Int, matches: Int, score: Float) case class AlignedStrings(s1: String, s2: String, matches: Int, score: Float) -case class NewDoc(id: String, text: String, pages: Seq[Page], aligned: Boolean) - case class LinkedSpan(span: Span, links: ArrayBuffer[Long]) case class ExtentPair(seq1: Int, seq2: Int, begin1: Int, begin2: Int, end1: Int, end2: Int, tok1: Int, tok2: Int) case class WitInfo(start: Int, length: Int, begin: Int, text: String) +case class SpanPair(b1: Int, e1: Int, b2: Int, e2: Int) + case class LineInfo(start: Int, text: String) +case class NewDoc(id: String, text: String, pages: Seq[Page], span: SpanPair) + object PassFun { def increasingMatches(matches: Iterable[(Int,Int,Int)]): Array[(Int,Int,Int)] = { val in = matches.toArray.sorted @@ -388,6 +388,7 @@ transform($pageCol, } } + // TODO: Make minLines a parameter val alignedPassages = udf { (s1: String, s2: String) => var start = 0 var b1 = 0 @@ -460,43 +461,46 @@ transform($pageCol, p } - val splitDoc = udf { (id: String, text: String, pages: Seq[Row], - begin: Seq[Int], end: Seq[Int]) => + val splitDoc = udf { (id: String, text: String, pages: Seq[Row], spans: Seq[Row]) => val pp = if ( pages == null ) Array[Page]() // Try doesn't catch nulls else Try(pages.map(PassimApp.rowToPage).toArray).getOrElse(Array[Page]()) val reg = if ( pp.size == 0 ) Array[Region]() else pp(0).regions val docs = new ArrayBuffer[NewDoc] - if ( begin == null || begin.size <= 0 ) { - docs += NewDoc(id, text, pp, false) + if ( spans == null || spans.size <= 0 ) { + docs += NewDoc(id, text, pp, null) } else { var start = 0 var breg = 0 var ereg = 0 - for ( i <- 0 until begin.size ) { - if ( (begin(i) - start) >= 2 ) { - // Should check that this document is more than just a few whitespace characters - while ( ereg < reg.size && reg(ereg).start < begin(i) ) ereg += 1 - docs += NewDoc(id + "_" + start + "_" + begin(i), - text.substring(start, begin(i)), - 
subpage(pp, reg.slice(breg, ereg).map(_.offset(-start))), - false) - breg = ereg + for ( span <- spans ) { + span match { + case Row(b1: Int, e1: Int, b2: Int, e2: Int) => + if ( (b1 - start) >= 2 ) { + // Should check that this document is more than just a few whitespace characters + while ( ereg < reg.size && reg(ereg).start < b1 ) ereg += 1 + docs += NewDoc(id + "_" + start + "_" + b1, + text.substring(start, b1), + subpage(pp, reg.slice(breg, ereg).map(_.offset(-start))), + null) + breg = ereg + } + while ( ereg < reg.size && reg(ereg).start < e1 ) ereg += 1 + docs += NewDoc(id + "_" + b1 + "_" + e1, + text.substring(b1, e1), + subpage(pp, reg.slice(breg, ereg).map(_.offset(-b1))), + SpanPair(b1, e1, b2, e2)) + breg = ereg + start = e1 } - while ( ereg < reg.size && reg(ereg).start < end(i) ) ereg += 1 - docs += NewDoc(id + "_" + begin(i) + "_" + end(i), - text.substring(begin(i), end(i)), - subpage(pp, reg.slice(breg, ereg).map(_.offset(-begin(i)))), - true) - breg = ereg - start = end(i) } - if ( (text.size - end.last) >= 2 ) { + val lastend = spans.last.getInt(1) + if ( (text.size - lastend) >= 2 ) { if ( ereg < reg.size ) ereg = reg.size - docs += NewDoc(id + "_" + end.last + "_" + text.size, - text.substring(end.last, text.size), - subpage(pp, reg.slice(breg, ereg).map(_.offset(-end.last))), - false) + docs += NewDoc(id + "_" + lastend + "_" + text.size, + text.substring(lastend, text.size), + subpage(pp, reg.slice(breg, ereg).map(_.offset(-lastend))), + null) } } docs.toArray @@ -504,20 +508,62 @@ transform($pageCol, def boilerSplit(passages: DataFrame, raw: DataFrame): DataFrame = { import passages.sparkSession.implicits._ val pageField = if ( raw.columns.contains("pages") ) "pages" else "null" + val srcSpans = udf { (lines: Seq[Row]) => + val res = ListBuffer[SpanPair]() + var curb1 = -1 + var cure1 = -1 + var curb2 = -1 + var cure2 = -1 + for ( cur <- lines ) { + cur match { + case Row(b1: Int, len1: Int, b2: Int, len2: Int) => + val e1 = b1 + len1 + val e2 = b2 + len2 + if ( b1 > cure1 || b2 > cure2 ) { + if ( curb1 > -1 ) { + res += SpanPair(curb1, cure1, curb2, cure2) + } + curb1 = b1 + cure1 = e1 + curb2 = b2 + cure2 = e2 + } else { + cure1 = e1 + cure2 = e2 + } + } + } + if ( curb1 > -1 ) { + res += SpanPair(curb1, cure1, curb2, cure2) + } + res.toSeq + } + // TODO: Just pick one and split the destination document. BUT! If + // we point to a span in the source document, but the source + // document itself gets split, what do we do? We don't even need + // to assume errors in the alignment: If A copies two ads from + // different sources and then puts them side by side, and then B + // copies both ads, then B will be segmented, and point to two + // segments of A. Do we have to resolve this at the line level? 
passages - .select('id2 as "id", 'b2 as "begin", 'e2 as "end") + .groupBy("id", "start", "length") + .agg(max("wit") as "src") + .groupBy($"id", $"src.id" as "src") + .agg(sort_array(collect_list(struct($"start", $"length", + $"src.begin" as "sstart", length($"src.text") as "slength"))) as "lines") .groupBy("id") - .agg(mergeAligned(collect_list("begin"), collect_list("end")) as "spans") - .select('id, $"spans._1" as "begin", $"spans._2" as "end") + .agg(max(struct(size($"lines") as "count", $"src" as "id", $"lines")) as "src") + .select($"id", $"src.id" as "src", srcSpans($"src.lines") as "spans") .join(raw, Seq("id"), "right_outer") - .withColumn("subdoc", explode(splitDoc('id, 'text, expr(pageField), 'begin, 'end))) - .drop("begin", "end") - .withColumnRenamed("id", "docid") + .withColumn("subdoc", explode(splitDoc('id, 'text, expr(pageField), 'spans))) + .withColumn("src", when($"subdoc.span".isNull, null).otherwise(struct('src as "id", + $"subdoc.span.b2" as "start", $"subdoc.span.e2" - $"subdoc.span.b2" as "length"))) + .withColumn("doc", when($"subdoc.span".isNull, null).otherwise(struct('id, + $"subdoc.span.b1" as "start", $"subdoc.span.e1" - $"subdoc.span.b1" as "length"))) .withColumn("id", $"subdoc.id") .withColumn("text", $"subdoc.text") .withColumn("pages", $"subdoc.pages") - .withColumn("aligned", $"subdoc.aligned") - .drop("subdoc") + .drop("subdoc", "spans") } def clusterJoin(config: Config, clusters: DataFrame, corpus: DataFrame): DataFrame = { import clusters.sparkSession.implicits._ @@ -698,6 +744,18 @@ transform($pageCol, } def boilerPassages(config: Config, corpus: DataFrame): DataFrame = { import align.sparkSession.implicits._ + val lineRecord = udf { + (b1: Int, b2: Int, pairs: Seq[Row]) => + var off1 = b1 + var off2 = b2 + pairs.map { (p: Row) => + val s1 = p.getString(0) + val s2 = p.getString(1) + off2 += s2.length + off1 += s1.length + WitInfo(off2 - s2.length, s2.length, off1 - s1.length, s1) + } + } val alignStrings = makeStringAligner(config, openGap = 1) val metaFields = ListBuffer[String]() if ( corpus.columns.contains("date") ) metaFields += "date" @@ -706,9 +764,10 @@ transform($pageCol, .join(corpus.select('uid, col(config.id) as "id", col(config.text) as "text", struct(metaFields.toList.map(expr):_*) as "meta", 'termCharBegin, 'termCharEnd), "uid") - .withColumn("begin", 'termCharBegin('begin)) + .withColumn("begin", lineStart('text, 'termCharBegin('begin))) .withColumn("end", - when('end < size('termCharBegin), 'termCharBegin('end)).otherwise(length('text))) + lineStop('text, + when('end < size('termCharBegin), 'termCharBegin('end)).otherwise(length('text)))) .select('mid, struct('first, 'id, 'meta, 'begin, 'end, getPassage('text, 'begin, 'end) as "text") as "info") .groupBy("mid") @@ -722,6 +781,10 @@ transform($pageCol, "info[0].begin + pass._1.end as e1", "info[1].begin + pass._2.begin as b2", "info[1].begin + pass._2.end as e2") + .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", + explode(lineRecord('b1, 'b2, 'pairs)) as "wit") + .select('id, $"wit.start", $"wit.length", + struct('meta, 'src as "id", $"wit.begin", $"wit.text") as "wit") } def aggregateAlignments(config: Config, corpus: DataFrame, extents: DataFrame): DataFrame = { import align.sparkSession.implicits._ @@ -934,6 +997,26 @@ transform($pageCol, Math.max(0, Math.min(terms.size, end))).mkString(" ") } val getPassage = udf { (text: String, begin: Int, end: Int) => text.substring(begin, end) } + val lineStart = udf { (text: String, begin: Int) => + var start = begin + while ( start > 
0 && (begin - start) < 20 && text.charAt(start - 1) != '\n' ) { + start -= 1 + } + if ( start > 0 && text.charAt(start - 1) != '\n' ) + begin + else + start + } + val lineStop = udf { (text: String, end: Int) => + var stop = end + while ( stop < text.length && (stop - end) < 20 && text.charAt(stop - 1) != '\n' ) { + stop += 1 + } + if ( stop < text.length && text.charAt(stop - 1) != '\n' ) + end + else + stop + } def hdfsExists(spark: SparkSession, path: String) = { val hdfsPath = new Path(path) val fs = hdfsPath.getFileSystem(spark.sparkContext.hadoopConfiguration) @@ -947,7 +1030,7 @@ transform($pageCol, .set("spark.driver.maxResultSize", "4g") .registerKryoClasses(Array(classOf[Coords], classOf[Region], classOf[Span], classOf[Post], classOf[PassAlign], - classOf[TokText], classOf[IdSeries],classOf[ExtentPair])) + classOf[TokText], classOf[ExtentPair])) val spark = SparkSession .builder() @@ -1181,20 +1264,11 @@ transform($pageCol, } if ( config.boilerplate || config.docwise ) { - val pass = extents.boilerPassages(config, corpus) + if ( !hdfsExists(spark, passFname) ) { + extents.boilerPassages(config, corpus).write.parquet(passFname) + } + val pass = spark.read.parquet(passFname) if ( config.docwise ) { - val lineRecord = udf { - (b1: Int, b2: Int, pairs: Seq[Row]) => - var off1 = b1 - var off2 = b2 - pairs.map { (p: Row) => - val s1 = p.getString(0) - val s2 = p.getString(1) - off2 += s2.length - off1 += s1.length - WitInfo(off2 - s2.length, s2.length, off1 - s1.length, s1) - } - } val textLines = udf { (text: String) => val res = ListBuffer[LineInfo]() var off = 0 @@ -1204,11 +1278,7 @@ transform($pageCol, } res.toSeq } - pass - .select('id2 as "id", 'id1 as "id1", 'meta1 as "meta", - explode(lineRecord('b1, 'b2, 'pairs)) as "wit") - .select('id, $"wit.start", $"wit.length", - struct('meta, 'id1 as "id", $"wit.begin", $"wit.text") as "wit") + pass // should include target text offset to support later correction .groupBy("id", "start", "length") .agg(sort_array(collect_list("wit")) as "wits") .groupBy("id") @@ -1221,9 +1291,7 @@ transform($pageCol, .drop("tlines", "mvars", "variants") .write.format(config.outputFormat).save(outFname) } else { - pass.drop("pairs").write.parquet(passFname) - boilerSplit(spark.read.parquet(passFname), raw) - .write.format(config.outputFormat).save(outFname) + boilerSplit(pass, raw).write.format(config.outputFormat).save(outFname) } sys.exit(0) } From 995b0014c64cdc2ed9487e8e08b3d39d05ad52ed Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 21 May 2019 15:48:36 +0200 Subject: [PATCH 19/39] Fix references to full doc. 
--- src/main/scala/PassimApp.scala | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index f3b7281..e63f5fb 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -93,7 +93,7 @@ case class SpanPair(b1: Int, e1: Int, b2: Int, e2: Int) case class LineInfo(start: Int, text: String) -case class NewDoc(id: String, text: String, pages: Seq[Page], span: SpanPair) +case class NewDoc(id: String, text: String, pages: Seq[Page], docspan: Span, srcspan: Span) object PassFun { def increasingMatches(matches: Iterable[(Int,Int,Int)]): Array[(Int,Int,Int)] = { @@ -468,7 +468,7 @@ transform($pageCol, val reg = if ( pp.size == 0 ) Array[Region]() else pp(0).regions val docs = new ArrayBuffer[NewDoc] if ( spans == null || spans.size <= 0 ) { - docs += NewDoc(id, text, pp, null) + docs += NewDoc(id, text, pp, null, null) } else { var start = 0 var breg = 0 @@ -482,14 +482,14 @@ transform($pageCol, docs += NewDoc(id + "_" + start + "_" + b1, text.substring(start, b1), subpage(pp, reg.slice(breg, ereg).map(_.offset(-start))), - null) + Span(start, b1), null) breg = ereg } while ( ereg < reg.size && reg(ereg).start < e1 ) ereg += 1 docs += NewDoc(id + "_" + b1 + "_" + e1, text.substring(b1, e1), subpage(pp, reg.slice(breg, ereg).map(_.offset(-b1))), - SpanPair(b1, e1, b2, e2)) + Span(b1, e1), Span(b2, e2)) breg = ereg start = e1 } @@ -500,7 +500,7 @@ transform($pageCol, docs += NewDoc(id + "_" + lastend + "_" + text.size, text.substring(lastend, text.size), subpage(pp, reg.slice(breg, ereg).map(_.offset(-lastend))), - null) + Span(lastend, text.size), null) } } docs.toArray @@ -556,10 +556,12 @@ transform($pageCol, .select($"id", $"src.id" as "src", srcSpans($"src.lines") as "spans") .join(raw, Seq("id"), "right_outer") .withColumn("subdoc", explode(splitDoc('id, 'text, expr(pageField), 'spans))) - .withColumn("src", when($"subdoc.span".isNull, null).otherwise(struct('src as "id", - $"subdoc.span.b2" as "start", $"subdoc.span.e2" - $"subdoc.span.b2" as "length"))) - .withColumn("doc", when($"subdoc.span".isNull, null).otherwise(struct('id, - $"subdoc.span.b1" as "start", $"subdoc.span.e1" - $"subdoc.span.b1" as "length"))) + .withColumn("src", when($"subdoc.srcspan".isNull, null).otherwise(struct('src as "id", + $"subdoc.srcspan.begin" as "start", + $"subdoc.srcspan.end" - $"subdoc.srcspan.begin" as "length"))) + .withColumn("doc", when($"subdoc.docspan".isNull, null).otherwise(struct('id, + $"subdoc.docspan.begin" as "start", + $"subdoc.docspan.end" - $"subdoc.docspan.begin" as "length"))) .withColumn("id", $"subdoc.id") .withColumn("text", $"subdoc.text") .withColumn("pages", $"subdoc.pages") From 78d836aa2c17e421a7532cdf083102184188966b Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 21 May 2019 11:47:06 -0400 Subject: [PATCH 20/39] Check for out-of-order boilerplate splits. 
--- src/main/scala/PassimApp.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index e63f5fb..09b0129 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -519,7 +519,7 @@ transform($pageCol, case Row(b1: Int, len1: Int, b2: Int, len2: Int) => val e1 = b1 + len1 val e2 = b2 + len2 - if ( b1 > cure1 || b2 > cure2 ) { + if ( b1 > cure1 || b2 > cure2 || e2 < cure2 ) { if ( curb1 > -1 ) { res += SpanPair(curb1, cure1, curb2, cure2) } From 63cc02a7d4acfa326a995054624f68fee8e95b4f Mon Sep 17 00:00:00 2001 From: David Smith Date: Wed, 22 May 2019 05:53:36 -0400 Subject: [PATCH 21/39] Make minLines a parameter. --- src/main/scala/PassimApp.scala | 126 +++++++++++++++------------------ 1 file changed, 59 insertions(+), 67 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 09b0129..9875ad5 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -24,6 +24,7 @@ case class Config(version: String = BuildInfo.version, labelPropagation: Boolean = false, n: Int = 5, minDF: Int = 2, maxDF: Int = 100, minRep: Int = 5, minAlg: Int = 20, gap: Int = 100, relOver: Double = 0.8, mergeDiverge: Double = 0.3, maxRep: Int = 10, + minLines: Int = 5, context: Int = 0, wordLength: Double = 2, pairwise: Boolean = false, @@ -388,72 +389,6 @@ transform($pageCol, } } - // TODO: Make minLines a parameter - val alignedPassages = udf { (s1: String, s2: String) => - var start = 0 - var b1 = 0 - var b2 = 0 - val buf = ArrayBuffer[(Int, Double, Int, Int, String, String)]() - for ( end <- 1 until s2.size ) { - if ( s2(end) == '\n' ) { - val alg1 = s1.substring(start, end+1) - val alg2 = s2.substring(start, end+1) - val t1 = alg1.replaceAll("-", "").replaceAll("\u2010", "-") - val t2 = alg2.replaceAll("-", "").replaceAll("\u2010", "-") - - val matches = alg1.zip(alg2).count(x => x._1 == x._2) - buf += ((t2.size - t1.size, matches * 1.0 / t2.size, b1, b2, t1, t2)) - start = end + 1 - b1 += t1.size - b2 += t2.size - } - } - val lines = buf.toArray - - val minLines = 5 - - val pass = ArrayBuffer[(Span, Span, Array[(String, String)])]() - val pairs = ArrayBuffer[(String, String)]() - var i = 0 - start = 0 - while ( i < lines.size ) { - if ( lines(i)._1.abs > 20 || lines(i)._2 < 0.1 ) { - if ( start < i - && (i + 2) < lines.size - && lines(i+1)._1.abs <= 20 && lines(i+1)._2 >= 0.1 - && (lines(i+1)._3 - lines(i)._3) <= 20 - && (lines(i+1)._4 - lines(i)._4) <= 20 ) { - // continue passage - pairs += ((lines(i)._5, lines(i)._6)) - } else { - if ( (i - start) >= minLines ) { - pass += ((Span(lines(start)._3, lines(i)._3), - Span(lines(start)._4, lines(i)._4), - pairs.toArray)) - } - start = i + 1 - pairs.clear - } - } else { - pairs += ((lines(i)._5, lines(i)._6)) - } - i += 1 - } - if ( (i - start) >= minLines ) { - pass += ((Span(lines(start)._3, lines(lines.size - 1)._3), - Span(lines(start)._4, lines(lines.size - 1)._4), - pairs.toArray)) - } - pass.toSeq - } - - val mergeAligned = udf { (begins: Seq[Int], ends: Seq[Int]) => - val spans = PassFun.mergeSpansLR(0, begins.zip(ends).map(x => Span(x._1, x._2)) - .zip(Range(0, begins.size).map(_.toLong))) - .map(_._1) // TODO? merge nearly adjacent? 
- (spans.map(_.begin), spans.map(_.end)) // unzip - } - def subpage(p: Seq[Page], r: Array[Region]) = { if ( p.size > 0 ) Seq(p(0).copy(regions = r)) @@ -746,6 +681,61 @@ transform($pageCol, } def boilerPassages(config: Config, corpus: DataFrame): DataFrame = { import align.sparkSession.implicits._ + val alignedPassages = udf { (s1: String, s2: String) => + var start = 0 + var b1 = 0 + var b2 = 0 + val buf = ArrayBuffer[(Int, Double, Int, Int, String, String)]() + for ( end <- 1 until s2.size ) { + if ( s2(end) == '\n' ) { + val alg1 = s1.substring(start, end+1) + val alg2 = s2.substring(start, end+1) + val t1 = alg1.replaceAll("-", "").replaceAll("\u2010", "-") + val t2 = alg2.replaceAll("-", "").replaceAll("\u2010", "-") + + val matches = alg1.zip(alg2).count(x => x._1 == x._2) + buf += ((t2.size - t1.size, matches * 1.0 / t2.size, b1, b2, t1, t2)) + start = end + 1 + b1 += t1.size + b2 += t2.size + } + } + val lines = buf.toArray + + val pass = ArrayBuffer[(Span, Span, Array[(String, String)])]() + val pairs = ArrayBuffer[(String, String)]() + var i = 0 + start = 0 + while ( i < lines.size ) { + if ( lines(i)._1.abs > 20 || lines(i)._2 < 0.1 ) { + if ( start < i + && (i + 2) < lines.size + && lines(i+1)._1.abs <= 20 && lines(i+1)._2 >= 0.1 + && (lines(i+1)._3 - lines(i)._3) <= 20 + && (lines(i+1)._4 - lines(i)._4) <= 20 ) { + // continue passage + pairs += ((lines(i)._5, lines(i)._6)) + } else { + if ( (i - start) >= config.minLines ) { + pass += ((Span(lines(start)._3, lines(i)._3), + Span(lines(start)._4, lines(i)._4), + pairs.toArray)) + } + start = i + 1 + pairs.clear + } + } else { + pairs += ((lines(i)._5, lines(i)._6)) + } + i += 1 + } + if ( (i - start) >= config.minLines ) { + pass += ((Span(lines(start)._3, lines(lines.size - 1)._3), + Span(lines(start)._4, lines(lines.size - 1)._4), + pairs.toArray)) + } + pass.toSeq + } val lineRecord = udf { (b1: Int, b2: Int, pairs: Seq[Row]) => var off1 = b1 @@ -884,7 +874,7 @@ transform($pageCol, } val collectTexts = udf { (texts: Seq[String], seqs: Seq[Int]) => - val textDict = seqs zip texts toMap + val textDict = seqs.zip(texts).toMap var allTexts = "" for (seq <- seqs.sorted) { @@ -1058,6 +1048,8 @@ transform($pageCol, c.copy(minRep = x) } text("Minimum number of n-gram matches between documents; default=5") opt[Int]('a', "min-align") action { (x, c) => c.copy(minAlg = x) } text("Minimum length of alignment; default=20") + opt[Int]('L', "min-lines") action { (x, c) => + c.copy(minLines = x) } text("Minimum number of lines in boilerplate and docwise alignments; default=5") opt[Int]('g', "gap") action { (x, c) => c.copy(gap = x) } text("Minimum size of the gap that separates passages; default=100") opt[Int]('c', "context") action { (x, c) => From 07b51d3cf9520887b5b3059c9e7268d96f57383a Mon Sep 17 00:00:00 2001 From: David Smith Date: Wed, 22 May 2019 07:25:49 -0400 Subject: [PATCH 22/39] Enforce minLines in boilerSplit. 
--- src/main/scala/PassimApp.scala | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 9875ad5..c006e9a 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -440,7 +440,7 @@ transform($pageCol, } docs.toArray } - def boilerSplit(passages: DataFrame, raw: DataFrame): DataFrame = { + def boilerSplit(config: Config, passages: DataFrame, raw: DataFrame): DataFrame = { import passages.sparkSession.implicits._ val pageField = if ( raw.columns.contains("pages") ) "pages" else "null" val srcSpans = udf { (lines: Seq[Row]) => @@ -449,26 +449,29 @@ transform($pageCol, var cure1 = -1 var curb2 = -1 var cure2 = -1 + var lineCount = 0 for ( cur <- lines ) { cur match { case Row(b1: Int, len1: Int, b2: Int, len2: Int) => val e1 = b1 + len1 val e2 = b2 + len2 if ( b1 > cure1 || b2 > cure2 || e2 < cure2 ) { - if ( curb1 > -1 ) { + if ( curb1 > -1 && lineCount > config.minLines ) { res += SpanPair(curb1, cure1, curb2, cure2) } curb1 = b1 cure1 = e1 curb2 = b2 cure2 = e2 + lineCount = 1 } else { cure1 = e1 cure2 = e2 + lineCount += 1 } } } - if ( curb1 > -1 ) { + if ( curb1 > -1 && lineCount > config.minLines ) { res += SpanPair(curb1, cure1, curb2, cure2) } res.toSeq @@ -1285,7 +1288,7 @@ transform($pageCol, .drop("tlines", "mvars", "variants") .write.format(config.outputFormat).save(outFname) } else { - boilerSplit(pass, raw).write.format(config.outputFormat).save(outFname) + boilerSplit(config, pass, raw).write.format(config.outputFormat).save(outFname) } sys.exit(0) } From 0a8a40f79fe020fa15bad9e5e524b5403fd70981 Mon Sep 17 00:00:00 2001 From: David Smith Date: Mon, 1 Jul 2019 14:58:20 +0200 Subject: [PATCH 23/39] Include alignments in witness information. 
--- src/main/scala/PassimApp.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index c006e9a..23875fd 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -88,7 +88,7 @@ case class LinkedSpan(span: Span, links: ArrayBuffer[Long]) case class ExtentPair(seq1: Int, seq2: Int, begin1: Int, begin2: Int, end1: Int, end2: Int, tok1: Int, tok2: Int) -case class WitInfo(start: Int, length: Int, begin: Int, text: String) +case class WitInfo(start: Int, length: Int, begin: Int, text: String, alg1: String, alg2: String) case class SpanPair(b1: Int, e1: Int, b2: Int, e2: Int) @@ -688,7 +688,7 @@ transform($pageCol, var start = 0 var b1 = 0 var b2 = 0 - val buf = ArrayBuffer[(Int, Double, Int, Int, String, String)]() + val buf = ArrayBuffer[(Int, Double, Int, Int, String, String, String, String)]() for ( end <- 1 until s2.size ) { if ( s2(end) == '\n' ) { val alg1 = s1.substring(start, end+1) @@ -697,7 +697,7 @@ transform($pageCol, val t2 = alg2.replaceAll("-", "").replaceAll("\u2010", "-") val matches = alg1.zip(alg2).count(x => x._1 == x._2) - buf += ((t2.size - t1.size, matches * 1.0 / t2.size, b1, b2, t1, t2)) + buf += ((t2.size - t1.size, matches * 1.0 / t2.size, b1, b2, t1, t2, alg1, alg2)) start = end + 1 b1 += t1.size b2 += t2.size @@ -705,8 +705,8 @@ transform($pageCol, } val lines = buf.toArray - val pass = ArrayBuffer[(Span, Span, Array[(String, String)])]() - val pairs = ArrayBuffer[(String, String)]() + val pass = ArrayBuffer[(Span, Span, Array[(String, String, String, String)])]() + val pairs = ArrayBuffer[(String, String, String, String)]() var i = 0 start = 0 while ( i < lines.size ) { @@ -717,7 +717,7 @@ transform($pageCol, && (lines(i+1)._3 - lines(i)._3) <= 20 && (lines(i+1)._4 - lines(i)._4) <= 20 ) { // continue passage - pairs += ((lines(i)._5, lines(i)._6)) + pairs += ((lines(i)._5, lines(i)._6, lines(i)._7, lines(i)._8)) } else { if ( (i - start) >= config.minLines ) { pass += ((Span(lines(start)._3, lines(i)._3), @@ -728,7 +728,7 @@ transform($pageCol, pairs.clear } } else { - pairs += ((lines(i)._5, lines(i)._6)) + pairs += ((lines(i)._5, lines(i)._6, lines(i)._7, lines(i)._8)) } i += 1 } @@ -748,7 +748,7 @@ transform($pageCol, val s2 = p.getString(1) off2 += s2.length off1 += s1.length - WitInfo(off2 - s2.length, s2.length, off1 - s1.length, s1) + WitInfo(off2 - s2.length, s2.length, off1 - s1.length, s1, p.getString(2), p.getString(3)) } } val alignStrings = makeStringAligner(config, openGap = 1) @@ -779,7 +779,7 @@ transform($pageCol, .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", explode(lineRecord('b1, 'b2, 'pairs)) as "wit") .select('id, $"wit.start", $"wit.length", - struct('meta, 'src as "id", $"wit.begin", $"wit.text") as "wit") + struct('meta, 'src as "id", $"wit.begin", $"wit.text", $"wit.alg1", $"wit.alg2") as "wit") } def aggregateAlignments(config: Config, corpus: DataFrame, extents: DataFrame): DataFrame = { import align.sparkSession.implicits._ From 7ad1a86da20ebcbb6d9e0f9f81af0b35c11b88be Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 2 Jul 2019 11:45:56 +0200 Subject: [PATCH 24/39] Include page regions in docwise output. 
--- src/main/scala/PassimApp.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 23875fd..8591e90 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -758,12 +758,13 @@ transform($pageCol, align.drop("gid") .join(corpus.select('uid, col(config.id) as "id", col(config.text) as "text", struct(metaFields.toList.map(expr):_*) as "meta", - 'termCharBegin, 'termCharEnd), "uid") + 'termCharBegin, 'termCharEnd, 'pages), "uid") .withColumn("begin", lineStart('text, 'termCharBegin('begin))) .withColumn("end", lineStop('text, when('end < size('termCharBegin), 'termCharBegin('end)).otherwise(length('text)))) - .select('mid, struct('first, 'id, 'meta, 'begin, 'end, + .withColumn("regions", expr(s"filter(pages[0].regions, r -> r.start < end AND (r.start + r.length) > begin)")) + .select('mid, struct('first, 'id, 'meta, 'regions, 'begin, 'end, getPassage('text, 'begin, 'end) as "text") as "info") .groupBy("mid") .agg(sort_array(collect_list("info"), false) as "info") // "first" == true sorts first @@ -771,15 +772,18 @@ transform($pageCol, .select('info, explode(alignedPassages($"alg.s1", $"alg.s2")) as "pass") .selectExpr("info[0].id as id1", "info[1].id as id2", "info[0].meta as meta1","info[1].meta as meta2", + "info[0].regions as regions1", "pass._3 as pairs", "info[0].begin + pass._1.begin as b1", "info[0].begin + pass._1.end as e1", "info[1].begin + pass._2.begin as b2", "info[1].begin + pass._2.end as e2") - .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", + .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", 'regions1 as "regions", explode(lineRecord('b1, 'b2, 'pairs)) as "wit") .select('id, $"wit.start", $"wit.length", - struct('meta, 'src as "id", $"wit.begin", $"wit.text", $"wit.alg1", $"wit.alg2") as "wit") + struct('meta, + expr("filter(regions, r -> r.start < (wit.begin + length(wit.text)) AND (r.start + r.length) > wit.begin)") as "regions", + 'src as "id", $"wit.begin", $"wit.text", $"wit.alg1", $"wit.alg2") as "wit") } def aggregateAlignments(config: Config, corpus: DataFrame, extents: DataFrame): DataFrame = { import align.sparkSession.implicits._ From 909daa1555e9bef095b472b5537b0cbf36d18e6b Mon Sep 17 00:00:00 2001 From: David Smith Date: Fri, 26 Jul 2019 17:56:37 -0400 Subject: [PATCH 25/39] Add linewise option for OCR training production. 
--- src/main/scala/PassimApp.scala | 74 ++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 7 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 8591e90..1c9ae84 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -29,7 +29,8 @@ case class Config(version: String = BuildInfo.version, wordLength: Double = 2, pairwise: Boolean = false, aggregate: Boolean = false, - docwise: Boolean = false, names: Boolean = false, postings: Boolean = false, + docwise: Boolean = false, linewise: Boolean = false, + names: Boolean = false, postings: Boolean = false, id: String = "id", group: String = "series", text: String = "text", fields: String = "", filterpairs: String = "gid < gid2", inputFormat: String = "json", outputFormat: String = "json", @@ -333,6 +334,27 @@ object PassimApp { .drop("_tokens") } } + def pageBox(pageCol: String): DataFrame = { + val pageFields = df.select(expr(s"inline($pageCol)")).columns + .filter { _ != "regions" }.map { f => s"p.$f as $f" }.mkString(", ") + df.withColumn(pageCol, + expr(s""" +transform($pageCol, + p -> struct($pageFields, + array(aggregate(p.regions, + struct(p.regions[0].start as start, + p.regions[0].length as length, + struct(p.regions[0].coords.x as x, + p.regions[0].coords.y as y, + p.regions[0].coords.w as w, + p.regions[0].coords.h as h) as coords), + (acc, r) -> struct(least(acc.start, r.start) as start, + greatest(acc.start + acc.length, r.start + r.length) - least(acc.start, r.start) as length, + struct(least(acc.coords.x, r.coords.x) as x, + least(acc.coords.y, r.coords.y) as y, + greatest(acc.coords.x + acc.coords.w, r.coords.x + r.coords.w) - least(acc.coords.x, r.coords.x) as w, + greatest(acc.coords.y + acc.coords.h, r.coords.y + r.coords.h) - least(acc.coords.y, r.coords.y) as h) as coords))) as regions))""")) + } def selectRegions(pageCol: String): DataFrame = { if ( df.columns.contains(pageCol) ) { // Do these transformations in SQL to avoid Java's persnicketiness about int/long casting @@ -752,6 +774,8 @@ transform($pageCol, } } val alignStrings = makeStringAligner(config, openGap = 1) + val pageFields = corpus.select(expr(s"inline(pages)")).columns + .filter { _ != "regions" }.map { f => s"p.$f as $f" }.mkString(", ") val metaFields = ListBuffer[String]() if ( corpus.columns.contains("date") ) metaFields += "date" metaFields += (if ( corpus.columns.contains("gold") ) "gold" else "0 as gold") @@ -763,8 +787,9 @@ transform($pageCol, .withColumn("end", lineStop('text, when('end < size('termCharBegin), 'termCharBegin('end)).otherwise(length('text)))) - .withColumn("regions", expr(s"filter(pages[0].regions, r -> r.start < end AND (r.start + r.length) > begin)")) - .select('mid, struct('first, 'id, 'meta, 'regions, 'begin, 'end, + .withColumn("pages", + expr(s"filter(transform(pages, p -> struct($pageFields, filter(p.regions, r -> r.start < end AND (r.start + r.length) > begin) as regions)), p -> size(p.regions) > 0)")) + .select('mid, struct('first, 'id, 'meta, 'pages, 'begin, 'end, getPassage('text, 'begin, 'end) as "text") as "info") .groupBy("mid") .agg(sort_array(collect_list("info"), false) as "info") // "first" == true sorts first @@ -772,17 +797,17 @@ transform($pageCol, .select('info, explode(alignedPassages($"alg.s1", $"alg.s2")) as "pass") .selectExpr("info[0].id as id1", "info[1].id as id2", "info[0].meta as meta1","info[1].meta as meta2", - "info[0].regions as regions1", + "info[0].pages as pages1", "pass._3 as pairs", "info[0].begin + 
pass._1.begin as b1", "info[0].begin + pass._1.end as e1", "info[1].begin + pass._2.begin as b2", "info[1].begin + pass._2.end as e2") - .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", 'regions1 as "regions", + .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", 'pages1 as "pages", explode(lineRecord('b1, 'b2, 'pairs)) as "wit") .select('id, $"wit.start", $"wit.length", struct('meta, - expr("filter(regions, r -> r.start < (wit.begin + length(wit.text)) AND (r.start + r.length) > wit.begin)") as "regions", + expr(s"filter(transform(pages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wit.begin + length(wit.text)) AND (r.start + r.length) > wit.begin) as regions)), p -> size(p.regions) > 0)") as "pages", 'src as "id", $"wit.begin", $"wit.text", $"wit.alg1", $"wit.alg2") as "wit") } def aggregateAlignments(config: Config, corpus: DataFrame, extents: DataFrame): DataFrame = { @@ -1071,6 +1096,8 @@ transform($pageCol, c.copy(pairwise = true) } text("Output pairwise alignments") opt[Unit]('d', "docwise") action { (_, c) => c.copy(docwise = true) } text("Output docwise alignments") + opt[Unit]("linewise") action { (_, c) => + c.copy(linewise = true) } text("Output linewise alignments") opt[Unit]('N', "names") action { (_, c) => c.copy(names = true) } text("Output names and exit") opt[Unit]('P', "postings") action { (_, c) => @@ -1264,11 +1291,13 @@ transform($pageCol, } } - if ( config.boilerplate || config.docwise ) { + if ( config.boilerplate || config.docwise || config.linewise) { if ( !hdfsExists(spark, passFname) ) { extents.boilerPassages(config, corpus).write.parquet(passFname) } val pass = spark.read.parquet(passFname) + val pageFields = pass.select(expr(s"inline(wit.pages)")).columns + .filter { _ != "regions" }.map { f => s"p.$f as $f" }.mkString(", ") if ( config.docwise ) { val textLines = udf { (text: String) => val res = ListBuffer[LineInfo]() @@ -1291,6 +1320,37 @@ transform($pageCol, expr("transform(tlines, r -> struct(r.text as text, mvars[r.start] as wits))")) .drop("tlines", "mvars", "variants") .write.format(config.outputFormat).save(outFname) + } else if ( config.linewise ) { + val gap = 4 + val coreAlignment = udf { (alg1: String, alg2: String) => + val re = s"\\-{$gap,}\\s*".r + (List((0,0)) ++ + (re.findAllMatchIn(alg1).map { m => (m.start, m.toString.length) }.toList ++ + re.findAllMatchIn(alg2).map { m => (m.start, m.toString.length) }.toList).sorted ++ + List((alg1.length, 0))) + .sliding(2) + .map { p => + val begin = p(0)._1 + p(0)._2 + val end = p(1)._1 + (alg1.substring(0, begin).replaceAll("-", "").length, + alg1.substring(begin, end).replaceAll("-", "").replaceAll("\u2010", "-"), + alg2.substring(0, begin).replaceAll("-", "").length, + alg2.substring(begin, end).replaceAll("-", "").replaceAll("\u2010", "-")) + } + .filter { p => p._2.length > gap && p._4.length > gap } + .toSeq + + } + pass + .withColumn("core", coreAlignment($"wit.alg1", $"wit.alg2")) + .select('id, 'start + 'core(0)("_3") as "begin", + 'core(0)("_4") as "text", + $"wit.id" as "wid", $"wit.begin" + 'core(0)("_1") as "wbegin", + 'core(0)("_2") as "wtext", + $"wit.pages" as "wpages") + .withColumn("wpages", expr(s"filter(transform(wpages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wbegin + length(wtext)) AND (r.start + r.length) > wbegin) as regions)), p -> size(p.regions) > 0)")) + .pageBox("wpages") + .write.format(config.outputFormat).save(outFname) } else { boilerSplit(config, pass, raw).write.format(config.outputFormat).save(outFname) } From 
04af15e721f265344f71f9f84155456db23cdfc4 Mon Sep 17 00:00:00 2001 From: David Smith Date: Wed, 7 Aug 2019 08:48:30 -0400 Subject: [PATCH 26/39] Output both token and line bounding boxes; don't strip gaps. --- src/main/scala/PassimApp.scala | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 1c9ae84..918470c 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -1342,13 +1342,18 @@ transform($pageCol, } pass - .withColumn("core", coreAlignment($"wit.alg1", $"wit.alg2")) - .select('id, 'start + 'core(0)("_3") as "begin", - 'core(0)("_4") as "text", - $"wit.id" as "wid", $"wit.begin" + 'core(0)("_1") as "wbegin", - 'core(0)("_2") as "wtext", - $"wit.pages" as "wpages") - .withColumn("wpages", expr(s"filter(transform(wpages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wbegin + length(wtext)) AND (r.start + r.length) > wbegin) as regions)), p -> size(p.regions) > 0)")) + // .withColumn("core", coreAlignment($"wit.alg1", $"wit.alg2")) + // .select('id, 'start + 'core(0)("_3") as "begin", + // 'core(0)("_4") as "text", + // $"wit.id" as "wid", $"wit.begin" + 'core(0)("_1") as "wbegin", + // 'core(0)("_2") as "wtext", + // $"wit.pages" as "wpages") + // .withColumn("wpages", expr(s"filter(transform(wpages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wbegin + length(wtext)) AND (r.start + r.length) > wbegin) as regions)), p -> size(p.regions) > 0)")) + .select('id, 'start as "begin", + translate($"wit.alg2", "\u2010-", "-") as "text", + $"wit.id" as "wid", $"wit.begin" as "wbegin", $"wit.text" as "wtext", + $"wit.alg2" as "talg", $"wit.alg1" as "walg", + $"wit.pages" as "wpages", $"wit.pages" as "wpagesTokens") .pageBox("wpages") .write.format(config.outputFormat).save(outFname) } else { From ff5c4dea30fdcd1151e64866426de1380dbe0d2c Mon Sep 17 00:00:00 2001 From: David Smith Date: Fri, 16 Aug 2019 12:36:27 -0400 Subject: [PATCH 27/39] Output page information for target texts. 
--- src/main/scala/PassimApp.scala | 36 ++++++---------------------------- 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 918470c..1b78b6b 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -797,17 +797,18 @@ transform($pageCol, .select('info, explode(alignedPassages($"alg.s1", $"alg.s2")) as "pass") .selectExpr("info[0].id as id1", "info[1].id as id2", "info[0].meta as meta1","info[1].meta as meta2", - "info[0].pages as pages1", + "info[0].pages as pages1", "info[1].pages as pages2", "pass._3 as pairs", "info[0].begin + pass._1.begin as b1", "info[0].begin + pass._1.end as e1", "info[1].begin + pass._2.begin as b2", "info[1].begin + pass._2.end as e2") - .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", 'pages1 as "pages", + .select('id2 as "id", 'id1 as "src", 'meta1 as "meta", 'pages1, 'pages2, explode(lineRecord('b1, 'b2, 'pairs)) as "wit") .select('id, $"wit.start", $"wit.length", + expr(s"filter(transform(pages2, p -> struct($pageFields, filter(p.regions, r -> r.start < (wit.start + wit.length) AND (r.start + r.length) > wit.start) as regions)), p -> size(p.regions) > 0)") as "pages", struct('meta, - expr(s"filter(transform(pages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wit.begin + length(wit.text)) AND (r.start + r.length) > wit.begin) as regions)), p -> size(p.regions) > 0)") as "pages", + expr(s"filter(transform(pages1, p -> struct($pageFields, filter(p.regions, r -> r.start < (wit.begin + length(wit.text)) AND (r.start + r.length) > wit.begin) as regions)), p -> size(p.regions) > 0)") as "pages", 'src as "id", $"wit.begin", $"wit.text", $"wit.alg1", $"wit.alg2") as "wit") } def aggregateAlignments(config: Config, corpus: DataFrame, extents: DataFrame): DataFrame = { @@ -1321,39 +1322,14 @@ transform($pageCol, .drop("tlines", "mvars", "variants") .write.format(config.outputFormat).save(outFname) } else if ( config.linewise ) { - val gap = 4 - val coreAlignment = udf { (alg1: String, alg2: String) => - val re = s"\\-{$gap,}\\s*".r - (List((0,0)) ++ - (re.findAllMatchIn(alg1).map { m => (m.start, m.toString.length) }.toList ++ - re.findAllMatchIn(alg2).map { m => (m.start, m.toString.length) }.toList).sorted ++ - List((alg1.length, 0))) - .sliding(2) - .map { p => - val begin = p(0)._1 + p(0)._2 - val end = p(1)._1 - (alg1.substring(0, begin).replaceAll("-", "").length, - alg1.substring(begin, end).replaceAll("-", "").replaceAll("\u2010", "-"), - alg2.substring(0, begin).replaceAll("-", "").length, - alg2.substring(begin, end).replaceAll("-", "").replaceAll("\u2010", "-")) - } - .filter { p => p._2.length > gap && p._4.length > gap } - .toSeq - - } pass - // .withColumn("core", coreAlignment($"wit.alg1", $"wit.alg2")) - // .select('id, 'start + 'core(0)("_3") as "begin", - // 'core(0)("_4") as "text", - // $"wit.id" as "wid", $"wit.begin" + 'core(0)("_1") as "wbegin", - // 'core(0)("_2") as "wtext", - // $"wit.pages" as "wpages") - // .withColumn("wpages", expr(s"filter(transform(wpages, p -> struct($pageFields, filter(p.regions, r -> r.start < (wbegin + length(wtext)) AND (r.start + r.length) > wbegin) as regions)), p -> size(p.regions) > 0)")) .select('id, 'start as "begin", translate($"wit.alg2", "\u2010-", "-") as "text", $"wit.id" as "wid", $"wit.begin" as "wbegin", $"wit.text" as "wtext", $"wit.alg2" as "talg", $"wit.alg1" as "walg", + 'pages as "tpages", 'pages as "tpagesTokens", $"wit.pages" as "wpages", $"wit.pages" as "wpagesTokens") 
+ .pageBox("tpages") .pageBox("wpages") .write.format(config.outputFormat).save(outFname) } else { From 13e8fa47129390b7bef55e8bbefd273c6e24e6bc Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 11 Feb 2020 08:16:52 -0500 Subject: [PATCH 28/39] Tweaks to README. --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8b05efd..8672f7e 100644 --- a/README.md +++ b/README.md @@ -100,9 +100,9 @@ compressed. $ passim input.json,directory-of-json-files,some*.json.bz2 output ``` -Output is to a directory that, on completion, will contain an +Output is written to a directory that, on completion, will contain an `out.json` directory with `part-*` files rather than a single file. -This allows multiple workers to efficiently write it (and read it back +This allows multiple workers to write it efficiently (and read it back in) in parallel. In addition, the output directory should contain the parameters used to invoke passim in `conf` and the intermediate cluster membership data in `clusters.parquet`. @@ -132,7 +132,7 @@ alignments between all matching passages, invoke passim with the `--pairwise` flag. These alignments will be in the `align.json` or `align.parquet`, depending on which output format you choose. -Some useful parameters are: +Some other useful parameters are: Parameter | Default value | Description --------- | ------------- | ----------- From cc916f4b594b738a3ab55defb5e6ba1c99544b51 Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 11 Feb 2020 09:11:57 -0500 Subject: [PATCH 29/39] Pruning pairs. --- README.md | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8672f7e..8c7b4d4 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ to sort passages within each cluster. Natural language text is redundant, and adding markup and JSON field names increases the redundancy. Spark and passim support several compression schemes. For relatively small files, gzip is -adequate; however, when the input files are large enough that the do +adequate; however, when the input files are large enough that they do not comfortably fit in memory, bzip2 is preferable since programs can split it into blocks before decompressing. @@ -143,7 +143,7 @@ Parameter | Default value | Description Pass parameters to the underlying Spark processes using the `SPARK_SUBMIT_ARGS` environment variable. For example, to run passim -on a local machine with 10 cores and 200GB of memory, do: +on a local machine with 10 cores and 200GB of memory, run the following command: ``` $ SPARK_SUBMIT_ARGS='--master local[10] --driver-memory 200G --executor-memory 200G' passim input.json output @@ -153,6 +153,29 @@ See the [Spark documentation](https://spark.apache.org/docs/latest/index.html) for further configuration options. +### Pruning the Alignments + +The documents input to passim are indexed to determine which pairs should be aligned. Often, document metadata can provide a priori constraints on which documents should be aligned. If there were no constraints, every pair of documents in the input would be aligned, in both directions. By default, however, documents with the same `series` value will not be aliged. These constraints on alignments are expressed by two arguments to passim: `--fields` and `--filterpairs`. + +The `--fields` argument tells passim which fields in the input records to index when determining which documents to align. 
Fields have the syntax of a SQL `FROM` clause as implemented by Apache Spark, with the exception that multiple fields are separated by semicolons. By default, the value of the fields argument is: +``` +--fields 'hashId(id) as uid;hashId(series) as gid' +``` + +Since document and series identifiers can be long strings, passim runs more efficiently if they are hashed to long integers by the (builtin) `hashId` function. + +The `--filterpairs` argument is an SQL expression that specifies which pairs of documents are candidates for comparison. A candidate pair consists of a "left-hand" document, whose field names are identical to those in the input, and a "right-hand" document, whose field names have a "2" appended to them. The default value for the filterpairs argument is: +``` +--filterpairs 'gid < gid2' +``` +This ensures that documents from the same series are not aligned and, further, ensures that any given pair of documents is aligned in only one direction, as determined by the lexicographic ordering of the hashes of their series IDs. + +As an example, consider aligning only document pairs where the "left-hand" document predates the "right-hand" document by 0 to 30 days. To perform efficient date arithmetic, we use Apache Spark's built-in `date` function to convert a string `date` field to an integer: +``` +--fields 'date(date) as day' --filterpairs 'day <= day2 AND day2 - day <= 30 AND uid <> uid2' +``` +Since the dates may be equal, we also include the constraint that the hashed document ids (`uid`) be different. Had we not done this, the output would also have included alignments of every document with itself. The `uid` field, a hash of the `id` field, is always available. Note also the SQL inequality operator `<>`. + ## Marking Locations inside Documents Documents may document their extent on physical pages with the `pages` field. This field is an array of `Page` regions with the following schema (here written in Scala): From cef6a01472208a8a4586e74c54ef0950cecb342d Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 11 Feb 2020 09:13:07 -0500 Subject: [PATCH 30/39] Grouping operations. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8c7b4d4..0c1fe00 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,7 @@ This ensures that documents from the same series are not aligned and, further, e As an example, consider aligning only document pairs where the "left-hand" document predates the "right-hand" document by 0 to 30 days. To perform efficient date arithmetic, we use Apache Spark's built-in `date` function to convert a string `date` field to an integer: ``` ---fields 'date(date) as day' --filterpairs 'day <= day2 AND day2 - day <= 30 AND uid <> uid2' +--fields 'date(date) as day' --filterpairs 'day <= day2 AND (day2 - day) <= 30 AND uid <> uid2' ``` Since the dates may be equal, we also include the constraint that the hashed document ids (`uid`) be different. Had we not done this, the output would also have included alignments of every document with itself. The `uid` field, a hash of the `id` field, is always available. Note also the SQL inequality operator `<>`. From b4634d79e94770ea634219a3285ee5347a306275 Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 11 Feb 2020 09:40:11 -0500 Subject: [PATCH 31/39] More on JOINs.
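As an aside on the JOIN analogy drawn in this patch, a rough sketch of the self-join that `--filterpairs` prunes is shown below. This is illustrative only, not code from the patch: the `postings` DataFrame and the `candidatePairs` helper are assumed names, though the column-renaming scheme mirrors the actual join over the feature-postings table later in this series.

```scala
import org.apache.spark.sql.DataFrame

// Sketch of the pruned self-join that --filterpairs expresses. The right-hand
// copy of the postings table gets a "2" suffix on every column except the
// join key, mirroring how passim names the right-hand document's fields.
def candidatePairs(postings: DataFrame, filterpairs: String = "gid < gid2"): DataFrame = {
  val rhs = postings.toDF(postings.columns.map(c => if (c == "feat") c else c + "2"): _*)
  postings.join(rhs, "feat")   // self-join on shared n-gram features
    .filter(filterpairs)       // prune candidate pairs with the user-supplied predicate
}
```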
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0c1fe00..78e4054 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ The `--fields` argument tells passim which fields in the input records to index Since document and series identifiers can be long strings, passim runs more efficiently if they are hashed to long integers by the (builtin) `hashId` function. -The `--filterpairs` argument is an SQL expression that specifies which pairs of documents are candidates for comparison. A candidate pair consists of a "left-hand" document, whose field names are identical to those in the input, and a "right-hand" document, whose field names have a "2" appended to them. The default value for the filterpairs argument is: +The `--filterpairs` argument is an SQL expression that specifies which pairs of documents are candidates for comparison. A candidate pair consists of a "left-hand" document, whose field names are identical to those in the input, and a "right-hand" document, whose field names have a "2" appended to them. This is similar to the left- and right-hand sides of a SQL JOIN; in fact, passim is effectively performing a (massively pruned) self-join on the table of input documents. The default value for the filterpairs argument is: ``` --filterpairs 'gid < gid2' ``` From 74ca198502f3ee5001c5af0d6e87173e4d03a94d Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 11 Feb 2020 17:38:13 -0500 Subject: [PATCH 32/39] Draft of Aligned Output. --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 78e4054..7a80848 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,7 @@ See the [Spark documentation](https://spark.apache.org/docs/latest/index.html) for further configuration options. -### Pruning the Alignments +### Pruning Alignments The documents input to passim are indexed to determine which pairs should be aligned. Often, document metadata can provide a priori constraints on which documents should be aligned. If there were no constraints, every pair of documents in the input would be aligned, in both directions. By default, however, documents with the same `series` value will not be aligned. These constraints on alignments are expressed by two arguments to passim: `--fields` and `--filterpairs`. @@ -176,6 +176,16 @@ As an example, consider aligning only document pairs where the "left-hand" docum ``` Since the dates may be equal, we also include the constraint that the hashed document ids (`uid`) be different. Had we not done this, the output would also have included alignments of every document with itself. The `uid` field, a hash of the `id` field, is always available. Note also the SQL inequality operator `<>`. +### Producing Aligned Output + +For the purposes of collating related texts or aligning different transcriptions of the same text, one can produce output using the `--docwise` or `--linewise` flags. Each output record in `out.json` or `out.parquet` will then contain a document or line from the right-hand, "target" text, along with information about corresponding passages in the left-hand, "witness" or "source" text. + +In the case of `--docwise` output, each output document contains an array of target line records, and each line record contains zero or more passages from the left-hand side that are aligned to that target line.
Note that we specify "passages" from the left-hand side because the line breaks in the witness, if any, may not correspond to line breaks in the target texts. Both the target line and witness passage data may contain information about image coordinates, when available. + +For `--linewise` output, each output document contains a single newline-delimited line from a target document. This line is identified by the input document `id` and a character offset `begin` into the input text. The corresponding witness passage is identified with `wid` and `wbegin`. + +In both of these output variants, target lines and witness passages are presented in their original textual form and in their aligned form, with hyphens to pad insertions and deletions. + ## Marking Locations inside Documents Documents may document their extent on physical pages with the `pages` field. This field is an array of `Page` regions with the following schema (here written in Scala): From c726544f7892221a03b27ee7e96d3ec4c92d2aa5 Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 11 Feb 2020 22:09:34 -0500 Subject: [PATCH 33/39] linewise example --- README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7a80848..4cd7c9a 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,19 @@ In the case of `--docwise` output. each output document contains an array of tar For `--linewise` output, each output document contains a single newline-delimited line from a target document. This line is identified by the input document `id` and a character offset `begin` into the input text. The corresponding witness passage is identified with `wid` and `wbegin`. -In both of these output variants, target lines and witness passages are presented in their original textual form and in their aligned form, with hyphens to pad insertions and deletions. +In both of these output variants, target lines and witness passages are presented in their original textual form and in their aligned form, with hyphens to pad insertions and deletions. An example of a target line aligned to a single witness passage is: +``` +{ + "id": "scheffel_ekkehard_1855#f0261z751", + "begin": 721, + "text": "Grammatik iſt ein hohes Weib, anders erſcheint ſie Holzhackern, an-\n", + "wid": "scheffel_ekkehard_1855/0261", + "wbegin": 739, + "wtext": "Grammatik iſt ein hohes Weib, anders erſcheint ſie HolzhaFern , an=\n", + "talg": "Grammatik iſt ein hohes Weib, anders erſcheint ſie Holzhackern-, an‐\n", + "walg": "Grammatik iſt ein hohes Weib, anders erſcheint ſie Holzha-Fern , an=\n" +} +``` ## Marking Locations inside Documents From cce6a0cb79e62dd0643a5e835b87fc3b13f699f7 Mon Sep 17 00:00:00 2001 From: David Smith Date: Sat, 15 Feb 2020 11:07:24 -0500 Subject: [PATCH 34/39] Clean up pairwise alignment code. 
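The cleanup in the diff below relies on Spark comparing struct values field by field: once the boolean `first` flag is made the leading field of the `info` struct, `max("info")` and `min("info")` select the two sides of each aligned pair without the explicit `when('sorted, ...)` swap. A standalone sketch with made-up data follows; the object name, column names, and rows are illustrative and not taken from PassimApp.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Structs order field by field, and false < true for booleans, so with
// `first` as the leading struct field, max() returns the side flagged
// first = true and min() returns the other side of the pair.
object StructOrderSketch extends App {
  val spark = SparkSession.builder.master("local[*]").appName("struct-order").getOrCreate()
  import spark.implicits._

  val sides = Seq((1L, true, "left side"), (1L, false, "right side"))
    .toDF("mid", "first", "text")

  sides.select($"mid", struct($"first", $"text") as "info")
    .groupBy("mid")
    .agg(max("info") as "info1", min("info") as "info2")
    .show(false)   // info1 holds the first = true row; info2 the first = false row

  spark.stop()
}
```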
--- src/main/scala/PassimApp.scala | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 1b78b6b..194cc27 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -661,7 +661,7 @@ transform($pageCol, val corpusFields = ListBuffer(expr("uid"), expr(config.id + " as id"), expr(config.text + " as text"), expr("termCharBegin"), expr("termCharEnd")) - val algFields = ListBuffer("uid", "id" ,"bw", "ew", "b", "e", "len", "tok", "text") + val algFields = ListBuffer("first", "uid", "id", "bw", "ew", "b", "e", "len", "tok", "text") if ( corpus.columns.contains("pages") ) { corpusFields += expr("pages") algFields += "pages" @@ -686,11 +686,9 @@ transform($pageCol, .withColumn("tok", size('termCharBegin)) .withColumnRenamed("begin", "b") .withColumnRenamed("end", "e") - .select('mid, 'first, struct(algFields.map(expr):_*) as "info") + .select('mid, struct(algFields.map(expr):_*) as "info") .groupBy("mid") - .agg(first("first") as "sorted", first("info") as "info1", last("info") as "info2") - .select(when('sorted, 'info1).otherwise('info2) as "info1", - when('sorted, 'info2).otherwise('info1) as "info2") + .agg(max("info") as "info1", min("info") as "info2") .withColumn("alg", alignStrings($"info1.text", $"info2.text")) .select($"info1.*", $"info2.*", $"alg.*") .toDF(algFinal:_*) @@ -702,7 +700,7 @@ transform($pageCol, fullalign .select((cols.filter(_ endsWith "1") ++ cols.filter(_ endsWith "2") ++ Seq("matches", "score")).map(col):_*) - .sort('id1, 'id2, 'b1, 'b2) + // .sort('id1, 'id2, 'b1, 'b2) } def boilerPassages(config: Config, corpus: DataFrame): DataFrame = { import align.sparkSession.implicits._ @@ -1174,7 +1172,7 @@ transform($pageCol, spark.conf.set("spark.sql.shuffle.partitions", corpus.rdd.getNumPartitions * 3) if ( config.names ) { - corpus.select('uid, col(config.id), col(groupCol), size('terms) as "nterms") + corpus.select('uid, 'gid, col(config.id), col(groupCol), size('terms) as "nterms") .write.save(config.outputPath + "/names.parquet") sys.exit(0) } From e8fb0727f7f6768e6282ff8c8ce4bbf05d58201b Mon Sep 17 00:00:00 2001 From: David Smith Date: Thu, 2 Apr 2020 10:21:38 -0400 Subject: [PATCH 35/39] Clarify spark's multiple paths syntax. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4cd7c9a..0e8efa0 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ Multiple input paths should be separated by commas. Files may also be compressed. ``` -$ passim input.json,directory-of-json-files,some*.json.bz2 output +$ passim "{input.json,directory-of-json-files,some*.json.bz2}" output ``` Output is written to a directory that, on completion, will contain an From 809eab9fa0383dfbc2b89d3eaf980fbbeac7d822 Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 28 Apr 2020 16:24:51 -0400 Subject: [PATCH 36/39] Use ignore mode to short-circuit writing. --- src/main/scala/PassimApp.scala | 80 +++++++++++++++------------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 1b78b6b..35c09a2 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -1274,20 +1274,20 @@ transform($pageCol, if ( config.context > 0 ) { extents.withContext(config, corpus) - .write.format(config.outputFormat) + .write.mode("ignore").format(config.outputFormat) .save(config.outputPath + "/context." 
+ config.outputFormat) } if (config.pairwise || config.aggregate) { val alignments = extents.pairwiseAlignments(config, corpus) if ( config.pairwise ) { - alignments.write.format(config.outputFormat) + alignments.write.mode("ignore").format(config.outputFormat) .save(config.outputPath + "/align." + config.outputFormat) } if ( config.aggregate ) { extents.aggregateAlignments(config, corpus, extents) - .write.format(config.outputFormat) + .write.mode("ignore").format(config.outputFormat) .save(config.outputPath + "/aggregate." + config.outputFormat) } } @@ -1297,8 +1297,6 @@ transform($pageCol, extents.boilerPassages(config, corpus).write.parquet(passFname) } val pass = spark.read.parquet(passFname) - val pageFields = pass.select(expr(s"inline(wit.pages)")).columns - .filter { _ != "regions" }.map { f => s"p.$f as $f" }.mkString(", ") if ( config.docwise ) { val textLines = udf { (text: String) => val res = ListBuffer[LineInfo]() @@ -1338,55 +1336,49 @@ transform($pageCol, sys.exit(0) } - if ( !hdfsExists(spark, passFname) ) { - extents.mergePassages(config).write.parquet(passFname) - } + extents.mergePassages(config).write.mode("ignore").parquet(passFname) - if ( !hdfsExists(spark, clusterFname) ) { - val pass = spark.read.parquet(passFname) + val pass = spark.read.parquet(passFname) - if ( !config.labelPropagation ) { - spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism) - } + if ( !config.labelPropagation ) { + spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism) + } - val passGraph = GraphFrame( - pass.select('nid as "id", 'uid, 'gid, 'begin, 'end), - pass.select('nid, explode('edges) as "eid") - .groupBy("eid").agg(min("nid") as "src", max("nid") as "dst")) + val passGraph = GraphFrame( + pass.select('nid as "id", 'uid, 'gid, 'begin, 'end), + pass.select('nid, explode('edges) as "eid") + .groupBy("eid").agg(min("nid") as "src", max("nid") as "dst")) - val groups = if ( config.labelPropagation ) { - passGraph.labelPropagation.maxIter(11).run().withColumnRenamed("label", "cluster") - } else { - spark.sparkContext.setCheckpointDir(config.outputPath + "/tmp") - passGraph.connectedComponents.run().withColumnRenamed("component", "cluster") - } + val groups = if ( config.labelPropagation ) { + passGraph.labelPropagation.maxIter(11).run().withColumnRenamed("label", "cluster") + } else { + spark.sparkContext.setCheckpointDir(config.outputPath + "/tmp") + passGraph.connectedComponents.run().withColumnRenamed("component", "cluster") + } - val merge_spans = udf { (spans: Seq[Row]) => - PassFun.mergeSpansLR(0, spans.map { s => (Span(s.getInt(0), s.getInt(1)), 0L) }) - .map { _._1 } - } + val merge_spans = udf { (spans: Seq[Row]) => + PassFun.mergeSpansLR(0, spans.map { s => (Span(s.getInt(0), s.getInt(1)), 0L) }) + .map { _._1 } + } - val clusters = - groups.groupBy("cluster", "uid") - .agg(merge_spans(collect_list(struct("begin", "end"))) as "spans") - .select('cluster, 'uid, explode('spans) as "span") - .select('cluster, 'uid, $"span.*") - clusters.cache() + val clusters = + groups.groupBy("cluster", "uid") + .agg(merge_spans(collect_list(struct("begin", "end"))) as "spans") + .select('cluster, 'uid, explode('spans) as "span") + .select('cluster, 'uid, $"span.*") + clusters.cache() - clusters.join(clusters.groupBy("cluster").agg(count("uid") as "size"), "cluster") - .select('uid, 'cluster, 'size, 'begin, 'end) - .write.parquet(clusterFname) + clusters.join(clusters.groupBy("cluster").agg(count("uid") as "size"), "cluster") + 
.select('uid, 'cluster, 'size, 'begin, 'end) + .write.mode("ignore").parquet(clusterFname) - clusters.unpersist() - if ( !config.labelPropagation ) { - spark.conf.set("spark.sql.shuffle.partitions", corpus.rdd.getNumPartitions * 3) - } + clusters.unpersist() + if ( !config.labelPropagation ) { + spark.conf.set("spark.sql.shuffle.partitions", corpus.rdd.getNumPartitions * 3) } - if ( !hdfsExists(spark, outFname) ) { - clusterJoin(config, spark.read.parquet(clusterFname), corpus) - .write.format(config.outputFormat).save(outFname) - } + clusterJoin(config, spark.read.parquet(clusterFname), corpus) + .write.mode("ignore").format(config.outputFormat).save(outFname) spark.stop() From f4d564d526677638cbb1897ea93bd1dbd64d5df9 Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 28 Apr 2020 17:24:34 -0400 Subject: [PATCH 37/39] More use of ignore to short-circuit writing. --- src/main/scala/PassimApp.scala | 140 ++++++++++++++++----------------- 1 file changed, 66 insertions(+), 74 deletions(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 35c09a2..e20360d 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -1186,28 +1186,26 @@ transform($pageCol, if ( config.aggregate && !indexFields.contains(config.group)) indexFields ++= ListBuffer(config.group) val termCorpus = corpus.select(indexFields.toList.map(expr):_*) - if ( !hdfsExists(spark, dfpostFname) ) { - val getPostings = makeIndexer(config.n, config.wordLength) - - val postings = termCorpus - .withColumn("post", explode(getPostings('terms))) - .drop("terms") - .withColumn("feat", 'post("feat")) - .withColumn("tf", 'post("tf")) - .withColumn("post", 'post("post")) - .filter { 'tf === 1 } - .drop("tf") - - val df = postings.groupBy("feat").count.select('feat, 'count.cast("int") as "df") - .filter { 'df >= config.minDF && 'df <= config.maxDF } - - postings.join(df, "feat").write.save(dfpostFname) - } + val getPostings = makeIndexer(config.n, config.wordLength) + + val posts = termCorpus + .withColumn("post", explode(getPostings('terms))) + .drop("terms") + .withColumn("feat", 'post("feat")) + .withColumn("tf", 'post("tf")) + .withColumn("post", 'post("post")) + .filter { 'tf === 1 } + .drop("tf") + + val df = posts.groupBy("feat").count.select('feat, 'count.cast("int") as "df") + .filter { 'df >= config.minDF && 'df <= config.maxDF } + + posts.join(df, "feat").write.mode("ignore").save(dfpostFname) + if ( config.postings ) sys.exit(0) - if ( !hdfsExists(spark, pairsFname) ) { - val getPairs = - udf { (uid: Long, uid2: Long, post: Seq[Int], post2: Seq[Int], df: Seq[Int]) => + val getPairs = + udf { (uid: Long, uid2: Long, post: Seq[Int], post2: Seq[Int], df: Seq[Int]) => val matches = PassFun.increasingMatches((post, post2, df).zipped.toSeq) if ( matches.size >= config.minRep ) { PassFun.gappedMatches(config.n, config.gap, config.minAlg, matches) @@ -1216,59 +1214,55 @@ transform($pageCol, } else Seq() } - val dfpost = spark.read.load(dfpostFname) - - dfpost - .join(dfpost.toDF(dfpost.columns.map { f => if ( f == "feat" ) f else f + "2" }:_*), - "feat") - .filter(config.filterpairs) - .select("uid", "uid2", "post", "post2", "df") - .groupBy("uid", "uid2") - .agg(collect_list("post") as "post", collect_list("post2") as "post2", - collect_list("df") as "df") - .filter(size('post) >= config.minRep) - .select(explode(getPairs('uid, 'uid2, 'post, 'post2, 'df)) as "pair", - monotonically_increasing_id() as "mid") // Unique IDs serve as edge IDs in connected component graph - 
.select(explode('pair) as "pass", 'mid) - .select($"pass.*", 'mid) - .write.parquet(pairsFname) // But we need to cache so IDs don't get reassigned. + val dfpost = spark.read.load(dfpostFname) + + dfpost + .join(dfpost.toDF(dfpost.columns.map { f => if ( f == "feat" ) f else f + "2" }:_*), + "feat") + .filter(config.filterpairs) + .select("uid", "uid2", "post", "post2", "df") + .groupBy("uid", "uid2") + .agg(collect_list("post") as "post", collect_list("post2") as "post2", + collect_list("df") as "df") + .filter(size('post) >= config.minRep) + .select(explode(getPairs('uid, 'uid2, 'post, 'post2, 'df)) as "pair", + monotonically_increasing_id() as "mid") // Unique IDs serve as edge IDs in connected component graph + .select(explode('pair) as "pass", 'mid) + .select($"pass.*", 'mid) + .write.mode("ignore").parquet(pairsFname) // We need to cache so IDs don't get reassigned. + + val matchMatrix = jaligner.matrix.MatrixGenerator.generate(2, -1) + val alignEdge = udf { + (idx1: Int, idx2: Int, text1: String, text2: String, anchor: String) => + PassFun.alignEdge(matchMatrix, idx1, idx2, text1, text2, anchor) } - if ( !hdfsExists(spark, extentsFname) ) { - val pairs = spark.read.parquet(pairsFname) - - val matchMatrix = jaligner.matrix.MatrixGenerator.generate(2, -1) - val alignEdge = udf { - (idx1: Int, idx2: Int, text1: String, text2: String, anchor: String) => - PassFun.alignEdge(matchMatrix, idx1, idx2, text1, text2, anchor) - } - - val extentFields = ListBuffer("uid", "gid", "first", "size(terms) as tok") - extentFields += (if ( termCorpus.columns.contains("ref") ) "ref" else "0 as ref") - - val extent: Int = config.gap * 2/3 - pairs.join(termCorpus, "uid") - .select('mid, 'begin, 'end, - struct(extentFields.toList.map(expr):_*) as "info", - termSpan('begin - extent, 'begin, 'terms) as "prefix", - termSpan('end, 'end + extent, 'terms) as "suffix") - .groupBy("mid") - .agg(first("info") as "info", last("info") as "info2", - alignEdge(first("begin"), last("begin"), - first("prefix"), last("prefix"), lit("R")) as "begin", - alignEdge(first("end"), last("end"), - first("suffix"), last("suffix"), lit("L")) as "end") - .filter { ($"end._1" - $"begin._1") >= config.minAlg && - ($"end._2" - $"begin._2") >= config.minAlg } - .select(explode(array(struct('mid, $"info.*", - ($"end._2" - $"begin._2") as "olen", - $"begin._1" as "begin", $"end._1" as "end"), - struct('mid, $"info2.*", - ($"end._1" - $"begin._1") as "olen", - $"begin._2" as "begin", $"end._2" as "end"))) as "pair") - .select($"pair.*") - .write.parquet(extentsFname) - } + val extentFields = ListBuffer("uid", "gid", "first", "size(terms) as tok") + extentFields += (if ( termCorpus.columns.contains("ref") ) "ref" else "0 as ref") + + val extent: Int = config.gap * 2/3 + spark.read.parquet(pairsFname) + .join(termCorpus, "uid") + .select('mid, 'begin, 'end, + struct(extentFields.toList.map(expr):_*) as "info", + termSpan('begin - extent, 'begin, 'terms) as "prefix", + termSpan('end, 'end + extent, 'terms) as "suffix") + .groupBy("mid") + .agg(first("info") as "info", last("info") as "info2", + alignEdge(first("begin"), last("begin"), + first("prefix"), last("prefix"), lit("R")) as "begin", + alignEdge(first("end"), last("end"), + first("suffix"), last("suffix"), lit("L")) as "end") + .filter { ($"end._1" - $"begin._1") >= config.minAlg && + ($"end._2" - $"begin._2") >= config.minAlg } + .select(explode(array(struct('mid, $"info.*", + ($"end._2" - $"begin._2") as "olen", + $"begin._1" as "begin", $"end._1" as "end"), + struct('mid, 
$"info2.*", + ($"end._1" - $"begin._1") as "olen", + $"begin._2" as "begin", $"end._2" as "end"))) as "pair") + .select($"pair.*") + .write.mode("ignore").parquet(extentsFname) val extents = spark.read.parquet(extentsFname) @@ -1293,9 +1287,7 @@ transform($pageCol, } if ( config.boilerplate || config.docwise || config.linewise) { - if ( !hdfsExists(spark, passFname) ) { - extents.boilerPassages(config, corpus).write.parquet(passFname) - } + extents.boilerPassages(config, corpus).write.mode("ignore").parquet(passFname) val pass = spark.read.parquet(passFname) if ( config.docwise ) { val textLines = udf { (text: String) => From 053f3c16bce8a64b5a21c215877f88b7deafa587 Mon Sep 17 00:00:00 2001 From: David Smith Date: Thu, 4 Jun 2020 17:38:27 -0400 Subject: [PATCH 38/39] Include character offset as start for each line in docwise. --- src/main/scala/PassimApp.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/PassimApp.scala b/src/main/scala/PassimApp.scala index 194cc27..e909116 100644 --- a/src/main/scala/PassimApp.scala +++ b/src/main/scala/PassimApp.scala @@ -1316,7 +1316,7 @@ transform($pageCol, .withColumn("tlines", textLines('text)) .withColumn("mvars", map_from_arrays($"variants.start", $"variants.wits")) .withColumn("lines", - expr("transform(tlines, r -> struct(r.text as text, mvars[r.start] as wits))")) + expr("transform(tlines, r -> struct(r.start as begin, r.text as text, mvars[r.start] as wits))")) .drop("tlines", "mvars", "variants") .write.format(config.outputFormat).save(outFname) } else if ( config.linewise ) { From bc603a0b13b6c22d5aec270504341daaae38b344 Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 15 Dec 2020 13:36:36 -0500 Subject: [PATCH 39/39] Upgrade to spark 3.0 --- bin/passim | 4 ++-- build.sbt | 10 +++++----- project/build.properties | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/passim b/bin/passim index 8e2909b..274e813 100755 --- a/bin/passim +++ b/bin/passim @@ -5,6 +5,6 @@ PASSIM_HOME="$(cd "`dirname "$0"`"/..; pwd)" SPARK_SUBMIT_ARGS="$SPARK_SUBMIT_ARGS" spark-submit --class passim.PassimApp \ - --packages 'com.github.scopt:scopt_2.11:3.5.0,graphframes:graphframes:0.7.0-spark2.4-s_2.11' \ + --packages 'com.github.scopt:scopt_2.12:3.5.0,graphframes:graphframes:0.8.0-spark3.0-s_2.12' \ $SPARK_SUBMIT_ARGS \ - "$PASSIM_HOME"/target/scala-2.11/passim_2.11-0.2.0.jar "$@" + "$PASSIM_HOME"/target/scala-2.12/passim_2.12-0.2.0.jar "$@" diff --git a/build.sbt b/build.sbt index 822d08f..c292205 100644 --- a/build.sbt +++ b/build.sbt @@ -2,17 +2,17 @@ name := "passim" version := "0.2.0" -scalaVersion := "2.11.8" +scalaVersion := "2.12.10" resolvers += Resolver.mavenLocal -libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.3" -libraryDependencies += "org.apache.spark" %% "spark-graphx" % "2.4.3" -libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.3" +libraryDependencies += "org.apache.spark" %% "spark-core" % "3.0.0" +libraryDependencies += "org.apache.spark" %% "spark-graphx" % "3.0.0" +libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.0.0" resolvers += "Spark Packages Repo" at "http://dl.bintray.com/spark-packages/maven" -libraryDependencies += "graphframes" % "graphframes" % "0.7.0-spark2.4-s_2.11" +libraryDependencies += "graphframes" % "graphframes" % "0.8.0-spark3.0-s_2.12" libraryDependencies += "com.github.scopt" %% "scopt" % "3.5.0" diff --git a/project/build.properties b/project/build.properties index 23aa187..b1e5e31 100644 --- a/project/build.properties 
+++ b/project/build.properties @@ -14,4 +14,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -sbt.version=0.13.18 +sbt.version=1.3.13
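A closing note on the "ignore mode" writes adopted in patches 36 and 37 above: they rely on Spark's `SaveMode.Ignore`, which skips the write when the target path already exists, so the earlier explicit `hdfsExists` checks are no longer needed. A minimal, self-contained sketch is below; the path and object name are illustrative.

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

// If the output path already exists, mode("ignore") leaves it untouched and
// does not throw, which is how the patches above short-circuit rewriting
// outputs that were produced by an earlier run.
object IgnoreModeSketch extends App {
  val spark = SparkSession.builder.master("local[*]").appName("ignore-mode").getOrCreate()
  val df = spark.range(10).toDF("n")

  df.write.mode("ignore").parquet("/tmp/ignore-demo.parquet")        // first run: writes
  df.write.mode(SaveMode.Ignore).parquet("/tmp/ignore-demo.parquet") // later runs: no-op
  spark.stop()
}
```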