From 3f6b39d13b014cbc1498b3c33d277b96a2fedc10 Mon Sep 17 00:00:00 2001 From: Maria Date: Mon, 7 Dec 2020 08:31:55 -0700 Subject: [PATCH 1/4] add a todo based on meeting --- src/main/scala/org/clulab/taxero/OntologyExtender.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/org/clulab/taxero/OntologyExtender.scala b/src/main/scala/org/clulab/taxero/OntologyExtender.scala index 42e147e..67d22b9 100644 --- a/src/main/scala/org/clulab/taxero/OntologyExtender.scala +++ b/src/main/scala/org/clulab/taxero/OntologyExtender.scala @@ -139,6 +139,7 @@ object OntologyExtender extends App with LazyLogging { } def isSimilarToLeafHeader(result: ScoredMatch, header: Seq[String], similarityToHeaderTreshold: Double): Boolean = { + // todo: When calculating the embedding for the name/header of the leaf, add decay, that is decrease the weight of the nodes on the path as we move up the ontology (decrease node weight by, for example, half every step up the ontology) reader.similarityScore(result.result.map(_.toLowerCase()), header) > similarityToHeaderTreshold } From 06e48c450eafef875a8f73174d6364cf08fc8abe Mon Sep 17 00:00:00 2001 From: Maria Date: Mon, 4 Jan 2021 00:26:41 -0700 Subject: [PATCH 2/4] added decaying score for similarity to header; updated threshold based on this change --- src/main/resources/application.conf | 2 +- .../org/clulab/taxero/OntologyExtender.scala | 85 ++++++++++++++++--- 2 files changed, 73 insertions(+), 14 deletions(-) diff --git a/src/main/resources/application.conf b/src/main/resources/application.conf index f81285e..6495ddd 100644 --- a/src/main/resources/application.conf +++ b/src/main/resources/application.conf @@ -11,7 +11,7 @@ ontologyExtender { ontologyDir = "path/to/ontology/leaf/files" outputDir = "path/to/ontologyFilesEnriched" manualEval = false // true to output files that can be used for manual eval of the extender ouputs - similarityToHeaderTreshold = 0.75 + similarityToHeaderTreshold = 1.16 // might need tuning 
maxExamplesToAddPerOntologyLeaf = 10 inclOriginalLeaf = true onlyQueryByLeafHeaderTerms = false diff --git a/src/main/scala/org/clulab/taxero/OntologyExtender.scala b/src/main/scala/org/clulab/taxero/OntologyExtender.scala index 67d22b9..70b3d93 100644 --- a/src/main/scala/org/clulab/taxero/OntologyExtender.scala +++ b/src/main/scala/org/clulab/taxero/OntologyExtender.scala @@ -39,7 +39,7 @@ object OntologyExtender extends App with LazyLogging { for (file <- files) { Try { - val outfile = new File(outputDir, file.getName.replace(".txt", ".csv")) + val outfile = new File(outputDir, file.getName.replace(".txt", ".txt")) // retrieve existing examples val lines = Source.fromFile(file).getLines().toList val (header, examples) = lines.partition(_.startsWith("#")) @@ -60,6 +60,7 @@ object OntologyExtender extends App with LazyLogging { // lemmas in the header - used for filtering examples unrelated to the ontology leaf; use full header, not only the last item in the header because while some terms there are to generic to use for queries, they help to narrow down the general topic of the ontology leaf val headerLemmas = getHeaderLemmas(header) + val headerLemmasNested = getHeaderLemmasNonFlat(header.head) // just take the first header val queries = if (onlyQueryByLeafHeaderTerms) { headerQueries @@ -76,20 +77,21 @@ object OntologyExtender extends App with LazyLogging { val egAsSeq = eg.split(" ") // hyponym results val results = reader.getRankedHyponyms(egAsSeq, lemmatize) - for (r <- results) if (isSimilarToLeafHeader(r, headerLemmas, similarityToHeaderTreshold)) resultsFromAllTerms.append(r) +// for (r <- results) if (isSimilarToLeafHeader(r, headerLemmas, similarityToHeaderTreshold)) resultsFromAllTerms.append(r) + for (r <- results) if(isSimilarToLeafWithDecay(r, headerLemmasNested, similarityToHeaderTreshold)) resultsFromAllTerms.append(r) // cohyponym results val coResults = reader.getRankedCohyponyms(egAsSeq, lemmatize) - for (r <-coResults) if 
(isSimilarToLeafHeader(r, headerLemmas, similarityToHeaderTreshold)) resultsFromAllTerms.append(r) + for (r <-coResults) if(isSimilarToLeafWithDecay(r, headerLemmasNested, similarityToHeaderTreshold)) resultsFromAllTerms.append(r) if (manualEval) { val (singleWord, multiWord) = results.partition(_.query.length < 2) - for (sw <- singleWord) if (isSimilarToLeafHeader(sw, headerLemmas, similarityToHeaderTreshold)) singleWordQueryResult.append(sw) - for (mw <- multiWord) if (isSimilarToLeafHeader(mw, headerLemmas, similarityToHeaderTreshold)) multiWordQueryResult.append(mw) + for (sw <- singleWord) if(isSimilarToLeafWithDecay(sw, headerLemmasNested, similarityToHeaderTreshold)) singleWordQueryResult.append(sw) + for (mw <- multiWord) if(isSimilarToLeafWithDecay(mw, headerLemmasNested, similarityToHeaderTreshold)) multiWordQueryResult.append(mw) val (singleWordCohyp, multiWordCohyp) = coResults.partition(_.query.length < 2) - for (sw <- singleWordCohyp) if (isSimilarToLeafHeader(sw, headerLemmas, similarityToHeaderTreshold)) singleWordQueryResult.append(sw) - for (mw <- multiWordCohyp) if (isSimilarToLeafHeader(mw, headerLemmas, similarityToHeaderTreshold)) multiWordQueryResult.append(mw) + for (sw <- singleWordCohyp) if(isSimilarToLeafWithDecay(sw, headerLemmasNested, similarityToHeaderTreshold)) singleWordQueryResult.append(sw) + for (mw <- multiWordCohyp) if(isSimilarToLeafWithDecay(mw, headerLemmasNested, similarityToHeaderTreshold)) multiWordQueryResult.append(mw) } } @@ -97,23 +99,24 @@ object OntologyExtender extends App with LazyLogging { val cleanQueries = queries.filter(_.length > 0).map(_.toLowerCase()) val sortedResults = if (manualEval) { - returnResultsForManualEval(resultsFromAllTerms, cleanQueries, headerLemmas) + returnResultsForManualEval(resultsFromAllTerms, cleanQueries, headerLemmasNested) } else { resultsFromAllTerms.filter(res => !cleanQueries.contains(res.result.mkString(" ").toLowerCase)).sortBy(-_.score).map(res => res.result.mkString(" ") + 
"\n") } val sortedResultsSingleWord = if (manualEval) { - returnResultsForManualEval(singleWordQueryResult, cleanQueries, headerLemmas) + returnResultsForManualEval(singleWordQueryResult, cleanQueries, headerLemmasNested) } else Seq.empty val sortedResultsMultiWord = if (manualEval) { - returnResultsForManualEval(multiWordQueryResult, cleanQueries, headerLemmas) + returnResultsForManualEval(multiWordQueryResult, cleanQueries, headerLemmasNested) } else Seq.empty val bw = new BufferedWriter(new FileWriter(outfile)) if (inclOriginalLeaf) { - bw.write(lines.head) + bw.write(header.mkString("\n")) +// bw.write(lines.head) bw.write("\n" + examples.filter(_.length > 0).mkString("\n")) bw.write("\n") } @@ -143,6 +146,41 @@ object OntologyExtender extends App with LazyLogging { reader.similarityScore(result.result.map(_.toLowerCase()), header) > similarityToHeaderTreshold } + def decayingScore(result: ScoredMatch, header: Seq[Seq[String]]): Double = { + val scoreWeights = new ArrayBuffer[Double]() + var highestScore = 1.0 + scoreWeights.append(highestScore) + for (i <- 1 to header.length-1) { + // println("->>" + i) + val nextScore = highestScore / 2 + scoreWeights.append(nextScore) + highestScore = nextScore + } + // for (sc <- scoreWeights.reverse) println(sc) + + var overallScore = 0.0 + val scoreWeightsReversed = scoreWeights.reverse + for ((node, ind) <- header.zipWithIndex) { + overallScore += reader.similarityScore(result.result.map(_.toLowerCase()), node) * scoreWeightsReversed(ind) + println("node: " + node.mkString("|")) + println("sim score: " + reader.similarityScore(result.result.map(_.toLowerCase()), node)) + println("multiplier: " + scoreWeightsReversed(ind)) + println("added score" + reader.similarityScore(result.result.map(_.toLowerCase()), node) * scoreWeightsReversed(ind)) + + + } + println("SCORE: " + overallScore) + overallScore + + } + + def isSimilarToLeafWithDecay(result: ScoredMatch, header: Seq[Seq[String]], similarityToHeaderThreshold: Double): 
Boolean = { + val score = decayingScore(result, header) + score > similarityToHeaderThreshold + +// println("SCORE: " + overallScore) + } + // not currently used def checkIfQueryIsRelevant(results: Seq[ScoredMatch], fullHeader: Seq[String]): Boolean = { @@ -172,11 +210,32 @@ object OntologyExtender extends App with LazyLogging { .split("/").distinct) } - def returnResultsForManualEval(results: Seq[ScoredMatch], cleanQueries: Seq[String], headerLemmas: Seq[String]): Seq[String] = { + def getHeaderLemmasNonFlat(fullHeader: String): Seq[Seq[String]] = { + + val toReturn = fullHeader + .replace("#", "").replace(" ", "") + .split("/") + .map(_.split("And|Or|(?=[A-Z])").filter(_.length >0).map(_.toLowerCase()).toSeq) +// println(toReturn) +// .map(_.split("/").tail.mkString("/")) +// .map(_.split("And|Or|(?=[A-Z])")) + // .filter(_!="/").map(_.toLowerCase())) +// .map(reader.convertToLemmas(_)) + +// for (tr <- toReturn) println("=>" + tr.mkString("|")) +// .map(_.split("And|Or|(?=[A-Z])")) +// .mkString("/") +// .map() +// .split("And|Or|(?=[A-Z])") +// .map(_.toLowerCase()).distinct + toReturn.toSeq + } + + def returnResultsForManualEval(results: Seq[ScoredMatch], cleanQueries: Seq[String], headerLemmas: Seq[Seq[String]]): Seq[String] = { results.filter( res => !cleanQueries.contains(res.result.mkString(" ").toLowerCase)) .sortBy(-_.score) - .map(res => res.result.mkString(" ") + "\t" + res.query.mkString(" ") + "\t" + res.score.toString + "\t" + res.similarity.toString + "\t" + reader.similarityScore(res.result.map(_.toLowerCase()), headerLemmas).toString + "\n") + .map(res => res.result.mkString(" ") + "\t" + res.query.mkString(" ") + "\t" + res.score.toString + "\t" + res.similarity.toString + "\t" + decayingScore(res, headerLemmas).toString + "\n") } def getStringToWrite(results: Seq[String], maxExamplesToAddPerOntologyLeaf: Int): String = { From cc691de07b36f7181e5dead4d6500fa420e2db23 Mon Sep 17 00:00:00 2001 From: maxaalexeeva Date: Sun, 31 Jan 2021 23:39:43 
-0700 Subject: [PATCH 3/4] a working version before cosmetic changes --- .../org/clulab/taxero/OntologyExtender.scala | 136 +++++++++++++++--- 1 file changed, 114 insertions(+), 22 deletions(-) diff --git a/src/main/scala/org/clulab/taxero/OntologyExtender.scala b/src/main/scala/org/clulab/taxero/OntologyExtender.scala index 70b3d93..e355822 100644 --- a/src/main/scala/org/clulab/taxero/OntologyExtender.scala +++ b/src/main/scala/org/clulab/taxero/OntologyExtender.scala @@ -10,6 +10,9 @@ import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.util.{Failure, Success, Try} import com.typesafe.scalalogging.LazyLogging +import ujson.IndexedValue.True + +import scala.collection.mutable object OntologyExtender extends App with LazyLogging { // given a directory with ontology files, where each file has the ontology leaf header and a set of examples for the leaf of the ontology, @@ -31,18 +34,34 @@ object OntologyExtender extends App with LazyLogging { val inclOriginalLeaf = config.apply[Boolean]("ontologyExtender.inclOriginalLeaf") val lemmatize = config.apply[Boolean]("ontologyExtender.lemmatize") val onlyQueryByLeafHeaderTerms = config.apply[Boolean]("ontologyExtender.onlyQueryByLeafHeaderTerms") - val maxExamplesToAddPerOntologyLeaf = config.apply[Int]("ontologyExtender.maxExamplesToAddPerOntologyLeaf") + val countThreshold = config.apply[Int]("ontologyExtender.countThreshold") + val addExamplesProportionallyToCurrentNum = config.apply[Boolean]("ontologyExtender.addExamplesProportionallyToCurrentNum") + val proportionToExpandBy = config.apply[Double]("ontologyExtender.proportionToExpandBy") + val maxExamplesToAddPerOntologyLeafDefault = config.apply[Int]("ontologyExtender.maxExamplesToAddPerOntologyLeaf") // create directories and files outputDir.mkdir() val files = ontologyDirectory.listFiles() + val (termToLeaf, allHeaders) = getTermToLeafMap(ontologyDirectory) +// val otherHeaders = ??? 
+ + + for (t <- termToLeaf) println(t._1 + "||" + t._2.mkString("++++")) + + var examplesAddedPerFile = 0 + var numOfFilesSucceeded = 0 for (file <- files) { Try { - val outfile = new File(outputDir, file.getName.replace(".txt", ".txt")) + val outfile = new File(outputDir, file.getName.replace(".txt", ".csv")) // retrieve existing examples val lines = Source.fromFile(file).getLines().toList val (header, examples) = lines.partition(_.startsWith("#")) + val existingExampleLemmas = examples.map(_.split(" ")).flatten +// for (cet <- currentExampleTokens) println("||" + cet + "||") +// println("\n") + val maxExamplesToAddPerOntologyLeaf = if (addExamplesProportionallyToCurrentNum) (examples.length * proportionToExpandBy).toInt else maxExamplesToAddPerOntologyLeafDefault +// println(maxExamplesToAddPerOntologyLeaf + "<- max len to write for leaf " + file.getName) // only the last item on the path in the header is used for querying, // but there could be multiple header lines in one ontology file @@ -77,21 +96,20 @@ object OntologyExtender extends App with LazyLogging { val egAsSeq = eg.split(" ") // hyponym results val results = reader.getRankedHyponyms(egAsSeq, lemmatize) -// for (r <- results) if (isSimilarToLeafHeader(r, headerLemmas, similarityToHeaderTreshold)) resultsFromAllTerms.append(r) - for (r <- results) if(isSimilarToLeafWithDecay(r, headerLemmasNested, similarityToHeaderTreshold)) resultsFromAllTerms.append(r) + for (r <- results) if(isSimilarToLeafWithDecay(r, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(r, countThreshold)) resultsFromAllTerms.append(r) // cohyponym results val coResults = reader.getRankedCohyponyms(egAsSeq, lemmatize) - for (r <-coResults) if(isSimilarToLeafWithDecay(r, headerLemmasNested, similarityToHeaderTreshold)) resultsFromAllTerms.append(r) + for (r <-coResults) if(isSimilarToLeafWithDecay(r, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(r, countThreshold)) resultsFromAllTerms.append(r) if 
(manualEval) { val (singleWord, multiWord) = results.partition(_.query.length < 2) - for (sw <- singleWord) if(isSimilarToLeafWithDecay(sw, headerLemmasNested, similarityToHeaderTreshold)) singleWordQueryResult.append(sw) - for (mw <- multiWord) if(isSimilarToLeafWithDecay(mw, headerLemmasNested, similarityToHeaderTreshold)) multiWordQueryResult.append(mw) + for (sw <- singleWord) if(isSimilarToLeafWithDecay(sw, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(sw, countThreshold)) singleWordQueryResult.append(sw) + for (mw <- multiWord) if(isSimilarToLeafWithDecay(mw, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(mw, countThreshold)) multiWordQueryResult.append(mw) val (singleWordCohyp, multiWordCohyp) = coResults.partition(_.query.length < 2) - for (sw <- singleWordCohyp) if(isSimilarToLeafWithDecay(sw, headerLemmasNested, similarityToHeaderTreshold)) singleWordQueryResult.append(sw) - for (mw <- multiWordCohyp) if(isSimilarToLeafWithDecay(mw, headerLemmasNested, similarityToHeaderTreshold)) multiWordQueryResult.append(mw) + for (sw <- singleWordCohyp) if(isSimilarToLeafWithDecay(sw, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(sw, countThreshold)) singleWordQueryResult.append(sw) + for (mw <- multiWordCohyp) if(isSimilarToLeafWithDecay(mw, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(mw, countThreshold)) multiWordQueryResult.append(mw) } } @@ -131,18 +149,32 @@ object OntologyExtender extends App with LazyLogging { bw.write(getStringToWrite(sortedResultsMultiWord, maxExamplesToAddPerOntologyLeaf)) bw.close() } else { - bw.write(getStringToWrite(sortedResults, maxExamplesToAddPerOntologyLeaf)) + bw.write(getStringToWriteDistinctTokens(sortedResults, maxExamplesToAddPerOntologyLeaf, header.head, allHeaders, existingExampleLemmas)) bw.close() } } match { - case Success(_) => logger.info(s"extended ontology leaf ${file.getName}") + case Success(_) => { + logger.info(s"extended ontology leaf 
${file.getName}") + numOfFilesSucceeded += 1 + } + case Failure(e) => logger.error(s"failed to extend ontology leaf ${file.getName}", e) } + + } + + logger.info(s"FILES SUCCEEDED:\t ${numOfFilesSucceeded}") + + def seenMoreThanK(result: ScoredMatch, k: Int): Boolean = { + result.count > k + } + + def lemmaAlreadyExistsInExamples(token: String, examples: Seq[String]) = { + examples.contains(reader.convertToLemmas(Seq(token)).head) } def isSimilarToLeafHeader(result: ScoredMatch, header: Seq[String], similarityToHeaderTreshold: Double): Boolean = { - // todo: When calculating the embedding for the name/header of the leaf, add decay, that is decrease the weight of the nodes on the path as we move up the ontology (decrease node weight by, for example, half every step up the ontology) reader.similarityScore(result.result.map(_.toLowerCase()), header) > similarityToHeaderTreshold } @@ -151,25 +183,16 @@ object OntologyExtender extends App with LazyLogging { var highestScore = 1.0 scoreWeights.append(highestScore) for (i <- 1 to header.length-1) { - // println("->>" + i) val nextScore = highestScore / 2 scoreWeights.append(nextScore) highestScore = nextScore } - // for (sc <- scoreWeights.reverse) println(sc) var overallScore = 0.0 val scoreWeightsReversed = scoreWeights.reverse for ((node, ind) <- header.zipWithIndex) { overallScore += reader.similarityScore(result.result.map(_.toLowerCase()), node) * scoreWeightsReversed(ind) - println("node: " + node.mkString("|")) - println("sim score: " + reader.similarityScore(result.result.map(_.toLowerCase()), node)) - println("multiplier: " + scoreWeightsReversed(ind)) - println("added score" + reader.similarityScore(result.result.map(_.toLowerCase()), node) * scoreWeightsReversed(ind)) - - } - println("SCORE: " + overallScore) overallScore } @@ -177,8 +200,65 @@ object OntologyExtender extends App with LazyLogging { def isSimilarToLeafWithDecay(result: ScoredMatch, header: Seq[Seq[String]], similarityToHeaderThreshold: Double): 
Boolean = { val score = decayingScore(result, header) score > similarityToHeaderThreshold + } + + def getTermToLeafMap(ontologyDirPath: File): (Map[String, ArrayBuffer[String]], Seq[String]) = { + val files = ontologyDirPath.listFiles() + val termToHeaderMap = mutable.Map[String, ArrayBuffer[String]]() + val allHeaders = new ArrayBuffer[String]() + + for (file <- files) { + Try { + val source = Source.fromFile(file) + val lines = source.getLines().toList + source.close() + val (header, examples) = lines.partition(_.startsWith("#")) + allHeaders.append(header.head) + val distinctExamples = examples.flatMap(eg => eg.split(" ")).distinct + for (de <- distinctExamples) { + if (termToHeaderMap.contains(de)) { + termToHeaderMap(de).append(header.head) + } else { + termToHeaderMap(de) = new ArrayBuffer[String]() + termToHeaderMap(de).append(header.head) + } + } + } match { + case Success(_) => None + case Failure(e) => logger.error(s"failed to get examples from ${file.getName}", e) + } + + + } + (termToHeaderMap.toMap, allHeaders) + } + + def existsInOtherLeaves(currentTerm: String, termToLeaves: Map[String, Seq[String]], topHeader: String): Boolean = { + if (!termToLeaves.contains(currentTerm)) return false // the term does not exist as an example in any of the ontology leaves + if (termToLeaves(currentTerm).length > 1) return true // the term exists in more than one ontology leaf + if (termToLeaves(currentTerm).mkString("/") != topHeader) return true // there exists one leaf with the current term as an example and it is not the current ontology leaf (=topHeader) + false + } + + def findMostSimilarHeader(token: String, otherNodeHeaders: Seq[String]): String = { + val scores = new ArrayBuffer[Double]() + for (h <- otherNodeHeaders) { + val headerLemmas = getHeaderLemmas(Seq(h)) + val score = reader.similarityScore(Seq(token), headerLemmas) + scores.append(score) + } + val scoresWithIdx = scores.zipWithIndex + val sortedScoresWithIdx = scoresWithIdx.sortBy(_._1).reverse + 
val maxHeader = otherNodeHeaders(sortedScoresWithIdx.head._2) + maxHeader + + } + + def mostSimilarToCurrentLeaf(token: String, currentHeader: String, otherNodeHeaders: Seq[String]): Boolean = { + //- in distinct result terms, if it's most similar to current header, return true and keep the term; currently not used---filtering too aggressive + val maxHeader = findMostSimilarHeader(token, otherNodeHeaders) + if (maxHeader == currentHeader) return true else return false -// println("SCORE: " + overallScore) } // not currently used @@ -243,4 +323,16 @@ object OntologyExtender extends App with LazyLogging { string } + def getStringToWriteDistinctTokens(results: Seq[String], maxExamplesToAddPerOntologyLeaf: Int, topHeader: String, otherHeaders: Seq[String], existingExampleLemmas: Seq[String]): String = { + val string = results.distinct.filter(_.length > 0) + .flatMap(res => res.replace("\n","").split(" ")).distinct + .filter(term => !lemmaAlreadyExistsInExamples(term, existingExampleLemmas)) + .filter(term => term.toLowerCase() == term) +// .filter(term => mostSimilarToCurrentLeaf(term, topHeader, otherHeaders)) + .filter(term => !existsInOtherLeaves(term, termToLeaf, topHeader)) + .slice(0, maxExamplesToAddPerOntologyLeaf).mkString("\n") + println("num of nodes added:\t" + string.split("\n").length + "leaf: " + topHeader) + "\nnew examples:\n" + string + } + } From 04f3d7b22237a219663da05f5ae597ed897d2250 Mon Sep 17 00:00:00 2001 From: maxaalexeeva Date: Sun, 31 Jan 2021 23:53:36 -0700 Subject: [PATCH 4/4] cleanup --- src/main/resources/application.conf | 17 ++++---- .../org/clulab/taxero/OntologyExtender.scala | 41 ++++++------------- 2 files changed, 22 insertions(+), 36 deletions(-) diff --git a/src/main/resources/application.conf b/src/main/resources/application.conf index 6495ddd..7294a41 100644 --- a/src/main/resources/application.conf +++ b/src/main/resources/application.conf @@ -1,19 +1,22 @@ -odinson.indexDir = "path/to/index" // the index will be saved here 
when using IndexCDRs and reac from here while using Taxero -odinson.dataDir = "path/to/data/dir" -apps.cdrDir = "path/to/cdr/files" +odinson.indexDir = "/local/path/to/index" // the index will be saved here when using IndexCDRs and read from here while using Taxero +odinson.dataDir = "/local/path/to/data/dir" +apps.cdrDir = "/local/path/to/dir/containing/cdrs/for/indexing" taxero { - wordEmbeddings = "path/to/vectors.txt" + wordEmbeddings = "/local/path/to/vectors.txt" lemmatize = true } ontologyExtender { - ontologyDir = "path/to/ontology/leaf/files" - outputDir = "path/to/ontologyFilesEnriched" + ontologyDir = "/local/path/to/ontology/files" + outputDir = "" manualEval = false // true to output files that can be used for manual eval of the extender ouputs - similarityToHeaderTreshold = 1.16 // might need tuning + similarityToHeaderTreshold = 1.16 // best for current decaying similarity to header score + addExamplesProportionallyToCurrentNum = false + proportionToExpandBy = 0.5 maxExamplesToAddPerOntologyLeaf = 10 inclOriginalLeaf = true onlyQueryByLeafHeaderTerms = false lemmatize = true + countThreshold = 0 // example has to occur more than countThreshold times to be added to ontology } \ No newline at end of file diff --git a/src/main/scala/org/clulab/taxero/OntologyExtender.scala b/src/main/scala/org/clulab/taxero/OntologyExtender.scala index e355822..4fc7b92 100644 --- a/src/main/scala/org/clulab/taxero/OntologyExtender.scala +++ b/src/main/scala/org/clulab/taxero/OntologyExtender.scala @@ -10,13 +10,11 @@ import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.util.{Failure, Success, Try} import com.typesafe.scalalogging.LazyLogging -import ujson.IndexedValue.True import scala.collection.mutable object OntologyExtender extends App with LazyLogging { - // given a directory with ontology files, where each file has the ontology leaf header and a set of examples for the leaf of the ontology, - // produces corresponding files with the 
example sets enriched with hyponyms and co-hyponyms + // given a directory with ontology files, where each file has one or more ontology leaf paths as headers (e.g., Event/HealthAndDisease/Illness) and a set of examples for the leaf of the ontology, produces corresponding files with the example sets enriched with hyponyms and co-hyponyms // get the reader @@ -44,10 +42,6 @@ object OntologyExtender extends App with LazyLogging { val files = ontologyDirectory.listFiles() val (termToLeaf, allHeaders) = getTermToLeafMap(ontologyDirectory) -// val otherHeaders = ??? - - - for (t <- termToLeaf) println(t._1 + "||" + t._2.mkString("++++")) var examplesAddedPerFile = 0 var numOfFilesSucceeded = 0 @@ -55,13 +49,12 @@ object OntologyExtender extends App with LazyLogging { Try { val outfile = new File(outputDir, file.getName.replace(".txt", ".csv")) // retrieve existing examples - val lines = Source.fromFile(file).getLines().toList + val source = Source.fromFile(file) + val lines = source.getLines().toList + source.close() val (header, examples) = lines.partition(_.startsWith("#")) - val existingExampleLemmas = examples.map(_.split(" ")).flatten -// for (cet <- currentExampleTokens) println("||" + cet + "||") -// println("\n") + val existingExampleLemmas = examples.flatMap(_.split(" ")) val maxExamplesToAddPerOntologyLeaf = if (addExamplesProportionallyToCurrentNum) (examples.length * proportionToExpandBy).toInt else maxExamplesToAddPerOntologyLeafDefault -// println(maxExamplesToAddPerOntologyLeaf + "<- max len to write for leaf " + file.getName) // only the last item on the path in the header is used for querying, // but there could be multiple header lines in one ontology file @@ -134,7 +127,6 @@ object OntologyExtender extends App with LazyLogging { val bw = new BufferedWriter(new FileWriter(outfile)) if (inclOriginalLeaf) { bw.write(header.mkString("\n")) -// bw.write(lines.head) bw.write("\n" + examples.filter(_.length > 0).mkString("\n")) bw.write("\n") } @@ -164,7 
+156,7 @@ object OntologyExtender extends App with LazyLogging { } - logger.info(s"FILES SUCCEEDED:\t ${numOfFilesSucceeded}") + logger.info(s"FILES SUCCEEDED:\t $numOfFilesSucceeded") def seenMoreThanK(result: ScoredMatch, k: Int): Boolean = { result.count > k @@ -203,6 +195,7 @@ object OntologyExtender extends App with LazyLogging { } def getTermToLeafMap(ontologyDirPath: File): (Map[String, ArrayBuffer[String]], Seq[String]) = { + // map every token in existing examples to all the ontology leaf paths/headers it occurs in val files = ontologyDirPath.listFiles() val termToHeaderMap = mutable.Map[String, ArrayBuffer[String]]() val allHeaders = new ArrayBuffer[String]() @@ -291,23 +284,12 @@ object OntologyExtender extends App with LazyLogging { } def getHeaderLemmasNonFlat(fullHeader: String): Seq[Seq[String]] = { - + // used for decaying similarity score to the header, e.g., in a path #Events/IllnessAndDisease, the similarity score weight decay will be applied to Event, but not to Illness and Disease + // as the latter two are at the same level in the ontology val toReturn = fullHeader .replace("#", "").replace(" ", "") .split("/") .map(_.split("And|Or|(?=[A-Z])").filter(_.length >0).map(_.toLowerCase()).toSeq) -// println(toReturn) -// .map(_.split("/").tail.mkString("/")) -// .map(_.split("And|Or|(?=[A-Z])")) - // .filter(_!="/").map(_.toLowerCase())) -// .map(reader.convertToLemmas(_)) - -// for (tr <- toReturn) println("=>" + tr.mkString("|")) -// .map(_.split("And|Or|(?=[A-Z])")) -// .mkString("/") -// .map() -// .split("And|Or|(?=[A-Z])") -// .map(_.toLowerCase()).distinct toReturn.toSeq } @@ -324,6 +306,7 @@ object OntologyExtender extends App with LazyLogging { } def getStringToWriteDistinctTokens(results: Seq[String], maxExamplesToAddPerOntologyLeaf: Int, topHeader: String, otherHeaders: Seq[String], existingExampleLemmas: Seq[String]): String = { + // some of the filtering is only applied while writing the examples to ontology leaf files - for manual 
eval we keep the collocation results from taxero intact val string = results.distinct.filter(_.length > 0) .flatMap(res => res.replace("\n","").split(" ")).distinct .filter(term => !lemmaAlreadyExistsInExamples(term, existingExampleLemmas)) @@ -331,8 +314,8 @@ object OntologyExtender extends App with LazyLogging { // .filter(term => mostSimilarToCurrentLeaf(term, topHeader, otherHeaders)) .filter(term => !existsInOtherLeaves(term, termToLeaf, topHeader)) .slice(0, maxExamplesToAddPerOntologyLeaf).mkString("\n") - println("num of nodes added:\t" + string.split("\n").length + "leaf: " + topHeader) - "\nnew examples:\n" + string + logger.info(s"num of nodes added:\t ${string.split("\n").length} for leaf: $topHeader") + string } }