ontology extender #2

Open · wants to merge 4 commits into master
17 changes: 10 additions & 7 deletions src/main/resources/application.conf
@@ -1,19 +1,22 @@
-odinson.indexDir = "path/to/index" // the index will be saved here when using IndexCDRs and reac from here while using Taxero
-odinson.dataDir = "path/to/data/dir"
-apps.cdrDir = "path/to/cdr/files"
+odinson.indexDir = "/local/path/to/index" // the index will be saved here when using IndexCDRs and read from here while using Taxero
+odinson.dataDir = "/local/path/to/data/dir"
+apps.cdrDir = "/local/path/to/dir/containing/cdrs/for/indexing"
 
 taxero {
-wordEmbeddings = "path/to/vectors.txt"
+wordEmbeddings = "/local/path/to/vectors.txt"
 lemmatize = true
 }
 
 ontologyExtender {
-ontologyDir = "path/to/ontology/leaf/files"
-outputDir = "path/to/ontologyFilesEnriched"
+ontologyDir = "/local/path/to/ontology/files"
+outputDir = ""
 manualEval = false // true to output files that can be used for manual eval of the extender outputs
-similarityToHeaderTreshold = 0.75
+similarityToHeaderTreshold = 1.16 // best for the current decaying similarity-to-header score
+addExamplesProportionallyToCurrentNum = false
+proportionToExpandBy = 0.5
 maxExamplesToAddPerOntologyLeaf = 10
 inclOriginalLeaf = true
 onlyQueryByLeafHeaderTerms = false
 lemmatize = true
+countThreshold = 0 // an example has to occur more than countThreshold times to be added to the ontology
 }
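The jump from 0.75 to 1.16 reflects a change in what the threshold is compared against: `decayingScore` (added in OntologyExtender.scala below) sums one similarity per level of the header path, weighted 1.0 for the leaf, 0.5 for its parent, 0.25 for the grandparent, and so on, so the score can exceed 1. A minimal sketch of that arithmetic, assuming a three-level path:

```scala
// Sketch only: reproduces the weight schedule used by decayingScore
// for a three-level path such as Event/HealthAndDisease/Illness.
object DecayWeightsDemo extends App {
  // weights halve at each step up the path; reversed so the leaf level gets 1.0
  def weights(depth: Int): Seq[Double] =
    (0 until depth).map(i => 1.0 / math.pow(2.0, i)).reverse

  println(weights(3))     // Vector(0.25, 0.5, 1.0): root, parent, leaf
  println(weights(3).sum) // 1.75, the maximum attainable decayed score
  // a threshold of 1.16 therefore demands a strong leaf-level match
  // plus some similarity to the ancestor levels
}
```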
171 changes: 153 additions & 18 deletions src/main/scala/org/clulab/taxero/OntologyExtender.scala
@@ -11,9 +11,10 @@ import scala.io.Source
 import scala.util.{Failure, Success, Try}
 import com.typesafe.scalalogging.LazyLogging
 
+import scala.collection.mutable
+
 object OntologyExtender extends App with LazyLogging {
-// given a directory with ontology files, where each file has the ontology leaf header and a set of examples for the leaf of the ontology,
-// produces corresponding files with the example sets enriched with hyponyms and co-hyponyms
+// given a directory with ontology files, where each file has one or more ontology leaf paths as headers (e.g., Event/HealthAndDisease/Illness) and a set of examples for the leaf of the ontology, produces corresponding files with the example sets enriched with hyponyms and co-hyponyms
 
 
 // get the reader
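For orientation, the leaf-file shape the code below assumes, inferred from the `#`-header convention and the `lines.partition(_.startsWith("#"))` call; the concrete path and tokens are invented:

```scala
// Hypothetical leaf file: '#'-prefixed path headers, then one example per line.
object LeafFileShapeDemo extends App {
  val exampleLeafFile =
    """#Event/HealthAndDisease/Illness
      |malaria
      |cholera outbreak
      |measles
      |""".stripMargin

  val lines = exampleLeafFile.split("\n").toList
  val (header, examples) = lines.partition(_.startsWith("#"))
  println(header)   // List(#Event/HealthAndDisease/Illness)
  println(examples) // List(malaria, cholera outbreak, measles)
}
```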
@@ -31,18 +32,29 @@ object OntologyExtender extends App with LazyLogging {
 val inclOriginalLeaf = config.apply[Boolean]("ontologyExtender.inclOriginalLeaf")
 val lemmatize = config.apply[Boolean]("ontologyExtender.lemmatize")
 val onlyQueryByLeafHeaderTerms = config.apply[Boolean]("ontologyExtender.onlyQueryByLeafHeaderTerms")
-val maxExamplesToAddPerOntologyLeaf = config.apply[Int]("ontologyExtender.maxExamplesToAddPerOntologyLeaf")
+val countThreshold = config.apply[Int]("ontologyExtender.countThreshold")
+val addExamplesProportionallyToCurrentNum = config.apply[Boolean]("ontologyExtender.addExamplesProportionallyToCurrentNum")
+val proportionToExpandBy = config.apply[Double]("ontologyExtender.proportionToExpandBy")
+val maxExamplesToAddPerOntologyLeafDefault = config.apply[Int]("ontologyExtender.maxExamplesToAddPerOntologyLeaf")
 
 // create directories and files
 outputDir.mkdir()
 val files = ontologyDirectory.listFiles()
 
+val (termToLeaf, allHeaders) = getTermToLeafMap(ontologyDirectory)
+
+var examplesAddedPerFile = 0
+var numOfFilesSucceeded = 0
 for (file <- files) {
   Try {
     val outfile = new File(outputDir, file.getName.replace(".txt", ".csv"))
     // retrieve existing examples
-    val lines = Source.fromFile(file).getLines().toList
+    val source = Source.fromFile(file)
+    val lines = source.getLines().toList
+    source.close()
     val (header, examples) = lines.partition(_.startsWith("#"))
+    val existingExampleLemmas = examples.flatMap(_.split(" "))
+    val maxExamplesToAddPerOntologyLeaf = if (addExamplesProportionallyToCurrentNum) (examples.length * proportionToExpandBy).toInt else maxExamplesToAddPerOntologyLeafDefault
 
     // only the last item on the path in the header is used for querying,
     // but there could be multiple header lines in one ontology file
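The per-leaf cap computed above is either the fixed default or a proportion of the leaf's current size; a minimal sketch of that arithmetic, with an invented leaf of 14 examples:

```scala
// Sketch: how maxExamplesToAddPerOntologyLeaf is derived per file.
object ProportionalCapDemo extends App {
  val addExamplesProportionallyToCurrentNum = true
  val proportionToExpandBy = 0.5
  val maxExamplesToAddPerOntologyLeafDefault = 10
  val examples = List.fill(14)("some example") // pretend the leaf holds 14 lines

  val cap =
    if (addExamplesProportionallyToCurrentNum) (examples.length * proportionToExpandBy).toInt
    else maxExamplesToAddPerOntologyLeafDefault
  println(cap) // 7: the leaf may grow by at most half its current size
}
```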
@@ -60,6 +72,7 @@

     // lemmas in the header - used for filtering examples unrelated to the ontology leaf; use the full header, not only its last item, because while some terms there are too generic to use for queries, they help to narrow down the general topic of the ontology leaf
     val headerLemmas = getHeaderLemmas(header)
+    val headerLemmasNested = getHeaderLemmasNonFlat(header.head) // just take the first header
 
     val queries = if (onlyQueryByLeafHeaderTerms) {
       headerQueries
@@ -76,44 +89,44 @@
       val egAsSeq = eg.split(" ")
       // hyponym results
       val results = reader.getRankedHyponyms(egAsSeq, lemmatize)
-      for (r <- results) if (isSimilarToLeafHeader(r, headerLemmas, similarityToHeaderTreshold)) resultsFromAllTerms.append(r)
+      for (r <- results) if (isSimilarToLeafWithDecay(r, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(r, countThreshold)) resultsFromAllTerms.append(r)
       // cohyponym results
       val coResults = reader.getRankedCohyponyms(egAsSeq, lemmatize)
-      for (r <- coResults) if (isSimilarToLeafHeader(r, headerLemmas, similarityToHeaderTreshold)) resultsFromAllTerms.append(r)
+      for (r <- coResults) if (isSimilarToLeafWithDecay(r, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(r, countThreshold)) resultsFromAllTerms.append(r)
 
       if (manualEval) {
         val (singleWord, multiWord) = results.partition(_.query.length < 2)
 
-        for (sw <- singleWord) if (isSimilarToLeafHeader(sw, headerLemmas, similarityToHeaderTreshold)) singleWordQueryResult.append(sw)
-        for (mw <- multiWord) if (isSimilarToLeafHeader(mw, headerLemmas, similarityToHeaderTreshold)) multiWordQueryResult.append(mw)
+        for (sw <- singleWord) if (isSimilarToLeafWithDecay(sw, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(sw, countThreshold)) singleWordQueryResult.append(sw)
+        for (mw <- multiWord) if (isSimilarToLeafWithDecay(mw, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(mw, countThreshold)) multiWordQueryResult.append(mw)
 
         val (singleWordCohyp, multiWordCohyp) = coResults.partition(_.query.length < 2)
-        for (sw <- singleWordCohyp) if (isSimilarToLeafHeader(sw, headerLemmas, similarityToHeaderTreshold)) singleWordQueryResult.append(sw)
-        for (mw <- multiWordCohyp) if (isSimilarToLeafHeader(mw, headerLemmas, similarityToHeaderTreshold)) multiWordQueryResult.append(mw)
+        for (sw <- singleWordCohyp) if (isSimilarToLeafWithDecay(sw, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(sw, countThreshold)) singleWordQueryResult.append(sw)
+        for (mw <- multiWordCohyp) if (isSimilarToLeafWithDecay(mw, headerLemmasNested, similarityToHeaderTreshold) && seenMoreThanK(mw, countThreshold)) multiWordQueryResult.append(mw)
       }
     }
 
     // used for filtering out results that match queries
     val cleanQueries = queries.filter(_.length > 0).map(_.toLowerCase())
 
     val sortedResults = if (manualEval) {
-      returnResultsForManualEval(resultsFromAllTerms, cleanQueries, headerLemmas)
+      returnResultsForManualEval(resultsFromAllTerms, cleanQueries, headerLemmasNested)
     } else {
       resultsFromAllTerms.filter(res => !cleanQueries.contains(res.result.mkString(" ").toLowerCase)).sortBy(-_.score).map(res => res.result.mkString(" ") + "\n")
     }
 
     val sortedResultsSingleWord = if (manualEval) {
-      returnResultsForManualEval(singleWordQueryResult, cleanQueries, headerLemmas)
+      returnResultsForManualEval(singleWordQueryResult, cleanQueries, headerLemmasNested)
     } else Seq.empty
 
     val sortedResultsMultiWord = if (manualEval) {
-      returnResultsForManualEval(multiWordQueryResult, cleanQueries, headerLemmas)
+      returnResultsForManualEval(multiWordQueryResult, cleanQueries, headerLemmasNested)
     } else Seq.empty
 
 
     val bw = new BufferedWriter(new FileWriter(outfile))
     if (inclOriginalLeaf) {
-      bw.write(lines.head)
+      bw.write(header.mkString("\n"))
       bw.write("\n" + examples.filter(_.length > 0).mkString("\n"))
       bw.write("\n")
     }
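Each hyponym and co-hyponym candidate now has to clear two gates: the decayed similarity threshold and the corpus-count threshold. A condensed sketch of how the gates compose, with `ScoredMatch` reduced to the fields used here (the real class lives in taxero) and the decayed score stubbed as a plain function:

```scala
import scala.collection.mutable.ArrayBuffer

object GateDemo extends App {
  // Reduced stand-in for taxero's ScoredMatch, just for this sketch.
  case class ScoredMatch(result: Seq[String], query: Seq[String],
                         score: Double, similarity: Double, count: Int)

  // keep a candidate iff it clears the decayed-similarity threshold
  // AND was observed more than countThreshold times
  def keep(r: ScoredMatch, decayed: ScoredMatch => Double,
           threshold: Double, countThreshold: Int): Boolean =
    decayed(r) > threshold && r.count > countThreshold

  val kept = new ArrayBuffer[ScoredMatch]()
  val candidates = Seq(
    ScoredMatch(Seq("cholera"), Seq("illness"), 2.1, 0.8, count = 5),
    ScoredMatch(Seq("thing"), Seq("illness"), 0.3, 0.1, count = 1)
  )
  // stub: pretend the decayed score is similarity scaled to the 1.75 maximum
  for (r <- candidates) if (keep(r, _.similarity * 1.75, 1.16, 0)) kept.append(r)
  println(kept.map(_.result.mkString(" "))) // only "cholera" survives: 1.4 > 1.16 and 5 > 0
}
```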
@@ -128,20 +141,119 @@
       bw.write(getStringToWrite(sortedResultsMultiWord, maxExamplesToAddPerOntologyLeaf))
       bw.close()
     } else {
-      bw.write(getStringToWrite(sortedResults, maxExamplesToAddPerOntologyLeaf))
+      bw.write(getStringToWriteDistinctTokens(sortedResults, maxExamplesToAddPerOntologyLeaf, header.head, allHeaders, existingExampleLemmas))
       bw.close()
     }
 
   } match {
-    case Success(_) => logger.info(s"extended ontology leaf ${file.getName}")
+    case Success(_) => {
+      logger.info(s"extended ontology leaf ${file.getName}")
+      numOfFilesSucceeded += 1
+    }
+
     case Failure(e) => logger.error(s"failed to extend ontology leaf ${file.getName}", e)
   }
 
 }
 
+logger.info(s"FILES SUCCEEDED:\t $numOfFilesSucceeded")
+
+def seenMoreThanK(result: ScoredMatch, k: Int): Boolean = {
+  result.count > k
+}
+
+def lemmaAlreadyExistsInExamples(token: String, examples: Seq[String]): Boolean = {
+  examples.contains(reader.convertToLemmas(Seq(token)).head)
+}
+
 def isSimilarToLeafHeader(result: ScoredMatch, header: Seq[String], similarityToHeaderTreshold: Double): Boolean = {
   reader.similarityScore(result.result.map(_.toLowerCase()), header) > similarityToHeaderTreshold
 }
 
+def decayingScore(result: ScoredMatch, header: Seq[Seq[String]]): Double = {
+  // weights halve at each level up the path; reversed below so the leaf level gets weight 1.0
+  val scoreWeights = new ArrayBuffer[Double]()
+  var highestScore = 1.0
+  scoreWeights.append(highestScore)
+  for (i <- 1 to header.length - 1) {
+    val nextScore = highestScore / 2
+    scoreWeights.append(nextScore)
+    highestScore = nextScore
+  }
+
+  var overallScore = 0.0
+  val scoreWeightsReversed = scoreWeights.reverse
+  for ((node, ind) <- header.zipWithIndex) {
+    overallScore += reader.similarityScore(result.result.map(_.toLowerCase()), node) * scoreWeightsReversed(ind)
+  }
+  overallScore
+}
+
+def isSimilarToLeafWithDecay(result: ScoredMatch, header: Seq[Seq[String]], similarityToHeaderThreshold: Double): Boolean = {
+  val score = decayingScore(result, header)
+  score > similarityToHeaderThreshold
+}
+
+def getTermToLeafMap(ontologyDirPath: File): (Map[String, ArrayBuffer[String]], Seq[String]) = {
+  // map every token in the existing examples to all the ontology leaf paths/headers it occurs in
+  val files = ontologyDirPath.listFiles()
+  val termToHeaderMap = mutable.Map[String, ArrayBuffer[String]]()
+  val allHeaders = new ArrayBuffer[String]()
+
+  for (file <- files) {
+    Try {
+      val source = Source.fromFile(file)
+      val lines = source.getLines().toList
+      source.close()
+      val (header, examples) = lines.partition(_.startsWith("#"))
+      allHeaders.append(header.head)
+      val distinctExamples = examples.flatMap(eg => eg.split(" ")).distinct
+      for (de <- distinctExamples) {
+        if (termToHeaderMap.contains(de)) {
+          termToHeaderMap(de).append(header.head)
+        } else {
+          termToHeaderMap(de) = new ArrayBuffer[String]()
+          termToHeaderMap(de).append(header.head)
+        }
+      }
+    } match {
+      case Success(_) => ()
+      case Failure(e) => logger.error(s"failed to get examples from ${file.getName}", e)
+    }
+  }
+  (termToHeaderMap.toMap, allHeaders)
+}
+
+def existsInOtherLeaves(currentTerm: String, termToLeaves: Map[String, Seq[String]], topHeader: String): Boolean = {
+  if (!termToLeaves.contains(currentTerm)) return false // the term does not exist as an example in any of the ontology leaves
+  if (termToLeaves(currentTerm).length > 1) return true // the term exists in more than one ontology leaf
+  if (termToLeaves(currentTerm).mkString("/") != topHeader) return true // exactly one leaf contains the current term as an example and it is not the current ontology leaf (= topHeader)
+  false
+}
+
+def findMostSimilarHeader(token: String, otherNodeHeaders: Seq[String]): String = {
+  val scores = new ArrayBuffer[Double]()
+  for (h <- otherNodeHeaders) {
+    val headerLemmas = getHeaderLemmas(Seq(h))
+    val score = reader.similarityScore(Seq(token), headerLemmas)
+    scores.append(score)
+  }
+  val scoresWithIdx = scores.zipWithIndex
+  val sortedScoresWithIdx = scoresWithIdx.sortBy(_._1).reverse
+  otherNodeHeaders(sortedScoresWithIdx.head._2)
+}
+
+def mostSimilarToCurrentLeaf(token: String, currentHeader: String, otherNodeHeaders: Seq[String]): Boolean = {
+  // among distinct result terms, keep a term only if it is most similar to the current header; currently not used: this filtering is too aggressive
+  findMostSimilarHeader(token, otherNodeHeaders) == currentHeader
+}
+
 // not currently used
 def checkIfQueryIsRelevant(results: Seq[ScoredMatch], fullHeader: Seq[String]): Boolean = {
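The cross-leaf filter introduced above can be exercised in isolation. A toy version of the table that `getTermToLeafMap` builds, with two invented leaves:

```scala
import scala.collection.mutable.ArrayBuffer

object CrossLeafDemo extends App {
  // Invented term-to-leaf table, as getTermToLeafMap would build it from
  // two leaf files headed "#Event/Illness" and "#Event/Treatment".
  val termToLeaf: Map[String, ArrayBuffer[String]] = Map(
    "malaria" -> ArrayBuffer("#Event/Illness"),
    "quinine" -> ArrayBuffer("#Event/Illness", "#Event/Treatment")
  )

  def existsInOtherLeaves(term: String, topHeader: String): Boolean = {
    if (!termToLeaf.contains(term)) false            // term is new to the ontology
    else if (termToLeaf(term).length > 1) true       // term is ambiguous across leaves
    else termToLeaf(term).mkString("/") != topHeader // term lives in exactly one, other, leaf
  }

  println(existsInOtherLeaves("quinine", "#Event/Illness")) // true: ambiguous, so rejected
  println(existsInOtherLeaves("malaria", "#Event/Illness")) // false: only in the current leaf
  println(existsInOtherLeaves("dengue", "#Event/Illness"))  // false: unseen, passes the gate
}
```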
@@ -171,16 +283,39 @@
.split("/").distinct)
}

def returnResultsForManualEval(results: Seq[ScoredMatch], cleanQueries: Seq[String], headerLemmas: Seq[String]): Seq[String] = {
def getHeaderLemmasNonFlat(fullHeader: String): Seq[Seq[String]] = {
// used for decaying similarity score to the header, e.g., in a path #Events/IllnessAndDisease, the similarity score weight decay will be applied to Event, but not to Illness and Disease
// as the latter two are at the same level in the ontology
val toReturn = fullHeader
.replace("#", "").replace(" ", "")
.split("/")
.map(_.split("And|Or|(?=[A-Z])").filter(_.length >0).map(_.toLowerCase()).toSeq)
toReturn.toSeq
}

def returnResultsForManualEval(results: Seq[ScoredMatch], cleanQueries: Seq[String], headerLemmas: Seq[Seq[String]]): Seq[String] = {
results.filter(
res => !cleanQueries.contains(res.result.mkString(" ").toLowerCase))
.sortBy(-_.score)
.map(res => res.result.mkString(" ") + "\t" + res.query.mkString(" ") + "\t" + res.score.toString + "\t" + res.similarity.toString + "\t" + reader.similarityScore(res.result.map(_.toLowerCase()), headerLemmas).toString + "\n")
.map(res => res.result.mkString(" ") + "\t" + res.query.mkString(" ") + "\t" + res.score.toString + "\t" + res.similarity.toString + "\t" + decayingScore(res, headerLemmas).toString + "\n")
}

def getStringToWrite(results: Seq[String], maxExamplesToAddPerOntologyLeaf: Int): String = {
val string = results.distinct.filter(_.length > 0).slice(0, maxExamplesToAddPerOntologyLeaf).mkString("")
string
}

def getStringToWriteDistinctTokens(results: Seq[String], maxExamplesToAddPerOntologyLeaf: Int, topHeader: String, otherHeaders: Seq[String], existingExampleLemmas: Seq[String]): String = {
// some of the filtering is only applied while writing the examples to ontology leaf files - for manual eval we keep the collocation results from taxero intact
val string = results.distinct.filter(_.length > 0)
.flatMap(res => res.replace("\n","").split(" ")).distinct
.filter(term => !lemmaAlreadyExistsInExamples(term, existingExampleLemmas))
.filter(term => term.toLowerCase() == term)
// .filter(term => mostSimilarToCurrentLeaf(term, topHeader, otherHeaders))
.filter(term => !existsInOtherLeaves(term, termToLeaf, topHeader))
.slice(0, maxExamplesToAddPerOntologyLeaf).mkString("\n")
logger.info(s"num of nodes added:\t ${string.split("\n").length} for leaf: $topHeader")
string
}

}
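Finally, `getHeaderLemmasNonFlat` can be checked standalone; this copy shows how a header is split into per-level term groups, which is what lets the decaying score weight whole levels rather than individual words:

```scala
object HeaderSplitDemo extends App {
  def getHeaderLemmasNonFlat(fullHeader: String): Seq[Seq[String]] =
    fullHeader
      .replace("#", "").replace(" ", "")
      .split("/")
      .map(_.split("And|Or|(?=[A-Z])").filter(_.length > 0).map(_.toLowerCase()).toSeq)
      .toSeq

  println(getHeaderLemmasNonFlat("#Event/HealthAndDisease/Illness"))
  // per level: [event], [health, disease], [illness]
}
```

One caveat worth noting: the splitter matches the literals "And" and "Or" anywhere, so a level such as `Organization` comes out as `ganization`; harmless for the current path names, but worth keeping in mind when new leaves are added.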