From c3980b794bc06a6667eb8081e4a3bf6b351f08d1 Mon Sep 17 00:00:00 2001
From: Gus Hahn-Powell
Date: Tue, 12 Nov 2024 02:34:17 -0700
Subject: [PATCH] Compose and execute disjunction of basic patterns (#5)

## Summary of Changes

- Support compilation and execution of a disjunction of "basic" Odinson patterns
- POST `/api/execute/grammar` now takes a YAML grammar as its text-based body, with options specified via query params
- Changes to the Mention JSON response:
  - `text` attributes and complete spans are now included

### Related issues

- See https://gist.github.com/myedibleenso/bb383ba5ad6267eccfa7a46be7156c46

---
 .github/workflows/docs.yml                    |   2 +-
 app/ai/lum/odinson/rest/json/package.scala    | 159 ++++++++++++----
 .../rest/requests/GrammarRequest.scala        |   1 +
 .../rest/requests/SimplePatternsRequest.scala |  19 ++
 .../odinson/rest/utils/ExceptionUtils.scala   |   1 +
 app/ai/lum/odinson/rest/utils/StartEnd.scala  |   3 +
 app/controllers/OdinsonController.scala       | 171 ++++++++++++-----
 conf/routes                                   |   5 +-
 public/schema/odinson.yaml                    | 147 ++++++++++++++-
 python/lum/odinson/doc.py                     |  54 +++---
 python/lum/odinson/rest/api.py                | 107 ++++++++---
 python/lum/odinson/rest/docker.py             |  49 +++--
 python/lum/odinson/rest/requests.py           |  35 +++-
 python/lum/odinson/rest/responses.py          |  48 +++--
 .../odinson/tests/test_odinson_document.py    |  13 +-
 .../odinson/tests/test_odinson_sentence.py    |   5 +-
 test/controllers/OdinsonControllerSpec.scala  | 176 +++++++++++-------
 17 files changed, 738 insertions(+), 257 deletions(-)
 create mode 100644 app/ai/lum/odinson/rest/requests/SimplePatternsRequest.scala
 create mode 100644 app/ai/lum/odinson/rest/utils/StartEnd.scala

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 3ecb568..5264082 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -76,7 +76,7 @@ jobs:
       # docs (other)
       - name: "Create documentation (other)"
         run: |
-          docker run -i -v "$GITHUB_WORKSPACE:/app" parsertongue/mkdocs:latest mkdocs build -c
+          mkdocs build -c
       - name: Deploy docs
         # FIXME: re-enable later
         #if: github.ref == 'refs/heads/main'
diff --git a/app/ai/lum/odinson/rest/json/package.scala b/app/ai/lum/odinson/rest/json/package.scala
index b0cf9d6..d4b9536 100644
--- a/app/ai/lum/odinson/rest/json/package.scala
+++ b/app/ai/lum/odinson/rest/json/package.scala
@@ -4,7 +4,7 @@ import ai.lum.common.ConfigUtils._
 import ai.lum.odinson.lucene.OdinResults
 import ai.lum.odinson.lucene.search.OdinsonScoreDoc
 import ai.lum.odinson.{
-  Document => OdinsonDocument,
+  //Document => OdinsonDocument,
   EventMatch,
   ExtractorEngine,
   Mention,
@@ -12,14 +12,37 @@ import ai.lum.odinson.{
   NGramMatch,
   OdinsonMatch
 }
-import ai.lum.odinson.rest.utils.ExtractorEngineUtils
+import ai.lum.odinson.rest.utils.{ ExtractorEngineUtils, StartEnd }
 import com.typesafe.config.Config
 import play.api.http.ContentTypes
 import play.api.libs.json._
 import play.api.mvc._
 import play.api.mvc.Results._
+import scala.annotation.tailrec
 
 package object json {
+
+  @tailrec
+  def getStartEnd(
+    m: OdinsonMatch,
+    start: Int,
+    end: Int,
+    remaining: List[NamedCapture]
+  ): StartEnd = m match {
+    case finished if remaining.isEmpty =>
+      StartEnd(
+        start = List(start, finished.start).min,
+        end = List(end, finished.end).max
+      )
+    case _ =>
+      val next: OdinsonMatch = remaining.head.capturedMatch
+      getStartEnd(
+        next,
+        start = List(start, next.start).min,
+        end = List(end, next.end).max,
+        remaining = remaining.tail ::: m.namedCaptures.toList
+      )
+  }
 
   /** convenience methods for formatting Play 2 Json */
   implicit class JsonOps(json: JsValue) {
@@ 
-80,25 +103,50 @@ package object json { ) } - def mkJsonForMention(mention: Mention): Json.JsValueWrapper = { - + def getTokens(luceneDocId: Int): Seq[String] = { val displayField = engine.index.displayField // val doc: LuceneDocument = engine.indexSearcher.doc(mention.luceneDocId) // We want **all** tokens for the sentence - val tokens = engine.dataGatherer.getTokens(mention.luceneDocId, displayField) + engine.dataGatherer.getTokens(luceneDocId, displayField) + } + + def mkJsonForMention(mention: Mention): Json.JsValueWrapper = { + // We want **all** tokens for the sentence + val tokens = getTokens(mention.luceneDocId) + //println(s"""(${mention.start} - ${mention.end} w/ label ${mention.label.getOrElse("???")}): ${tokens.slice(mention.start, mention.end).mkString(" ")}""") + mention.odinsonMatch match { + case em: EventMatch => + Json.obj( + // format: off + "sentenceId" -> mention.luceneDocId, + // "score" -> odinsonScoreDoc.score, + "label" -> mention.label, + "documentId" -> getOdinsonDocId(mention.luceneDocId), + "sentenceIndex" -> getSentenceIndex(mention.luceneDocId), + "words" -> JsArray(tokens.map(JsString)), + "foundBy" -> mention.foundBy, + "trigger" -> Json.obj( + "start" -> em.trigger.start, + "end" -> em.trigger.end + ), + "match" -> mkJsonForMatch(m=em, luceneDocId=mention.luceneDocId) + // format: on + ) + case om => + Json.obj( + // format: off + "sentenceId" -> mention.luceneDocId, + // "score" -> odinsonScoreDoc.score, + "label" -> mention.label, + "documentId" -> getOdinsonDocId(mention.luceneDocId), + "sentenceIndex" -> getSentenceIndex(mention.luceneDocId), + "words" -> JsArray(tokens.map(JsString)), + "foundBy" -> mention.foundBy, + "match" -> mkJsonForMatch(m=om, luceneDocId=mention.luceneDocId) + // format: on + ) + } - Json.obj( - // format: off - "sentenceId" -> mention.luceneDocId, - // "score" -> odinsonScoreDoc.score, - "label" -> mention.label, - "documentId" -> getOdinsonDocId(mention.luceneDocId), - "sentenceIndex" -> getSentenceIndex(mention.luceneDocId), - "words" -> JsArray(tokens.map(JsString)), - "foundBy" -> mention.foundBy, - "match" -> Json.arr(mkJsonForMatch(mention.odinsonMatch)) - // format: on - ) } def mkJsonForScoreDoc(odinsonScoreDoc: OdinsonScoreDoc): Json.JsValueWrapper = { @@ -114,40 +162,80 @@ package object json { "documentId" -> getOdinsonDocId(odinsonScoreDoc.doc), "sentenceIndex" -> getSentenceIndex(odinsonScoreDoc.doc), "words" -> JsArray(tokens.map(JsString)), - "matches" -> Json.arr(odinsonScoreDoc.matches.map(mkJsonForMatch): _*) + "matches" -> Json.arr(odinsonScoreDoc.matches.map{ m => mkJsonForMatch(m=m, luceneDocId=odinsonScoreDoc.doc)}:_*) + //"matches" -> Json.arr(odinsonScoreDoc.matches.map(mkJsonForMatch): _*) // format: on ) } - def mkJsonForMatch(m: OdinsonMatch): Json.JsValueWrapper = m match { + def mkJsonForMatch(m: OdinsonMatch, luceneDocId: Int): Json.JsValueWrapper = { + val se: StartEnd = getStartEnd( + m = m, + start = m.start, + end = m.end, + remaining = m.namedCaptures.toList + ) + val tokens = getTokens(luceneDocId) + val text = tokens.slice(se.start, se.end).mkString(" ") + m match { case em: EventMatch => Json.obj( - "start" -> em.trigger.start, - "end" -> em.trigger.end, - // FIXME: should we simplify this? 
- "trigger" -> mkJsonForMatch(em), - "namedCaptures" -> Json.arr(em.namedCaptures.map(mkJsonForNamedCapture): _*) + "start" -> se.start, + "end" -> se.end, + "text" -> text, + "trigger" -> Json.obj( + "start" -> em.trigger.start, + "end" -> em.trigger.end, + "text" -> tokens.slice(em.trigger.start, em.trigger.end).mkString(" "), + ), + "namedCaptures" -> { + em.namedCaptures match { + case nothing if nothing.size == 0 => JsNull + case captures => + Json.arr(captures.map{c => mkJsonForNamedCapture(c, luceneDocId)}:_*) + } + } // ignore argumentMetadata ) - case ngram: NGramMatch => + case _: NGramMatch => Json.obj( - "start" -> ngram.start, - "end" -> ngram.end + "start" -> se.start, + "end" -> se.end, + "text" -> text, // avoid including empty namedCaptures ) case other@_ => - Json.obj( - "start" -> m.start, - "end" -> m.end, - "namedCaptures" -> Json.arr(m.namedCaptures.map(mkJsonForNamedCapture): _*) - ) + m.namedCaptures match { + case nothing if nothing.size == 0 => + Json.obj( + "start" -> se.start, + "end" -> se.end, + "text" -> text + ) + case captures => + Json.obj( + "start" -> se.start, + "end" -> se.end, + "text" -> text, + "trigger" -> Json.obj( + "start" -> m.start, + "end" -> m.end, + "text" -> tokens.slice(m.start, m.end).mkString(" ") + ), + "namedCaptures" -> Json.arr(captures.map{nc => mkJsonForNamedCapture(nc, luceneDocId)}:_*) + ) + } + } } - def mkJsonForNamedCapture(namedCapture: NamedCapture): Json.JsValueWrapper = { + def mkJsonForNamedCapture(namedCapture: NamedCapture, luceneDocId: Int): Json.JsValueWrapper = { + //val tokens = getTokens(luceneDocId) Json.obj( "name" -> namedCapture.name, "label" -> namedCapture.label, - "capturedMatch" -> mkJsonForMatch(namedCapture.capturedMatch) + //"text" -> tokens.slice(namedCapture.start, namedCapture.end).mkString(" "), + //"capturedMatch" + "match" -> mkJsonForMatch(namedCapture.capturedMatch, luceneDocId) ) } @@ -162,7 +250,8 @@ package object json { "documentId" -> getOdinsonDocId(odinsonScoreDoc.doc), "sentenceIndex" -> getSentenceIndex(odinsonScoreDoc.doc), "sentence" -> mkUnabridgedSentenceJson(odinsonScoreDoc.doc, config), - "matches" -> Json.arr(odinsonScoreDoc.matches.map(mkJsonForMatch): _*) + "matches" -> Json.arr(odinsonScoreDoc.matches.map{m => mkJsonForMatch(m=m, luceneDocId=odinsonScoreDoc.doc)}:_*) + //matches.map{mkJsonForMatch): _*) // format: on ) } diff --git a/app/ai/lum/odinson/rest/requests/GrammarRequest.scala b/app/ai/lum/odinson/rest/requests/GrammarRequest.scala index ec39a41..5d49c8d 100644 --- a/app/ai/lum/odinson/rest/requests/GrammarRequest.scala +++ b/app/ai/lum/odinson/rest/requests/GrammarRequest.scala @@ -10,6 +10,7 @@ case class GrammarRequest( pretty: Option[Boolean] = None ) +// Deprecated object GrammarRequest { implicit val fmt: OFormat[GrammarRequest] = Json.format[GrammarRequest] implicit val read: Reads[GrammarRequest] = Json.reads[GrammarRequest] diff --git a/app/ai/lum/odinson/rest/requests/SimplePatternsRequest.scala b/app/ai/lum/odinson/rest/requests/SimplePatternsRequest.scala new file mode 100644 index 0000000..6624445 --- /dev/null +++ b/app/ai/lum/odinson/rest/requests/SimplePatternsRequest.scala @@ -0,0 +1,19 @@ +package ai.lum.odinson.rest.requests + +import play.api.libs.json._ + +case class SimplePatternsRequest( + patterns: List[String], + metadataQuery: Option[String] = None, + // label: Option[String] = None, + // commit: Option[Boolean] = None, + prevDoc: Option[Int] = None, + prevScore: Option[Float] = None, + enriched: Option[Boolean] = None, + pretty: 
Option[Boolean] = None +) + +object SimplePatternsRequest { + implicit val fmt: OFormat[SimplePatternsRequest] = Json.format[SimplePatternsRequest] + implicit val read: Reads[SimplePatternsRequest] = Json.reads[SimplePatternsRequest] +} diff --git a/app/ai/lum/odinson/rest/utils/ExceptionUtils.scala b/app/ai/lum/odinson/rest/utils/ExceptionUtils.scala index 493f47d..add7157 100644 --- a/app/ai/lum/odinson/rest/utils/ExceptionUtils.scala +++ b/app/ai/lum/odinson/rest/utils/ExceptionUtils.scala @@ -16,6 +16,7 @@ object ExceptionUtils { e: Throwable, message: Option[String] = None ): Result = { + //println(s"e:\t${ApacheExceptionUtils.getStackTrace(e)}") val errorMsg: String = message match { // .getMessage , .getStackTrace , .getRootCause case None => ApacheExceptionUtils.getMessage(e) diff --git a/app/ai/lum/odinson/rest/utils/StartEnd.scala b/app/ai/lum/odinson/rest/utils/StartEnd.scala new file mode 100644 index 0000000..e4691b8 --- /dev/null +++ b/app/ai/lum/odinson/rest/utils/StartEnd.scala @@ -0,0 +1,3 @@ +package ai.lum.odinson.rest.utils + +case class StartEnd(start: Int, end: Int) \ No newline at end of file diff --git a/app/controllers/OdinsonController.scala b/app/controllers/OdinsonController.scala index 05d59a7..b2f3a6c 100644 --- a/app/controllers/OdinsonController.scala +++ b/app/controllers/OdinsonController.scala @@ -1,10 +1,11 @@ package controllers +import ai.lum.odinson.utils.DisplayUtils import ai.lum.common.ConfigFactory import ai.lum.common.ConfigUtils._ import ai.lum.odinson.digraph.Vocabulary import ai.lum.odinson.lucene._ -import ai.lum.odinson.lucene.search.{ OdinsonQuery, OdinsonScoreDoc } +import ai.lum.odinson.lucene.search.{ OdinsonQuery, OdinsonScoreDoc, OdinOrQuery } import ai.lum.odinson.{ Document => OdinsonDocument, ExtractorEngine, Mention } //import ai.lum.odinson.lucene.index.OdinsonIndexWriter import com.typesafe.config.{ Config, ConfigRenderOptions, ConfigValueFactory } @@ -18,7 +19,6 @@ import org.apache.lucene.store.FSDirectory import play.api.http.ContentTypes import play.api.libs.json._ import play.api.mvc._ - import java.io.File import java.nio.file.Path //import java.nio.file.{ Files, Path } @@ -322,11 +322,13 @@ class OdinsonController @Inject() ( */ def bodyToString(body: AnyContent): Option[String] = try { - val contents = body.asRaw.get.asBytes().get.decodeString(StandardCharsets.UTF_8) - Some(contents) + //.get.asBytes().get.decodeString(StandardCharsets.UTF_8) + body.asText } catch { case _: Throwable => - None + val contents = body.asRaw.get.asBytes().get.decodeString(StandardCharsets.UTF_8) + Some(contents) + ///None } /** Validates an Odinson rule. 
@@ -443,53 +445,80 @@ class OdinsonController @Inject() ( * @return * JSON of matches */ - def executeGrammar() = Action { request => - // FIXME: do this in a non-blocking way - ExtractorEngine.usingEngine(config) { engine => - // FIXME: replace .get with validation check - val gr = request.body.asJson.get.as[GrammarRequest] - val grammar = gr.grammar - val maxDocs = gr.maxDocs - val metadataQuery = gr.metadataQuery - val allowTriggerOverlaps = gr.allowTriggerOverlaps.getOrElse(false) - val pretty = gr.pretty - try { - // rules -> OdinsonQuery - val extractors = metadataQuery match { - case None => engine.ruleReader.compileRuleString(grammar) - case Some(raw) => - val mq = engine.compiler.mkParentQuery(raw) - engine.compileRuleString(rules=grammar, metadataFilter=mq) - } - - val start = System.currentTimeMillis() - - val maxSentences: Int = maxDocs match { - case Some(md) => md - case None => engine.numDocs() - } - - val mentions: Seq[Mention] = { - // FIXME: should deal in iterators to better support pagination...? - val iterator = engine.extractMentions( - extractors, - numSentences = maxSentences, - allowTriggerOverlaps = allowTriggerOverlaps, - disableMatchSelector = false - ) - iterator.toVector - } - - val duration = (System.currentTimeMillis() - start) / 1000f // duration in seconds - - val json = - Json.toJson(engine.mkMentionsJson(None, duration, allowTriggerOverlaps, mentions)) - json.format(pretty) - } catch handleNonFatal + def executeGrammar( + maxDocs: Option[Int] = None, + allowTriggerOverlaps: Option[Boolean] = None, + metadataQuery: Option[String] = None, + label: Option[String] = None, + pretty: Option[Boolean] = None + ): Action[String] = Action(parse.text) { (request: Request[String]) => + try { + request.body match { + case grammar: String => + // validation here + // FIXME: do this in a non-blocking way + val engine = ExtractorEngine.fromConfig(config) + //ExtractorEngine.usingEngine(config) { engine => + val allowOverlaps: Boolean = allowTriggerOverlaps.getOrElse(false) + try { + // rules -> OdinsonQuery + val extractors = metadataQuery match { + case None => engine.ruleReader.compileRuleString(grammar) + case Some(raw) => + val mq = engine.compiler.mkParentQuery(raw) + engine.compileRuleString(rules=grammar, metadataFilter=mq) + } + + val start = System.currentTimeMillis() + + val maxSentences: Int = maxDocs match { + case Some(md) => md + case None => engine.numDocs() + } + + val mentions: Seq[Mention] = { + // FIXME: should deal in iterators to better support pagination...? + //println(s"Using state ${engine.state}") + val iterator = engine.extractMentions( + extractors, + numSentences = maxSentences, + allowTriggerOverlaps = allowOverlaps, + disableMatchSelector = false + ) + iterator.toVector + } + + val filteredMentions = label match { + case Some(lbl) => mentions.filter(_.label == Some(lbl)) + case None => mentions + } + + val duration = (System.currentTimeMillis() - start) / 1000f // duration in seconds + val json = + Json.toJson(engine.mkMentionsJson(None, duration, allowOverlaps, filteredMentions)) + // println(s"${engine.state.getAllMentions().toSeq.size} mentions in state") + // engine.state.getAllMentions().foreach{ m => DisplayUtils.displayMention(m, engine)} + json.format(pretty) + } catch { + case e: Throwable => + handleNonFatal(e) + } finally { + engine.close() + } + // } + case _ => + BadRequest("Malformed body. 
Send grammar.") + } + } catch { + case error: Throwable => + Status(500)( + Json.toJson(OdinsonErrors.fromException(error)) + ) } } - /** @param odinsonQuery + /** Executes the provided Odinson pattern. + * @param odinsonQuery * An Odinson pattern * @param metadataQuery * A Lucene query to filter documents (optional). @@ -553,6 +582,43 @@ class OdinsonController @Inject() ( } } + /** Applies a disjunction of the provided patterns against the corpus. + * @return + * JSON of matches + */ + def runDisjunctiveQuery() = Action { request => + // FIXME: do this in a non-blocking way + ExtractorEngine.usingEngine(config) { engine => + // FIXME: replace .get with validation check + val spr = request.body.asJson.get.as[SimplePatternsRequest] + try { + val patterns: List[OdinsonQuery] = spr.patterns.map(engine.compiler.mkQuery).toList + val disjunctiveQuery = new OdinOrQuery(patterns, field = patterns.head.getField) + val oq = spr.metadataQuery match { + case Some(pq) => + engine.compiler.mkQuery(disjunctiveQuery, pq) + case None => + disjunctiveQuery + } + val start = System.currentTimeMillis() + val results: OdinResults = retrieveResults(engine, oq, spr.prevDoc, spr.prevScore) + val duration = (System.currentTimeMillis() - start) / 1000f // duration in seconds + + // NOTE: no use of state here + + val json = Json.toJson(engine.mkJson( + spr.patterns.map{ patt => s"(${patt})"}.mkString(" | "), + spr.metadataQuery, + duration, + results, + spr.enriched.getOrElse(false), + config + )) + json.format(spr.pretty) + } catch handleNonFatal + } + } + def getMetadataJsonByDocumentId( odinsonDocId: String, pretty: Option[Boolean] @@ -583,7 +649,8 @@ class OdinsonController @Inject() ( ) = Action.async { Future { // FIXME: do this in a non-blocking way - ExtractorEngine.usingEngine(config) { engine => + //ExtractorEngine.usingEngine(config) { engine => + val engine = ExtractorEngine.fromConfig(config) try { val odinsonDocId = engine.getOdinsonDocId(sentenceId) val doc: OdinsonDocument = @@ -597,8 +664,10 @@ class OdinsonController @Inject() ( ) case _: Throwable => BadRequest(s"sentenceId '${sentenceId}' not found") + } finally { + engine.close() } - } + //} } } diff --git a/conf/routes b/conf/routes index 84a2ca9..3746455 100644 --- a/conf/routes +++ b/conf/routes @@ -20,7 +20,10 @@ GET /api controllers.OpenApiController.openAPI GET /api/execute/pattern controllers.OdinsonController.runQuery(odinsonQuery: String, metadataQuery: Option[String], label: Option[String], commit: Option[Boolean], prevDoc: Option[Int], prevScore: Option[Float], enriched: Boolean = false, pretty: Option[Boolean]) + nocsrf -POST /api/execute/grammar controllers.OdinsonController.executeGrammar() +POST /api/execute/disjunction-of-patterns controllers.OdinsonController.runDisjunctiveQuery() + ++ nocsrf +POST /api/execute/grammar controllers.OdinsonController.executeGrammar(maxDocs: Option[Int], allowTriggerOverlaps: Option[Boolean], metadataQuery: Option[String], label: Option[String], pretty: Option[Boolean]) # document json + nocsrf diff --git a/public/schema/odinson.yaml b/public/schema/odinson.yaml index 4adf9c8..103ed91 100644 --- a/public/schema/odinson.yaml +++ b/public/schema/odinson.yaml @@ -112,6 +112,39 @@ paths: schema: $ref: '#/components/schemas/QueryError' + /api/execute/disjunction-of-patterns: + post: + tags: + - search + summary: | + Composes a disjunction of Odinson queries (ex. A OR B OR C) and executes it against the corpus. + description: | + Composes a disjunction of Odinson queries (ex. 
A OR B OR C) and executes it against the corpus. Optionally include a doc-level Lucene query to identify a subset of documents to which the query should be applied.
+      operationId: execute-disjunctive-pattern
+      requestBody:
+        description: |
+          A disjunction of patterns.
+        required: true
+        content:
+          "application/json":
+            schema:
+              $ref: '#/components/schemas/SimplePatternsRequest'
+      responses:
+        '200':
+          description: Paginated matches for the query.
+          content:
+            "application/json":
+              schema:
+                type: array
+                items:
+                  $ref: '#/components/schemas/BasicResults'
+        '400':
+          description: Syntax error in query.
+          content:
+            "application/json":
+              schema:
+                $ref: '#/components/schemas/QueryError'
+
   /api/execute/grammar:
     post:
       tags:
@@ -127,12 +160,75 @@ paths:
           An Odinson grammar.
         required: true
         content:
-          "application/json":
+          "text/plain":
             schema:
-              $ref: '#/components/schemas/OdinsonGrammarRequest'
-#      consumes:
-#        - application/json
+              type: string
+              example: |
+                vars:
+                  chunk: "([tag=/J.*/]{,3} [tag=/N.*/]+ (of [tag=DT]? [tag=/J.*/]{,3} [tag=/N.*/]+)?)"
+
+                rules:
+                  - name: xp
+                    label: XP
+                    type: basic
+                    priority: 1
+                    pattern: |
+                      [chunk=/.-NP/]+|[chunk=/.-ADVP/]+|[chunk=/.-VP/]+
+
+                  - name: xp-seq
+                    label: test
+                    type: basic
+                    priority: 2
+                    pattern: |
+                      @XP @XP
+
+                  - name: example-basic-rule
+                    type: basic
+                    priority: 1
+                    pattern: |
+                      (?<x> ${chunk}) >nmod_such_as (?<y> ${chunk})
+
+                  - name: example-event-rule
+                    type: event
+                    priority: 1
+                    pattern: |
+                      trigger = cause|increase|decrease|affect
+                      cause = >nsubj ${chunk}
+                      effect = >dobj ${chunk}
+
+      parameters:
+        - name: maxDocs
+          in: query
+          description: |
+            The maximum number of sentences to execute the rules against.
+          schema:
+            type: integer
+            format: int32
+            example: 10
+        - name: allowTriggerOverlaps
+          in: query
+          description: |
+            Whether or not event arguments are permitted to overlap with the event's trigger. Defaults to false.
+          schema:
+            type: boolean
+            default: false
+            example: false
+        - name: metadataQuery
+          in: query
+          required: false
+          schema:
+            type: string
+            description: |
+              A query to filter Documents by their metadata before applying an Odinson grammar.
+            example: "character contains 'Special Agent'"
+        - name: label
+          in: query
+          required: false
+          schema:
+            type: string
+            description: |
+              Only return mentions matching the label (if provided).
+            #example: "character contains 'Special Agent'"
       responses:
         '200':
           description: Mentions matched by the grammar.
@@ -1081,6 +1177,49 @@ components:
           roots: [ 2 ]
 
   schemas:
+    SimplePatternsRequest:
+      type: object
+      required:
+        - patterns
+      properties:
+        patterns:
+          type: array
+          description: A list of queries.
+          items:
+            type: string
+            description: A single Odinson query.
+        metadataQuery:
+          $ref: '#/components/schemas/MetadataQuery'
+        label:
+          description: |
+            The label to use when committing mentions to the State.
+          schema:
+            type: string
+        commit:
+          description: |
+            Whether or not the results of this query should be committed to the State.
+          schema:
+            type: boolean
+        prevDoc:
+          description: |
+            The ID (`sentenceId`) for the last document (sentence) seen in the previous page of results.
+          required: false
+          schema:
+            type: integer
+            format: int32
+            # minimum: 1
+            # exclusiveMinimum: false
+            # maximum: 3
+            # exclusiveMaximum: false
+            #example: 1
+        prevScore:
+          description: |
+            The score for the last result seen in the previous page of results.
+          required: false
+          schema:
+            type: number
+            format: float
+            #example: 0.424
 
     OdinsonGrammarRequest:
       type: object
diff --git a/python/lum/odinson/doc.py b/python/lum/odinson/doc.py
index 600eed6..f08708a 100644
--- a/python/lum/odinson/doc.py
+++ b/python/lum/odinson/doc.py
@@ -30,6 +30,7 @@ class Field(BaseModel):
     type: Fields = pydantic.Field(alias="$type", default="ai.lum.odinson.Field")
     model_config = ConfigDict(use_enum_values=True, validate_default=True)
 
+
 class TokensField(Field):
     tokens: Tokens
     type: Literal[Fields.TOKENS_FIELD] = pydantic.Field(
@@ -196,11 +197,11 @@ def json(self, **kwargs):
     def copy(self, fields: List[AnyField]) -> "Sentence":
         """Convenience method for easily copying an Odinson Sentence and replacing specific attributes"""
         return Sentence(
-        # validate and count tokens
-        numTokens=Sentence._count_tokens(fields),
-        fields=fields
+            # validate and count tokens
+            numTokens=Sentence._count_tokens(fields),
+            fields=fields,
         )
-    
+
     @staticmethod
     def validate_fields(fields: List[AnyField]) -> bool:
         # validation
@@ -210,26 +211,28 @@ def validate_fields(fields: List[AnyField]) -> bool:
             num_tokens.add(len(f.tokens))
         # NOTE: this will also fail if no TokensField are present
         if len(num_tokens) != 1:
-            raise Exception(f"All TokensField for sentence should have same length, but found {len(num_tokens)}")
+            raise Exception(
+                f"All TokensField for sentence should have same length, but found {len(num_tokens)}"
+            )
         return True
-    
+
     @staticmethod
     def _count_tokens(fields: List[AnyField]) -> int:
-       """Get count of tokens based on TokensField after first validating with Sentence.validate_fields"""
-       _ = Sentence.validate_fields(fields)
-       for f in fields:
-           if isinstance(f, TokensField):
-               return len(f.tokens)
-
+        """Get count of tokens based on TokensField after first validating with Sentence.validate_fields"""
+        _ = Sentence.validate_fields(fields)
+        for f in fields:
+            if isinstance(f, TokensField):
+                return len(f.tokens)
+
     @staticmethod
     def from_fields(fields: List[AnyField]) -> "Sentence":
         """Create an Odinson Sentence from a collection of fields"""
         return Sentence(
-        # validate and count
-        numTokens=Sentence._count_tokens(fields),
-        fields=fields
+            # validate and count
+            numTokens=Sentence._count_tokens(fields),
+            fields=fields,
         )
-    
+
     @staticmethod
     def from_tokens(tokens: List[Token]) -> "Sentence":
         """Create an Odinson Sentence from a collection of Tokens"""
@@ -239,11 +242,11 @@ def from_tokens(tokens: List[Token]) -> "Sentence":
             value = fields_dict.get(k, [])
             value.append(v)
             fields_dict[k] = value
-        num_tokens = list({len(values) for values in fields_dict.items()})
+        num_tokens = list({len(values) for values in fields_dict.values()})
         assert len(num_tokens) == 1, "All token attributes must have the same length"
         fields = [TokensField(name=k, tokens=toks) for k, toks in fields_dict.items()]
         return Sentence(numTokens=num_tokens[0], fields=fields)
-    
+
 
 class Document(BaseModel):
     """ai.lum.odinson.Document"""
@@ -304,10 +307,15 @@ def from_file(fp: Text) -> Document:
         with open(fp, "r") as f:
             return Document(**json.loads(f.read()))
 
-    def copy(self, id: Optional[str] = None, metadata: Optional[List[AnyField]] = None, sentences: Optional[List[Sentence]] = None) -> "Document":
+    def copy(
+        self,
+        id: Optional[str] = None,
+        metadata: Optional[List[AnyField]] = None,
+        sentences: Optional[List[Sentence]] = None,
+    ) -> "Document":
        """Convenience method for easily copying an Odinson Document and replacing specific attributes"""
        return Document(
-            id=id or self.id,
-            metadata=metadata or 
self.metadata,
-            sentences=sentences or self.sentences
-        )
\ No newline at end of file
+            id=id or self.id,
+            metadata=metadata or self.metadata,
+            sentences=sentences or self.sentences,
+        )
diff --git a/python/lum/odinson/rest/api.py b/python/lum/odinson/rest/api.py
index 0bcafa5..b00beb4 100644
--- a/python/lum/odinson/rest/api.py
+++ b/python/lum/odinson/rest/api.py
@@ -1,8 +1,15 @@
 from __future__ import annotations
 from typing import Any, Dict, Iterator, List, Literal, Optional, Text, Union
 from lum.odinson.doc import AnyField, Document, Sentence
-from lum.odinson.rest.responses import CorpusInfo, OdinsonErrors, ScoreDoc, Statistic, GrammarResults, Results
-from lum.odinson.rest.requests import GrammarRequest
+from lum.odinson.rest.responses import (
+    CorpusInfo,
+    OdinsonErrors,
+    ScoreDoc,
+    Statistic,
+    GrammarResults,
+    Results,
+)
+from lum.odinson.rest.requests import GrammarRequest, SimplePatternsRequest
 from pydantic import BaseModel
 from dataclasses import dataclass
 import pydantic
@@ -16,7 +23,6 @@
 
 
 class OdinsonBaseAPI:
-
     def __init__(self, address: Text):
         self.address = address
 
@@ -110,14 +116,19 @@ def _post_doc(
         )
 
     def _post_text(
-        self, endpoint: str, text: str, headers: Optional[Dict[str, str]] = None
+        self,
+        endpoint: str,
+        text: str,
+        params: Optional[Dict[str, Union[str, int]]] = None,
+        headers: Optional[Dict[str, str]] = None
     ) -> requests.Response:
         return requests.post(
             endpoint,
             # NOTE: data takes str & .json() returns json str
-            #json=text,
+            # json=text,
             data=text,
-            headers=headers
+            params=params,
+            headers=headers,
         )
 
     def validate_document(self, doc: Document, strict: bool = True) -> bool:
@@ -146,7 +157,7 @@
     ) -> Union[bool, OdinsonErrors]:
         """Inspects and validates an Odinson grammar"""
         endpoint = f"{self.address}/api/validate/grammar"
-        res = self._post_text(endpoint=endpoint, contents=grammar)
+        res = self._post_text(endpoint=endpoint, text=grammar)
         if res.status_code == 200:
             return OdinsonBaseAPI.status_code_to_bool(res.status_code)
         else:
@@ -261,20 +272,16 @@ def execute_grammar(
         # A query to filter Documents by their metadata before applying an Odinson pattern.
         metadata_query: Optional[str] = None,
         max_docs: Optional[int] = 20,
-        allow_trigger_overlaps: bool = False
+        allow_trigger_overlaps: bool = False,
     ):
         endpoint = f"{self.address}/api/execute/grammar"
-        gr = GrammarRequest(
-            grammar=grammar,
-            metadataQuery=metadata_query,
-            maxDocs=max_docs,
-            allowTriggerOverlaps=allow_trigger_overlaps
-        )
-        res = requests.post(
-            endpoint,
-            json=gr.dict()
-        )
-        #return GrammarResults.empty() if res.status_code != 200 else GrammarResults(**res.json())
+        params = {
+            "metadataQuery": metadata_query,
+            "maxDocs": max_docs,
+            "allowTriggerOverlaps": allow_trigger_overlaps
+        }
+        res = self._post_text(endpoint=endpoint, text=grammar, params=params)
+        # return GrammarResults.empty() if res.status_code != 200 else GrammarResults(**res.json())
        # FIXME: check status code and return error or empty results?
return GrammarResults(**res.json())
 
@@ -296,14 +303,6 @@ def search(
         prev_score: Optional[float] = None,
     ) -> Iterator[ScoreDoc]:
         endpoint = f"{self.address}/api/execute/pattern"
-        params = {
-            "odinsonQuery": odinson_query,
-            "metadataQuery": metadata_query,
-            "label": label,
-            "commit": commit,
-            "prevDoc": prev_doc,
-            "prevScore": prev_score,
-        }
         seen = 0
         results: Results = self._search(
             odinson_query=odinson_query,
@@ -335,6 +334,60 @@
             )
             # print(f"total_hits:\t{results.total_hits}")
 
+    def search_disjunction_of_patterns(
+        self,
+        # A list of Odinson patterns.
+        # Example: ["[lemma=pie] []", "[lemma=blarg]"]
+        patterns: list[str],
+        # A query to filter Documents by their metadata before applying an Odinson pattern.
+        metadata_query: Optional[str] = None,
+        # The label to use when committing mentions to the State.
+        # Example: character contains 'Special Agent'
+        label: Optional[str] = None,
+        # The ID (sentenceId) for the last document (sentence) seen in the previous page of results.
+        prev_doc: Optional[int] = None,
+        # The score for the last result seen in the previous page of results.
+        prev_score: Optional[float] = None,
+    ) -> Iterator[ScoreDoc]:
+        endpoint = f"{self.address}/api/execute/disjunction-of-patterns"
+
+        spr = SimplePatternsRequest(
+            patterns=patterns,
+            metadataQuery=metadata_query,
+            prevDoc=prev_doc,
+            prevScore=prev_score,
+        )
+        results: Results = Results(**requests.post(endpoint, json=spr.dict()).json())
+
+        seen = 0
+        total = results.total_hits
+        if total == 0:
+            return iter(())
+        last = results.score_docs[-1]
+        while seen < total:
+            for sd in results.score_docs:
+                seen += 1
+                last = sd
+                # print(f"{seen-1}/{total}")
+                # print(f"sd.document_id:\t{sd.document_id}")
+                # print(f"sd.sentence_id:\t{sd.sentence_id}\n")
+                # FIXME: should this be a Results() with a single doc?
+                yield sd
+            # paginate
+            # NOTE: kwargs must use the SimplePatternsRequest field names
+            nspr = SimplePatternsRequest(
+                patterns=patterns,
+                metadataQuery=metadata_query,
+                prevDoc=last.sentence_id,
+            )
+            results: Results = Results(
+                **requests.post(
+                    endpoint,
+                    json=nspr.dict(),
+                ).json()
+            )
+            # print(f"total_hits:\t{results.total_hits}")
+
     # TODO: add rewrite method
     # for any token that matches the pattern, replace its entry in field with
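For reference, a minimal usage sketch of the two entry points this patch reworks. The server address, grammar, and patterns below are illustrative assumptions, not values taken from the patch:

```python
from lum.odinson.rest.api import OdinsonBaseAPI

# Assumption: an Odinson REST service is running locally (address is illustrative).
api = OdinsonBaseAPI(address="http://localhost:9000")

# POST /api/execute/grammar: the YAML grammar itself is sent as the text body,
# while options such as maxDocs travel as query params.
grammar = """
rules:
  - name: example-basic-rule
    label: Dessert
    type: basic
    priority: 1
    pattern: |
      [lemma=pie] []
"""
grammar_results = api.execute_grammar(grammar, max_docs=10)

# POST /api/execute/disjunction-of-patterns: compose A OR B and page through
# the matching ScoreDocs.
for sd in api.search_disjunction_of_patterns(
    patterns=["[lemma=pie] []", "[lemma=cake]"]
):
    print(sd.document_id, sd.sentence_id)
```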