Provide spans of named entities (#464)

Where possible (i.e., when an extracted named entity is explicitly mentioned in the input text), its corresponding span will be provided in the list of named entities, a la:

```yaml
named_entities:
  - id: AUTO:erosion
    label: erosion
    original_spans:
      - 29:35
  - id: SNOMEDCT:263913002
    label: ulceration
    original_spans:
      - 44:53
```

These will *not* be included if the named entity isn't present in the input text. This may happen if the prompt instructs the raw response to be transformed in some way (e.g., the raw text contains "ulceration" but the extraction instructions cause the LLM to represent it as "TRUE", as in "yes there is an ulceration"; in that case the span details won't appear).
monarch-initiative · Oct 10, 2024 · c1a0427 · c1a0427
2 parents ae1fb64 + c919e68
commit c1a0427
Show file tree

Hide file tree

Showing 4 changed files with 256 additions and 77 deletions.
diff --git a/src/ontogpt/engines/spires_engine.py b/src/ontogpt/engines/spires_engine.py
@@ -33,7 +33,8 @@
     chunk_text_by_sentence,
 )
 from ontogpt.io.yaml_wrapper import dump_minimal_yaml
-from ontogpt.templates.core import ExtractionResult
+from ontogpt.templates.core import ExtractionResult, NamedEntity
+from ontogpt.utils.parse_utils import get_span_values
 
 this_path = Path(__file__).parent
 
@@ -122,6 +123,11 @@ def extract_from_text(
                 raw_text, cls, object=object  # type: ignore
             )
 
+        # Add spans to entities
+        self.extracted_named_entities = self.get_spans(
+            input_text=text, named_entities=self.extracted_named_entities
+        )
+
         return ExtractionResult(
             input_text=text,
             raw_completion_output=raw_text,
@@ -765,3 +771,20 @@ def ground_annotation_object(
         logging.info(new_ann)
         py_cls = self.template_module.__dict__[cls.name]
         return py_cls(**new_ann)
+
+    def get_spans(self, input_text: str, named_entities: list[NamedEntity]) -> list[NamedEntity]:
+        """
+        Get the spans for the named entities in the input text.
+
+        :param input_text: The full input text for the overall extraction
+        :param named_entities: The named entities to be updated (in place) with spans
+        :return: The same named entity objects, each with original_spans assigned
+        """
+
+        named_entities_with_spans = []
+
+        for ne in named_entities:
+            ne.original_spans = get_span_values(input_text, ne.label)
+            named_entities_with_spans.append(ne)
+
+        return named_entities_with_spans
diff --git a/src/ontogpt/templates/core.py b/src/ontogpt/templates/core.py
@@ -1,123 +1,235 @@
-from __future__ import annotations
-from datetime import datetime, date
-from enum import Enum
-
-from typing import List, Dict, Optional, Any, Union
-from pydantic import BaseModel as BaseModel, ConfigDict,  Field, field_validator
+from __future__ import annotations 
+from datetime import (
+    datetime,
+    date,
+    time
+)
+from decimal import Decimal 
+from enum import Enum 
 import re
 import sys
-if sys.version_info >= (3, 8):
-    from typing import Literal
-else:
-    from typing_extensions import Literal
-
-
+from typing import (
+    Any,
+    ClassVar,
+    List,
+    Literal,
+    Dict,
+    Optional,
+    Union
+)
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    RootModel,
+    field_validator
+)
 metamodel_version = "None"
 version = "None"
 
+
 class ConfiguredBaseModel(BaseModel):
     model_config = ConfigDict(
-        validate_assignment=True,
-        validate_default=True,
-        extra = 'forbid',
-        arbitrary_types_allowed=True,
-        use_enum_values = True)
+        validate_assignment = True,
+        validate_default = True,
+        extra = "forbid",
+        arbitrary_types_allowed = True,
+        use_enum_values = True,
+        strict = False,
+    )
     pass
 
 
+
+
+class LinkMLMeta(RootModel):
+    root: Dict[str, Any] = {}
+    model_config = ConfigDict(frozen=True)
+
+    def __getattr__(self, key:str):
+        return getattr(self.root, key)
+
+    def __getitem__(self, key:str):
+        return self.root[key]
+
+    def __setitem__(self, key:str, value):
+        self.root[key] = value
+
+    def __contains__(self, key:str) -> bool:
+        return key in self.root
+
+
+linkml_meta = LinkMLMeta({'default_prefix': 'core',
+     'default_range': 'string',
+     'description': 'Core upper level',
+     'id': 'http://w3id.org/ontogpt/core',
+     'imports': ['linkml:types'],
+     'license': 'https://creativecommons.org/publicdomain/zero/1.0/',
+     'name': 'core',
+     'prefixes': {'NCIT': {'prefix_prefix': 'NCIT',
+                           'prefix_reference': 'http://purl.obolibrary.org/obo/NCIT_'},
+                  'RO': {'prefix_prefix': 'RO',
+                         'prefix_reference': 'http://purl.obolibrary.org/obo/RO_'},
+                  'biolink': {'prefix_prefix': 'biolink',
+                              'prefix_reference': 'https://w3id.org/biolink/vocab/'},
+                  'core': {'prefix_prefix': 'core',
+                           'prefix_reference': 'http://w3id.org/ontogpt/core/'},
+                  'linkml': {'prefix_prefix': 'linkml',
+                             'prefix_reference': 'https://w3id.org/linkml/'},
+                  'rdf': {'prefix_prefix': 'rdf',
+                          'prefix_reference': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'},
+                  'rdfs': {'prefix_prefix': 'rdfs',
+                           'prefix_reference': 'http://www.w3.org/2000/01/rdf-schema#'}},
+     'source_file': 'src/ontogpt/templates/core.yaml',
+     'title': 'AI core Template'} )
+
 class NullDataOptions(str, Enum):
-
-
     UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
-
     NOT_APPLICABLE = "NOT_APPLICABLE"
-
     NOT_MENTIONED = "NOT_MENTIONED"
-    
-    
+
+
 
 class ExtractionResult(ConfiguredBaseModel):
     """
     A result of extracting knowledge on text
     """
-    input_id: Optional[str] = Field(None)
-    input_title: Optional[str] = Field(None)
-    input_text: Optional[str] = Field(None)
-    raw_completion_output: Optional[str] = Field(None)
-    prompt: Optional[str] = Field(None)
-    extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""")
-    named_entities: Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""")
-
-
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    input_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_id', 'domain_of': ['ExtractionResult']} })
+    input_title: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_title', 'domain_of': ['ExtractionResult']} })
+    input_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_text', 'domain_of': ['ExtractionResult']} })
+    raw_completion_output: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'raw_completion_output', 'domain_of': ['ExtractionResult']} })
+    prompt: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'prompt', 'domain_of': ['ExtractionResult']} })
+    extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'extracted_object', 'domain_of': ['ExtractionResult']} })
+    named_entities: Optional[List[Any]] = Field(None, description="""Named entities extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'named_entities', 'domain_of': ['ExtractionResult']} })
+
 
 class NamedEntity(ConfiguredBaseModel):
-
-    id: str = Field(..., description="""A unique identifier for the named entity""")
-    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
-
-
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'comments': ['this is populated during the grounding and normalization step'],
+         'domain_of': ['NamedEntity', 'Publication']} })
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label',
+         'aliases': ['name'],
+         'annotations': {'owl': {'tag': 'owl',
+                                 'value': 'AnnotationProperty, AnnotationAssertion'}},
+         'domain_of': ['NamedEntity'],
+         'slot_uri': 'rdfs:label'} })
+    original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'comments': ['This is determined during grounding and normalization',
+                      'But is based on the full input text'],
+         'domain_of': ['NamedEntity']} })
+
+    @field_validator('original_spans')
+    def pattern_original_spans(cls, v):
+        pattern=re.compile(r"^\d+:\d+$")
+        if isinstance(v,list):
+            for element in v:
+                if not pattern.match(element):
+                    raise ValueError(f"Invalid original_spans format: {element}")
+        elif isinstance(v,str):
+            if not pattern.match(v):
+                raise ValueError(f"Invalid original_spans format: {v}")
+        return v
+
 
 class CompoundExpression(ConfiguredBaseModel):
-
-    None
-
-    
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    pass
+
 
 class Triple(CompoundExpression):
     """
     Abstract parent for Relation Extraction tasks
     """
-    subject: Optional[str] = Field(None)
-    predicate: Optional[str] = Field(None)
-    object: Optional[str] = Field(None)
-    qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""")
-    subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""")
-    object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""")
-
-
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    subject: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject', 'domain_of': ['Triple']} })
+    predicate: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'predicate', 'domain_of': ['Triple']} })
+    object: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object', 'domain_of': ['Triple']} })
+    qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""", json_schema_extra = { "linkml_meta": {'alias': 'qualifier', 'domain_of': ['Triple']} })
+    subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""", json_schema_extra = { "linkml_meta": {'alias': 'subject_qualifier', 'domain_of': ['Triple']} })
+    object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""", json_schema_extra = { "linkml_meta": {'alias': 'object_qualifier', 'domain_of': ['Triple']} })
+
 
 class TextWithTriples(ConfiguredBaseModel):
     """
     A text containing one or more relations of the Triple type.
     """
-    publication: Optional[Publication] = Field(None)
-    triples: Optional[List[Triple]] = Field(default_factory=list)
-
-
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'domain_of': ['TextWithTriples', 'TextWithEntity']} })
+    triples: Optional[List[Triple]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'triples', 'domain_of': ['TextWithTriples']} })
+
 
 class TextWithEntity(ConfiguredBaseModel):
     """
     A text containing one or more instances of a single type of entity.
     """
-    publication: Optional[Publication] = Field(None)
-    entities: Optional[List[str]] = Field(default_factory=list)
-
-
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'domain_of': ['TextWithTriples', 'TextWithEntity']} })
+    entities: Optional[List[str]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'entities', 'domain_of': ['TextWithEntity']} })
+
 
 class RelationshipType(NamedEntity):
-
-    id: str = Field(..., description="""A unique identifier for the named entity""")
-    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
-
-
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core',
+         'id_prefixes': ['RO', 'biolink']})
+
+    id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'comments': ['this is populated during the grounding and normalization step'],
+         'domain_of': ['NamedEntity', 'Publication']} })
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label',
+         'aliases': ['name'],
+         'annotations': {'owl': {'tag': 'owl',
+                                 'value': 'AnnotationProperty, AnnotationAssertion'}},
+         'domain_of': ['NamedEntity'],
+         'slot_uri': 'rdfs:label'} })
+    original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'comments': ['This is determined during grounding and normalization',
+                      'But is based on the full input text'],
+         'domain_of': ['NamedEntity']} })
+
+    @field_validator('original_spans')
+    def pattern_original_spans(cls, v):
+        pattern=re.compile(r"^\d+:\d+$")
+        if isinstance(v,list):
+            for element in v:
+                if not pattern.match(element):
+                    raise ValueError(f"Invalid original_spans format: {element}")
+        elif isinstance(v,str):
+            if not pattern.match(v):
+                raise ValueError(f"Invalid original_spans format: {v}")
+        return v
+
 
 class Publication(ConfiguredBaseModel):
-
-    id: Optional[str] = Field(None, description="""The publication identifier""")
-    title: Optional[str] = Field(None, description="""The title of the publication""")
-    abstract: Optional[str] = Field(None, description="""The abstract of the publication""")
-    combined_text: Optional[str] = Field(None)
-    full_text: Optional[str] = Field(None, description="""The full text of the publication""")
-
-    
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    id: Optional[str] = Field(None, description="""The publication identifier""", json_schema_extra = { "linkml_meta": {'alias': 'id', 'domain_of': ['NamedEntity', 'Publication']} })
+    title: Optional[str] = Field(None, description="""The title of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'title', 'domain_of': ['Publication']} })
+    abstract: Optional[str] = Field(None, description="""The abstract of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'abstract', 'domain_of': ['Publication']} })
+    combined_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'combined_text', 'domain_of': ['Publication']} })
+    full_text: Optional[str] = Field(None, description="""The full text of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'full_text', 'domain_of': ['Publication']} })
+
 
 class AnnotatorResult(ConfiguredBaseModel):
-
-    subject_text: Optional[str] = Field(None)
-    object_id: Optional[str] = Field(None)
-    object_text: Optional[str] = Field(None)
-
-
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    subject_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject_text', 'domain_of': ['AnnotatorResult']} })
+    object_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_id', 'domain_of': ['AnnotatorResult']} })
+    object_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_text', 'domain_of': ['AnnotatorResult']} })
 
 
 # Model rebuild

diff --git a/src/ontogpt/templates/core.yaml b/src/ontogpt/templates/core.yaml
@@ -64,6 +64,22 @@ classes:
         slot_uri: rdfs:label
         annotations:
           owl: AnnotationProperty, AnnotationAssertion
+      original_spans:
+        description: >-
+          The coordinates of the original text span from which the named entity
+          was extracted, inclusive. For example, "10:25" means the span starting
+          from the 10th character and ending with the 25th character. The first
+          character in the text has index 0. Newlines are treated as single
+          characters. Multivalued as there may be multiple spans for a single
+          text.
+        comments:
+          - This is determined during grounding and normalization
+          - But is based on the full input text
+        range: string
+        multivalued: true
+        pattern: "^\\d+:\\d+$"
+        annotations:
+          prompt.skip: "true"
 
   CompoundExpression:
     abstract: true