Skip to content

Commit

Permalink
Provide spans of named entities (#464)
Browse files Browse the repository at this point in the history
Where possible (i.e., an extracted named entity is explicitly mentioned
in the input text), its corresponding span will be provided in the list
of named entities, a la:
```yaml
named_entities:
  - id: AUTO:erosion
    label: erosion
    original_spans:
      - 29:35
  - id: SNOMEDCT:263913002
    label: ulceration
    original_spans:
      - 44:53
 ```
These will *not* be included if the named entity isn't present in the input text. This may happen if the prompt instructs the LLM to transform the raw response in some way. For example, if the raw text contains "ulceration" but the extraction instructions cause the LLM to represent it as "TRUE" (as in "yes, there is an ulceration"), then the span details won't appear.
  • Loading branch information
caufieldjh authored Oct 10, 2024
2 parents ae1fb64 + c919e68 commit c1a0427
Show file tree
Hide file tree
Showing 4 changed files with 256 additions and 77 deletions.
25 changes: 24 additions & 1 deletion src/ontogpt/engines/spires_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
chunk_text_by_sentence,
)
from ontogpt.io.yaml_wrapper import dump_minimal_yaml
from ontogpt.templates.core import ExtractionResult
from ontogpt.templates.core import ExtractionResult, NamedEntity
from ontogpt.utils.parse_utils import get_span_values

this_path = Path(__file__).parent

Expand Down Expand Up @@ -122,6 +123,11 @@ def extract_from_text(
raw_text, cls, object=object # type: ignore
)

# Add spans to entities
self.extracted_named_entities = self.get_spans(
input_text=text, named_entities=self.extracted_named_entities
)

return ExtractionResult(
input_text=text,
raw_completion_output=raw_text,
Expand Down Expand Up @@ -765,3 +771,20 @@ def ground_annotation_object(
logging.info(new_ann)
py_cls = self.template_module.__dict__[cls.name]
return py_cls(**new_ann)

def get_spans(self, input_text: str, named_entities: list[NamedEntity]) -> list[NamedEntity]:
    """
    Attach original-text spans to each named entity.

    Each entity's ``original_spans`` is set to the result of
    ``get_span_values(input_text, ne.label)``. The entities are updated in
    place and returned, in their original order, in a new list. Entities
    without a label (``None`` or empty) are returned unchanged, since there
    is no text to look up.

    :param input_text: The full input text for the overall extraction
    :param named_entities: The named entities to be updated
    :return: The same named entities, with spans assigned where a label exists
    """
    for ne in named_entities:
        # label is Optional on NamedEntity; skip the lookup when it is missing
        if not ne.label:
            continue
        ne.original_spans = get_span_values(input_text, ne.label)

    # Hand back a fresh list object rather than the caller's own list
    return list(named_entities)
264 changes: 188 additions & 76 deletions src/ontogpt/templates/core.py
Original file line number Diff line number Diff line change
@@ -1,123 +1,235 @@
from __future__ import annotations
from datetime import datetime, date
from enum import Enum

from typing import List, Dict, Optional, Any, Union
from pydantic import BaseModel as BaseModel, ConfigDict, Field, field_validator
from __future__ import annotations
from datetime import (
datetime,
date,
time
)
from decimal import Decimal
from enum import Enum
import re
import sys
if sys.version_info >= (3, 8):
from typing import Literal
else:
from typing_extensions import Literal


from typing import (
Any,
ClassVar,
List,
Literal,
Dict,
Optional,
Union
)
from pydantic import (
BaseModel,
ConfigDict,
Field,
RootModel,
field_validator
)
metamodel_version = "None"
version = "None"


class ConfiguredBaseModel(BaseModel):
model_config = ConfigDict(
validate_assignment=True,
validate_default=True,
extra = 'forbid',
arbitrary_types_allowed=True,
use_enum_values = True)
validate_assignment = True,
validate_default = True,
extra = "forbid",
arbitrary_types_allowed = True,
use_enum_values = True,
strict = False,
)
pass




class LinkMLMeta(RootModel):
    """
    Dict-backed root model used to hold LinkML metadata.

    The wrapped dictionary lives in ``root``. Item lookup, item assignment
    and ``in`` checks are forwarded to that dictionary, and attribute
    lookups that are not resolved normally are forwarded to it as well.
    The model is configured with ``frozen=True``.
    """
    root: Dict[str, Any] = {}
    model_config = ConfigDict(frozen=True)

    def __getattr__(self, key:str):
        # Only reached when normal attribute lookup fails; delegates to the
        # wrapped dict object (i.e. its attributes/methods, not its keys).
        return getattr(self.root, key)

    def __getitem__(self, key:str):
        # Key lookup in the wrapped dict; raises KeyError for a missing key.
        return self.root[key]

    def __setitem__(self, key:str, value):
        # Writes into the wrapped dict in place; ``root`` is not reassigned.
        self.root[key] = value

    def __contains__(self, key:str) -> bool:
        # Membership test against the wrapped dict's keys.
        return key in self.root


linkml_meta = LinkMLMeta({'default_prefix': 'core',
'default_range': 'string',
'description': 'Core upper level',
'id': 'http://w3id.org/ontogpt/core',
'imports': ['linkml:types'],
'license': 'https://creativecommons.org/publicdomain/zero/1.0/',
'name': 'core',
'prefixes': {'NCIT': {'prefix_prefix': 'NCIT',
'prefix_reference': 'http://purl.obolibrary.org/obo/NCIT_'},
'RO': {'prefix_prefix': 'RO',
'prefix_reference': 'http://purl.obolibrary.org/obo/RO_'},
'biolink': {'prefix_prefix': 'biolink',
'prefix_reference': 'https://w3id.org/biolink/vocab/'},
'core': {'prefix_prefix': 'core',
'prefix_reference': 'http://w3id.org/ontogpt/core/'},
'linkml': {'prefix_prefix': 'linkml',
'prefix_reference': 'https://w3id.org/linkml/'},
'rdf': {'prefix_prefix': 'rdf',
'prefix_reference': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'},
'rdfs': {'prefix_prefix': 'rdfs',
'prefix_reference': 'http://www.w3.org/2000/01/rdf-schema#'}},
'source_file': 'src/ontogpt/templates/core.yaml',
'title': 'AI core Template'} )

class NullDataOptions(str, Enum):
    """
    String-valued enumeration; each member's value is identical to its name.

    NOTE(review): presumably these are the placeholder values used when a
    slot has no usable data -- confirm against core.yaml.
    """


    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"

    NOT_APPLICABLE = "NOT_APPLICABLE"

    NOT_MENTIONED = "NOT_MENTIONED"



class ExtractionResult(ConfiguredBaseModel):
"""
A result of extracting knowledge on text
"""
input_id: Optional[str] = Field(None)
input_title: Optional[str] = Field(None)
input_text: Optional[str] = Field(None)
raw_completion_output: Optional[str] = Field(None)
prompt: Optional[str] = Field(None)
extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""")
named_entities: Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""")


linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})

input_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_id', 'domain_of': ['ExtractionResult']} })
input_title: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_title', 'domain_of': ['ExtractionResult']} })
input_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_text', 'domain_of': ['ExtractionResult']} })
raw_completion_output: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'raw_completion_output', 'domain_of': ['ExtractionResult']} })
prompt: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'prompt', 'domain_of': ['ExtractionResult']} })
extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'extracted_object', 'domain_of': ['ExtractionResult']} })
named_entities: Optional[List[Any]] = Field(None, description="""Named entities extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'named_entities', 'domain_of': ['ExtractionResult']} })


class NamedEntity(ConfiguredBaseModel):

id: str = Field(..., description="""A unique identifier for the named entity""")
label: Optional[str] = Field(None, description="""The label (name) of the named thing""")


linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'})

id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'comments': ['this is populated during the grounding and normalization step'],
'domain_of': ['NamedEntity', 'Publication']} })
label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label',
'aliases': ['name'],
'annotations': {'owl': {'tag': 'owl',
'value': 'AnnotationProperty, AnnotationAssertion'}},
'domain_of': ['NamedEntity'],
'slot_uri': 'rdfs:label'} })
original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'comments': ['This is determined during grounding and normalization',
'But is based on the full input text'],
'domain_of': ['NamedEntity']} })

@field_validator('original_spans')
def pattern_original_spans(cls, v):
    """
    Validate the format of ``original_spans``.

    A list has each element checked in order; a single string is checked
    directly. Any other value (such as ``None``) is passed through without
    checks.

    :param v: The value assigned to ``original_spans``
    :return: ``v`` unchanged
    :raises ValueError: on the first entry that does not match ``^\\d+:\\d+$``
    """
    span_format = re.compile(r"^\d+:\d+$")

    # Normalise the two checked shapes into one sequence of candidates
    if isinstance(v, list):
        candidates = v
    elif isinstance(v, str):
        candidates = [v]
    else:
        candidates = []

    for candidate in candidates:
        if span_format.match(candidate) is None:
            raise ValueError(f"Invalid original_spans format: {candidate}")

    return v


class CompoundExpression(ConfiguredBaseModel):

None

linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'})

pass


class Triple(CompoundExpression):
"""
Abstract parent for Relation Extraction tasks
"""
subject: Optional[str] = Field(None)
predicate: Optional[str] = Field(None)
object: Optional[str] = Field(None)
qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""")
subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""")
object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""")


linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'})

subject: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject', 'domain_of': ['Triple']} })
predicate: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'predicate', 'domain_of': ['Triple']} })
object: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object', 'domain_of': ['Triple']} })
qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""", json_schema_extra = { "linkml_meta": {'alias': 'qualifier', 'domain_of': ['Triple']} })
subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""", json_schema_extra = { "linkml_meta": {'alias': 'subject_qualifier', 'domain_of': ['Triple']} })
object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""", json_schema_extra = { "linkml_meta": {'alias': 'object_qualifier', 'domain_of': ['Triple']} })


class TextWithTriples(ConfiguredBaseModel):
"""
A text containing one or more relations of the Triple type.
"""
publication: Optional[Publication] = Field(None)
triples: Optional[List[Triple]] = Field(default_factory=list)


linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})

publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'domain_of': ['TextWithTriples', 'TextWithEntity']} })
triples: Optional[List[Triple]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'triples', 'domain_of': ['TextWithTriples']} })


class TextWithEntity(ConfiguredBaseModel):
"""
A text containing one or more instances of a single type of entity.
"""
publication: Optional[Publication] = Field(None)
entities: Optional[List[str]] = Field(default_factory=list)


linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})

publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'domain_of': ['TextWithTriples', 'TextWithEntity']} })
entities: Optional[List[str]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'entities', 'domain_of': ['TextWithEntity']} })


class RelationshipType(NamedEntity):

id: str = Field(..., description="""A unique identifier for the named entity""")
label: Optional[str] = Field(None, description="""The label (name) of the named thing""")


linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core',
'id_prefixes': ['RO', 'biolink']})

id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'comments': ['this is populated during the grounding and normalization step'],
'domain_of': ['NamedEntity', 'Publication']} })
label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label',
'aliases': ['name'],
'annotations': {'owl': {'tag': 'owl',
'value': 'AnnotationProperty, AnnotationAssertion'}},
'domain_of': ['NamedEntity'],
'slot_uri': 'rdfs:label'} })
original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'comments': ['This is determined during grounding and normalization',
'But is based on the full input text'],
'domain_of': ['NamedEntity']} })

@field_validator('original_spans')
def pattern_original_spans(cls, v):
    """
    Validate the format of ``original_spans``.

    A list has each element checked in order; a single string is checked
    directly. Any other value (such as ``None``) is passed through without
    checks.

    :param v: The value assigned to ``original_spans``
    :return: ``v`` unchanged
    :raises ValueError: on the first entry that does not match ``^\\d+:\\d+$``
    """
    span_format = re.compile(r"^\d+:\d+$")

    # Normalise the two checked shapes into one sequence of candidates
    if isinstance(v, list):
        candidates = v
    elif isinstance(v, str):
        candidates = [v]
    else:
        candidates = []

    for candidate in candidates:
        if span_format.match(candidate) is None:
            raise ValueError(f"Invalid original_spans format: {candidate}")

    return v


class Publication(ConfiguredBaseModel):

id: Optional[str] = Field(None, description="""The publication identifier""")
title: Optional[str] = Field(None, description="""The title of the publication""")
abstract: Optional[str] = Field(None, description="""The abstract of the publication""")
combined_text: Optional[str] = Field(None)
full_text: Optional[str] = Field(None, description="""The full text of the publication""")

linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})

id: Optional[str] = Field(None, description="""The publication identifier""", json_schema_extra = { "linkml_meta": {'alias': 'id', 'domain_of': ['NamedEntity', 'Publication']} })
title: Optional[str] = Field(None, description="""The title of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'title', 'domain_of': ['Publication']} })
abstract: Optional[str] = Field(None, description="""The abstract of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'abstract', 'domain_of': ['Publication']} })
combined_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'combined_text', 'domain_of': ['Publication']} })
full_text: Optional[str] = Field(None, description="""The full text of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'full_text', 'domain_of': ['Publication']} })


class AnnotatorResult(ConfiguredBaseModel):

subject_text: Optional[str] = Field(None)
object_id: Optional[str] = Field(None)
object_text: Optional[str] = Field(None)


linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})

subject_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject_text', 'domain_of': ['AnnotatorResult']} })
object_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_id', 'domain_of': ['AnnotatorResult']} })
object_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_text', 'domain_of': ['AnnotatorResult']} })


# Model rebuild
Expand Down
16 changes: 16 additions & 0 deletions src/ontogpt/templates/core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,22 @@ classes:
slot_uri: rdfs:label
annotations:
owl: AnnotationProperty, AnnotationAssertion
original_spans:
description: >-
The coordinates of the original text span from which the named entity
was extracted, inclusive. For example, "10:25" means the span starting
from the 10th character and ending with the 25th character. The first
character in the text has index 0. Newlines are treated as single
characters. Multivalued as there may be multiple spans for a single
text.
comments:
- This is determined during grounding and normalization
- But is based on the full input text
range: string
multivalued: true
pattern: "^\\d+:\\d+$"
annotations:
prompt.skip: "true"

CompoundExpression:
abstract: true
Expand Down
Loading

0 comments on commit c1a0427

Please sign in to comment.