Skip to content

Commit

Permalink
Expand option to set token/context limit (#449)
Browse files Browse the repository at this point in the history
Input text for extraction can now be chunked by character count (chunking by sentence
was already possible) to fit within a model's token limit, using the
`--max-text-length` option.
If the option is not set, the full text is used.
  • Loading branch information
caufieldjh authored Aug 29, 2024
2 parents 465fcd1 + cce2e8a commit d08f8c7
Show file tree
Hide file tree
Showing 10 changed files with 165 additions and 103 deletions.
2 changes: 2 additions & 0 deletions docs/custom.md
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,8 @@ The output of this is then passed through further SPIRES iterations.

LLMs have context sizes limiting the combined length of their inputs and outputs. The `gpt-3.5-turbo` model, for example, has a 4,096 token limit (prompt + completion), while the `gpt-3.5-turbo-16k` model has a larger context of 16,384 tokens.

To see the token limit for each model, use `ontogpt list-models`. The Max Tokens value will be in the fourth column.

## Advanced functionality with linkml-owl

A LinkML schema used in OntoGPT may include annotations describing how each component relates to OWL syntax.
Expand Down
169 changes: 97 additions & 72 deletions poetry.lock

Large diffs are not rendered by default.

19 changes: 13 additions & 6 deletions src/ontogpt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,12 @@ def parse_tabular_input(inputpath: str, selectcols: List[str]) -> str:
"--selectcols",
help="Columns to select from tabular input, e.g. --selectcols name,description",
)
# Shared CLI option: caps the number of characters per input chunk.
# When the option is omitted its value is None, and the entire input
# text is passed to the model in a single piece.
max_text_length_option = click.option(
    "--max-text-length",
    type=click.INT,
    help=(
        "Maximum text length in characters for each input chunk."
        " By default, the entire input is passed to the model."
    ),
)


@click.group()
Expand Down Expand Up @@ -415,6 +421,7 @@ def main(verbose: int, quiet: bool, cache_db: str):
@temperature_option
@cut_input_text_option
@selectcols_option
@max_text_length_option
def extract(
inputfile,
template,
Expand All @@ -434,6 +441,7 @@ def extract(
model_provider,
system_message,
selectcols,
max_text_length,
**kwargs,
):
"""Extract knowledge from text guided by schema, using SPIRES engine.
Expand Down Expand Up @@ -487,6 +495,7 @@ def extract(
api_version=api_version,
model_provider=model_provider,
system_message=system_message,
max_text_length=max_text_length,
**kwargs,
)
if settings.cache_db:
Expand Down Expand Up @@ -681,11 +690,7 @@ def iteratively_generate_extract(
@api_version_option
@model_provider_option
@system_message_option
@click.option(
"--max-text-length",
default=3000,
help="Maximum text length for each input chunk. Dependent on context size of model used.",
)
@max_text_length_option
@click.argument("search")
def pubmed_annotate(
model,
Expand Down Expand Up @@ -734,7 +739,9 @@ def pubmed_annotate(

pubmed_annotate_limit = limit
pmc = PubmedClient()
pmc.max_text_length = max_text_length
if max_text_length:
logging.info(f"Using max text length of {max_text_length} - each input will be chunked.")
pmc.max_text_length = max_text_length
pmids = pmc.get_pmids(search)
if get_pmc:
logging.info("Will try to retrieve PubMed Central texts.")
Expand Down
18 changes: 11 additions & 7 deletions src/ontogpt/clients/pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class PubmedClient:
# The maximum length of text, in characters, to include in
# a single input chunk. This may be set in the CLI
# with the max_text_length option.
max_text_length: int = 10000
max_text_length: int = 0

try:
email = get_apikey_value("ncbi-email")
Expand Down Expand Up @@ -343,12 +343,16 @@ def text(
txt = []
onetxt = ""
for doc in these_docs:
if len(doc) > self.max_text_length and not raw:
logging.warning(
f'Truncating entry beginning "{doc[:50]}" to {str(self.max_text_length)} chars'
)
shortdoc = doc[0 : self.max_text_length]
txt.append(shortdoc)
if self.max_text_length > 0:
if len(doc) > self.max_text_length and not raw:
charnum = str(self.max_text_length)
logging.warning(
f'Truncating entry beginning "{doc[:50]}" to {charnum} chars'
)
shortdoc = doc[0 : self.max_text_length]
txt.append(shortdoc)
else:
txt.append(doc)
else:
txt.append(doc)
if singledoc and not pubmedcental:
Expand Down
6 changes: 5 additions & 1 deletion src/ontogpt/engines/knowledge_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,17 @@
ANNOTATION_KEY_EXAMPLES = "prompt.examples"


def chunk_text(text: str, window_size=3) -> Iterator[str]:
def chunk_text_by_sentence(text: str, window_size=3) -> Iterator[str]:
    """Chunk text into overlapping windows of sentences.

    :param text: input text, split on sentence-ending punctuation
        (., ?, or !) followed by whitespace.
    :param window_size: maximum number of sentences per yielded chunk.
    :yield: strings of up to window_size consecutive sentences,
        re-joined with ". " (original ?/! punctuation is not restored).
    """
    sentences = re.split(r"[.?!]\s+", text)
    # Iterate up to len(sentences) + 1 so the final sentence appears in
    # the last window; stopping at len(sentences) silently dropped the
    # last sentence and yielded nothing for single-sentence input.
    for right_index in range(1, len(sentences) + 1):
        left_index = max(0, right_index - window_size)
        yield ". ".join(sentences[left_index:right_index])

def chunk_text_by_char(text: str, window_size=1000) -> Iterator[str]:
    """Chunk text into windows of characters.

    :param text: input text to split.
    :param window_size: maximum number of characters per chunk.
    :yield: consecutive, non-overlapping substrings of at most
        window_size characters; the final chunk may be shorter.
    """
    start = 0
    total = len(text)
    while start < total:
        yield text[start : start + window_size]
        start += window_size

@dataclass
class KnowledgeEngine(ABC):
Expand Down
38 changes: 29 additions & 9 deletions src/ontogpt/engines/spires_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
FIELD,
OBJECT,
KnowledgeEngine,
chunk_text,
chunk_text_by_char,
chunk_text_by_sentence,
)
from ontogpt.io.yaml_wrapper import dump_minimal_yaml
from ontogpt.templates.core import ExtractionResult
Expand Down Expand Up @@ -58,6 +59,9 @@ class SPIRESEngine(KnowledgeEngine):
where this determines the maximum number of sentences per chain.
The results are then merged together."""

max_text_length: Optional[int] = None
"""If set, this will split the text into chunks based on this number of characters."""

def extract_from_text(
self,
text: str,
Expand All @@ -75,8 +79,18 @@ def extract_from_text(
"""
self.extracted_named_entities = [] # Clear the named entity buffer

# This indicates that the text will be chunked in some way
have_chunks = False

if self.sentences_per_window:
chunks = chunk_text(text, self.sentences_per_window)
chunks = chunk_text_by_sentence(text, self.sentences_per_window)
have_chunks = True

if self.max_text_length:
chunks = chunk_text_by_char(text, self.max_text_length)
have_chunks = True

if have_chunks:
extracted_object = None
for chunk in chunks:
raw_text = self._raw_extract(chunk, cls=cls, object=object, show_prompt=show_prompt)
Expand All @@ -87,14 +101,20 @@ def extract_from_text(
if extracted_object is None:
extracted_object = next_object
else:
for k, v in next_object.items():
if isinstance(v, list):
extracted_object[k] += v
else:
if k not in extracted_object:
extracted_object[k] = v
# If the input is too small, which may happen with chunking,
# there may be a new extracted object but it's empty,
# raising an AttributeError on items.
try:
for k, v in next_object.items():
if isinstance(v, list):
extracted_object[k] += v
else:
extracted_object[k] = v
if k not in extracted_object:
extracted_object[k] = v
else:
extracted_object[k] = v
except AttributeError:
logging.error(f"Empty object: {next_object}")
else:
raw_text = self._raw_extract(text=text, cls=cls, object=object, show_prompt=show_prompt)
logging.info(f"RAW TEXT: {raw_text}")
Expand Down
4 changes: 2 additions & 2 deletions src/ontogpt/evaluation/ctd/eval_ctd.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from oaklib import BasicOntologyInterface, get_adapter
from pydantic import BaseModel

from ontogpt.engines.knowledge_engine import chunk_text
from ontogpt.engines.knowledge_engine import chunk_text_by_sentence
from ontogpt.engines.spires_engine import SPIRESEngine
from ontogpt.evaluation.evaluation_engine import SimilarityScore, SPIRESEvaluationEngine
from ontogpt.templates.ctd import (
Expand Down Expand Up @@ -220,7 +220,7 @@ def eval(self) -> EvaluationObjectSetRE:
ke.named_entities = [] # This stores the NEs the extractor knows about

if self.chunking:
text_list = chunk_text(text)
text_list = chunk_text_by_sentence(text)
else:
text_list = iter([text])

Expand Down
4 changes: 2 additions & 2 deletions src/ontogpt/evaluation/ctd/eval_ctd_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from oaklib import BasicOntologyInterface, get_adapter
from pydantic import BaseModel

from ontogpt.engines.knowledge_engine import chunk_text
from ontogpt.engines.knowledge_engine import chunk_text_by_sentence
from ontogpt.engines.spires_engine import SPIRESEngine
from ontogpt.evaluation.evaluation_engine import SimilarityScore, SPIRESEvaluationEngine
from ontogpt.templates.ctd_ner import (
Expand Down Expand Up @@ -281,7 +281,7 @@ def eval(self) -> EvaluationObjectSetNER:
ke.named_entities = [] # This stores the NEs the extractor knows about

if self.chunking:
text_list = chunk_text(text)
text_list = chunk_text_by_sentence(text)
else:
text_list = iter([text])

Expand Down
4 changes: 2 additions & 2 deletions src/ontogpt/evaluation/maxo/eval_maxo.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from oaklib import BasicOntologyInterface, get_adapter
from pydantic import BaseModel

from ontogpt.engines.knowledge_engine import chunk_text
from ontogpt.engines.knowledge_engine import chunk_text_by_sentence
from ontogpt.engines.spires_engine import SPIRESEngine
from ontogpt.evaluation.evaluation_engine import SimilarityScore, SPIRESEvaluationEngine
from ontogpt.templates.maxo import MaxoAnnotations, ActionAnnotationRelationship, Publication
Expand Down Expand Up @@ -234,7 +234,7 @@ def eval(self) -> EvaluationObjectSetRE:
ke.named_entities = [] # This stores the NEs the extractor knows about

if self.chunking:
text_list = chunk_text(text)
text_list = chunk_text_by_sentence(text)
else:
text_list = iter([text])

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from ontogpt.clients.pubmed_client import PubmedClient
from ontogpt.engines import create_engine
from ontogpt.engines.knowledge_engine import chunk_text
from ontogpt.engines.knowledge_engine import chunk_text_by_sentence
from ontogpt.engines.spires_engine import SPIRESEngine
from ontogpt.io.template_loader import get_template_details
from ontogpt.io.yaml_wrapper import dump_minimal_yaml
Expand Down Expand Up @@ -346,7 +346,7 @@ def test_setup(self):
def test_chunk_text(self):
"""Test chunking."""
text = "Title: foo. Abstract: Sentence 1. Sentence 2.\n Sentence 3. Sentence 4."
chunks = list(chunk_text(text))
chunks = list(chunk_text_by_sentence(text))
for chunk in chunks:
print(chunk)
self.assertEqual(len(chunks), 4)
Expand Down

0 comments on commit d08f8c7

Please sign in to comment.