Skip to content

Commit

Permalink
Expand option to set token/context limit (#449)
Browse files Browse the repository at this point in the history
Input text for extraction can now be chunked by character count (chunking by sentence
was already possible) to fit within a model's token limit, using the
`--max-text-length` option.
If the option is not set, the full text is used.
  • Loading branch information
caufieldjh authored Aug 29, 2024
2 parents 465fcd1 + cce2e8a commit d08f8c7
Show file tree
Hide file tree
Showing 10 changed files with 165 additions and 103 deletions.
2 changes: 2 additions & 0 deletions docs/custom.md
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,8 @@ The output of this is then passed through further SPIRES iterations.

LLMs have context sizes limiting the combined length of their inputs and outputs. The `gpt-3.5-turbo` model, for example, has a 4,096 token limit (prompt + completion), while the `gpt-3.5-turbo-16k` model has a larger context of 16,384 tokens.

To see the token limit for each model, use `ontogpt list-models`. The Max Tokens value will be in the fourth column.

## Advanced functionality with linkml-owl

A LinkML schema used in OntoGPT may include annotations describing how each component relates to OWL syntax.
Expand Down
169 changes: 97 additions & 72 deletions poetry.lock

Large diffs are not rendered by default.

19 changes: 13 additions & 6 deletions src/ontogpt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,12 @@ def parse_tabular_input(inputpath: str, selectcols: List[str]) -> str:
"--selectcols",
help="Columns to select from tabular input, e.g. --selectcols name,description",
)
# Shared CLI option: caps the number of characters per input chunk.
# When the option is omitted its value is None, and the entire input
# text is passed to the model in a single piece.
max_text_length_option = click.option(
    "--max-text-length",
    type=click.INT,
    help=(
        "Maximum text length in characters for each input chunk."
        " By default, the entire input is passed to the model."
    ),
)


@click.group()
Expand Down Expand Up @@ -415,6 +421,7 @@ def main(verbose: int, quiet: bool, cache_db: str):
@temperature_option
@cut_input_text_option
@selectcols_option
@max_text_length_option
def extract(
inputfile,
template,
Expand All @@ -434,6 +441,7 @@ def extract(
model_provider,
system_message,
selectcols,
max_text_length,
**kwargs,
):
"""Extract knowledge from text guided by schema, using SPIRES engine.
Expand Down Expand Up @@ -487,6 +495,7 @@ def extract(
api_version=api_version,
model_provider=model_provider,
system_message=system_message,
max_text_length=max_text_length,
**kwargs,
)
if settings.cache_db:
Expand Down Expand Up @@ -681,11 +690,7 @@ def iteratively_generate_extract(
@api_version_option
@model_provider_option
@system_message_option
@click.option(
"--max-text-length",
default=3000,
help="Maximum text length for each input chunk. Dependent on context size of model used.",
)
@max_text_length_option
@click.argument("search")
def pubmed_annotate(
model,
Expand Down Expand Up @@ -734,7 +739,9 @@ def pubmed_annotate(

pubmed_annotate_limit = limit
pmc = PubmedClient()
pmc.max_text_length = max_text_length
if max_text_length:
logging.info(f"Using max text length of {max_text_length} - each input will be chunked.")
pmc.max_text_length = max_text_length
pmids = pmc.get_pmids(search)
if get_pmc:
logging.info("Will try to retrieve PubMed Central texts.")
Expand Down
18 changes: 11 additions & 7 deletions src/ontogpt/clients/pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class PubmedClient:
# The maximum length of text, in characters, to include in
# a single input chunk. This may be set in the CLI
# with the max_text_length option.
max_text_length: int = 10000
max_text_length: int = 0

try:
email = get_apikey_value("ncbi-email")
Expand Down Expand Up @@ -343,12 +343,16 @@ def text(
txt = []
onetxt = ""
for doc in these_docs:
if len(doc) > self.max_text_length and not raw:
logging.warning(
f'Truncating entry beginning "{doc[:50]}" to {str(self.max_text_length)} chars'
)
shortdoc = doc[0 : self.max_text_length]
txt.append(shortdoc)
if self.max_text_length > 0:
if len(doc) > self.max_text_length and not raw:
charnum = str(self.max_text_length)
logging.warning(
f'Truncating entry beginning "{doc[:50]}" to {charnum} chars'
)
shortdoc = doc[0 : self.max_text_length]
txt.append(shortdoc)
else:
txt.append(doc)
else:
txt.append(doc)
if singledoc and not pubmedcental:
Expand Down
6 changes: 5 additions & 1 deletion src/ontogpt/engines/knowledge_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,17 @@
ANNOTATION_KEY_EXAMPLES = "prompt.examples"


def chunk_text(text: str, window_size=3) -> Iterator[str]:
def chunk_text_by_sentence(text: str, window_size=3) -> Iterator[str]:
    """Chunk text into overlapping windows of sentences.

    :param text: input text, split on sentence-ending punctuation
        (., ?, or !) followed by whitespace.
    :param window_size: maximum number of sentences per yielded chunk.
    :yield: strings of up to window_size consecutive sentences,
        re-joined with ". " (original ?/! punctuation is not restored).
    """
    sentences = re.split(r"[.?!]\s+", text)
    # Iterate up to len(sentences) + 1 so the final sentence appears in
    # the last window; stopping at len(sentences) silently dropped the
    # last sentence and yielded nothing for single-sentence input.
    for right_index in range(1, len(sentences) + 1):
        left_index = max(0, right_index - window_size)
        yield ". ".join(sentences[left_index:right_index])

def chunk_text_by_char(text: str, window_size=1000) -> Iterator[str]:
    """Chunk text into windows of characters.

    :param text: input text to split.
    :param window_size: maximum number of characters per chunk.
    :yield: consecutive, non-overlapping substrings of at most
        window_size characters; the final chunk may be shorter.
    """
    start = 0
    total = len(text)
    while start < total:
        yield text[start : start + window_size]
        start += window_size

@dataclass
class KnowledgeEngine(ABC):
Expand Down
38 changes: 29 additions & 9 deletions src/ontogpt/engines/spires_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
FIELD,
OBJECT,
KnowledgeEngine,
chunk_text,
chunk_text_by_char,
chunk_text_by_sentence,
)
from ontogpt.io.yaml_wrapper import dump_minimal_yaml
from ontogpt.templates.core import ExtractionResult
Expand Down Expand Up @@ -58,6 +59,9 @@ class SPIRESEngine(KnowledgeEngine):
where this determines the maximum number of sentences per chain.
The results are then merged together."""

max_text_length: Optional[int] = None
"""If set, this will split the text into chunks based on this number of characters."""

def extract_from_text(
self,
text: str,
Expand All @@ -75,8 +79,18 @@ def extract_from_text(
"""
self.extracted_named_entities = [] # Clear the named entity buffer

# This indicates that the text will be chunked in some way
have_chunks = False

if self.sentences_per_window:
chunks = chunk_text(text, self.sentences_per_window)
chunks = chunk_text_by_sentence(text, self.sentences_per_window)
have_chunks = True

if self.max_text_length:
chunks = chunk_text_by_char(text, self.max_text_length)
have_chunks = True

if have_chunks:
extracted_object = None
for chunk in chunks:
raw_text = self._raw_extract(chunk, cls=cls, object=object, show_prompt=show_prompt)
Expand All @@ -87,14 +101,20 @@ def extract_from_text(
if extracted_object is None:
extracted_object = next_object
else:
for k, v in next_object.items():
if isinstance(v, list):
extracted_object[k] += v
else:
if k not in extracted_object:
extracted_object[k] = v
# If the input is too small, which may happen with chunking,
# there may be a new extracted object but it's empty,
# raising an AttributeError on items.
try:
for k, v in next_object.items():
if isinstance(v, list):
extracted_object[k] += v
else:
extracted_object[k] = v
if k not in extracted_object:
extracted_object[k] = v
else:
extracted_object[k] = v
except AttributeError:
logging.error(f"Empty object: {next_object}")
else:
raw_text = self._raw_extract(text=text, cls=cls, object=object, show_prompt=show_prompt)
logging.info(f"RAW TEXT: {raw_text}")
Expand Down
4 changes: 2 additions & 2 deletions src/ontogpt/evaluation/ctd/eval_ctd.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from oaklib import BasicOntologyInterface, get_adapter
from pydantic import BaseModel

from ontogpt.engines.knowledge_engine import chunk_text
from ontogpt.engines.knowledge_engine import chunk_text_by_sentence
from ontogpt.engines.spires_engine import SPIRESEngine
from ontogpt.evaluation.evaluation_engine import SimilarityScore, SPIRESEvaluationEngine
from ontogpt.templates.ctd import (
Expand Down Expand Up @@ -220,7 +220,7 @@ def eval(self) -> EvaluationObjectSetRE:
ke.named_entities = [] # This stores the NEs the extractor knows about

if self.chunking:
text_list = chunk_text(text)
text_list = chunk_text_by_sentence(text)
else:
text_list = iter([text])

Expand Down
4 changes: 2 additions & 2 deletions src/ontogpt/evaluation/ctd/eval_ctd_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from oaklib import BasicOntologyInterface, get_adapter
from pydantic import BaseModel

from ontogpt.engines.knowledge_engine import chunk_text
from ontogpt.engines.knowledge_engine import chunk_text_by_sentence
from ontogpt.engines.spires_engine import SPIRESEngine
from ontogpt.evaluation.evaluation_engine import SimilarityScore, SPIRESEvaluationEngine
from ontogpt.templates.ctd_ner import (
Expand Down Expand Up @@ -281,7 +281,7 @@ def eval(self) -> EvaluationObjectSetNER:
ke.named_entities = [] # This stores the NEs the extractor knows about

if self.chunking:
text_list = chunk_text(text)
text_list = chunk_text_by_sentence(text)
else:
text_list = iter([text])

Expand Down
4 changes: 2 additions & 2 deletions src/ontogpt/evaluation/maxo/eval_maxo.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from oaklib import BasicOntologyInterface, get_adapter
from pydantic import BaseModel

from ontogpt.engines.knowledge_engine import chunk_text
from ontogpt.engines.knowledge_engine import chunk_text_by_sentence
from ontogpt.engines.spires_engine import SPIRESEngine
from ontogpt.evaluation.evaluation_engine import SimilarityScore, SPIRESEvaluationEngine
from ontogpt.templates.maxo import MaxoAnnotations, ActionAnnotationRelationship, Publication
Expand Down Expand Up @@ -234,7 +234,7 @@ def eval(self) -> EvaluationObjectSetRE:
ke.named_entities = [] # This stores the NEs the extractor knows about

if self.chunking:
text_list = chunk_text(text)
text_list = chunk_text_by_sentence(text)
else:
text_list = iter([text])

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from ontogpt.clients.pubmed_client import PubmedClient
from ontogpt.engines import create_engine
from ontogpt.engines.knowledge_engine import chunk_text
from ontogpt.engines.knowledge_engine import chunk_text_by_sentence
from ontogpt.engines.spires_engine import SPIRESEngine
from ontogpt.io.template_loader import get_template_details
from ontogpt.io.yaml_wrapper import dump_minimal_yaml
Expand Down Expand Up @@ -346,7 +346,7 @@ def test_setup(self):
def test_chunk_text(self):
"""Test chunking."""
text = "Title: foo. Abstract: Sentence 1. Sentence 2.\n Sentence 3. Sentence 4."
chunks = list(chunk_text(text))
chunks = list(chunk_text_by_sentence(text))
for chunk in chunks:
print(chunk)
self.assertEqual(len(chunks), 4)
Expand Down

0 comments on commit d08f8c7

Please sign in to comment.