From 0d1ec10a2ed2accca205f36f20826ee3b5b10497 Mon Sep 17 00:00:00 2001 From: cmungall Date: Thu, 15 Jun 2023 11:45:24 -0700 Subject: [PATCH 1/5] Testing new OpenAI functions as a potential SPIRES augmentation. See #132 --- src/ontogpt/engines/spires2_engine.py | 79 +++++++++ .../test_spires2_engine.py | 150 ++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 src/ontogpt/engines/spires2_engine.py create mode 100644 tests/integration/test_knowledge_engines/test_spires2_engine.py diff --git a/src/ontogpt/engines/spires2_engine.py b/src/ontogpt/engines/spires2_engine.py new file mode 100644 index 000000000..23a3e1d4e --- /dev/null +++ b/src/ontogpt/engines/spires2_engine.py @@ -0,0 +1,79 @@ +""" +Rewrite of SPIRES to use OpenAI function feature. + +See https://github.com/monarch-initiative/ontogpt/issues/132 +""" +import json +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +import openai +from linkml_runtime.linkml_model import ClassDefinition + +from ontogpt.engines.knowledge_engine import ( + ANNOTATION_KEY_PROMPT, + ANNOTATION_KEY_PROMPT_SKIP, + EXAMPLE, + FIELD, + OBJECT, + KnowledgeEngine, + chunk_text, +) +from ontogpt.templates.core import ExtractionResult + +this_path = Path(__file__).parent + + +logger = logging.getLogger(__name__) + + +@dataclass +class SPIRES2Engine(KnowledgeEngine): + """Knowledge extractor.""" + + model: str = "gpt-3.5-turbo-0613" + + def extract_from_text( + self, text: str, cls: ClassDefinition = None, object: OBJECT = None + ) -> ExtractionResult: + """ + Extract annotations from the given text. + + :param text: + :param cls: + :param object: optional stub object + :return: + """ + if cls is None: + [cls] = [c for c in self.schemaview.all_classes().values() if c.tree_root] + py_cls = self.template_module.__dict__[cls.name] + schema = py_cls.schema() + functions = [ + { + "name": "extract_data", + #"description": cls.description, + "description": "paper", + "parameters": schema, + }, + ] + # TODO: introspect schema to customize system content + messages = [ + {"role": "system", + "content": "You are a helpful assistant that extracts summaries from text as JSON for a database."}, + {"role": "user", + "content": 'Extract a summary from the following text: ' + text}, + ] + logger.info(json.dumps(functions, indent=2)) + # TODO: abstract this so as not hardcoded + response = openai.ChatCompletion.create( + model=self.model, functions=functions, messages=messages) + logger.info(f"Response: {response}") + r = response.choices[0]['message']['function_call']['arguments'] + extracted_object = py_cls(**json.loads(r)) + return ExtractionResult( + input_text=text, + extracted_object=extracted_object, + named_entities=self.named_entities, + ) diff --git a/tests/integration/test_knowledge_engines/test_spires2_engine.py b/tests/integration/test_knowledge_engines/test_spires2_engine.py new file mode 100644 index 000000000..67cebcaa0 --- /dev/null +++ b/tests/integration/test_knowledge_engines/test_spires2_engine.py @@ -0,0 +1,150 @@ +"""Core tests.""" +import unittest + +import yaml +from linkml_runtime.linkml_model import ClassDefinitionName +from oaklib import get_implementation_from_shorthand + +from ontogpt.clients.pubmed_client import PubmedClient +from ontogpt.engines import create_engine +from ontogpt.engines.knowledge_engine import chunk_text +from ontogpt.engines.spires2_engine import SPIRES2Engine +from ontogpt.engines.spires_engine import SPIRESEngine +from ontogpt.io.yaml_wrapper import dump_minimal_yaml +from ontogpt.templates.biological_process import BiologicalProcess +from ontogpt.templates.gocam import ( + ExtractionResult, + Gene, + GeneLocation, + GeneOrganismRelationship, + GoCamAnnotations, +) + +TEMPLATE = "gocam.GoCamAnnotations" + +PAPER = """ +Title: β-Catenin Is Required for the cGAS/STING Signaling Pathway but + Antagonized by the Herpes Simplex Virus 1 US3 Protein. +Text: +The cGAS/STING-mediated DNA-sensing signaling pathway is crucial +for interferon (IFN) production and host antiviral +responses. Herpes simplex virus I (HSV-1) is a DNA virus that has +evolved multiple strategies to evade host immune responses. Here, +we demonstrate that the highly conserved β-catenin protein in the +Wnt signaling pathway is an important factor to enhance the +transcription of type I interferon (IFN-I) in the cGAS/STING +signaling pathway, and the production of IFN-I mediated by +β-catenin was antagonized by HSV-1 US3 protein via its kinase +activity. Infection by US3-deficienct HSV-1 and its kinase-dead +variants failed to downregulate IFN-I and IFN-stimulated +gene (ISG) production induced by β-catenin. Consistent with this, +absence of β-catenin enhanced the replication of US3-deficienct +HSV-1, but not wild-type HSV-1. The underlying mechanism was the +interaction of US3 with β-catenin and its hyperphosphorylation of +β-catenin at Thr556 to block its nuclear translocation. For the +first time, HSV-1 US3 has been shown to inhibit IFN-I production +through hyperphosphorylation of β-catenin and to subvert host +antiviral innate immunity.IMPORTANCE Although increasing evidence +has demonstrated that HSV-1 subverts host immune responses and +establishes lifelong latent infection, the molecular mechanisms +by which HSV-1 interrupts antiviral innate immunity, especially +the cGAS/STING-mediated cellular DNA-sensing signaling pathway, +have not been fully explored. Here, we show that β-catenin +promotes cGAS/STING-mediated activation of the IFN pathway, which +is important for cellular innate immune responses and intrinsic +resistance to DNA virus infection. The protein kinase US3 +antagonizes the production of IFN by targeting β-catenin via its +kinase activity. The findings in this study reveal a novel +mechanism for HSV-1 to evade host antiviral immunity and add new +knowledge to help in understanding the interaction between the +host and HSV-1 infection. + +Keywords: HSV-1; US3; type I IFN; β-catenin. +""" + +EXAMPLE_RESULTS = """ +genes: β-Catenin; cGAS; STING; US3; IFN; ISG +organisms: Herpes Simplex Virus I (HSV-1); +gene_organisms: β-Catenin:host; cGAS:host; STING:host; US3:HSV-1; IFN:host; ISG:host +activities: production of type I IFN; transcription of type I IFN; replication of HSV-1; +nuclear translocation of β-catenin. +gene_functions: β-catenin:enhance the transcription of type I IFN; US3:antagonize +the production of IFN; β-catenin:block nuclear translocation. +cellular_processes: cGAS/STING-mediated DNA-sensing signaling; activation of IFN pathway +pathways: IFN pathway; Wnt signalling pathway +gene_gene_interactions: US3:β-catenin +gene_localizations: US3:host; β-catenin:host +""" + +EXAMPLE_RESULTS_ALT = """ +genes: β-Catenin; cGAS; STING; US3; IFN; ISG +organisms: Herpes Simplex Virus I (HSV-1); +gene_organisms: β-Catenin - Human; cGAS - Human; STING - Human; +US3 - Human; IFN - Human; ISG - Human. +activities: Transcription; Production; Downregulation; Replication; Nuclear Translocation +gene_functions: β-Catenin - Enhances Transcription; US3 - Antagonizes Production; +US3 - Downregulates IFN-I; US3 - Blocks Nuclear Translocation; β-Catenin - Enhances Production +cellular_processes: DNA-sensing; Interferon Production; Antiviral Innate Immunity; +Host Innate Immune Responses; Interaction with Host; Evade Host Antiviral Immunity +pathways: cGAS/STING-mediated DNA-sensing; Wnt Signaling; IFN pathway +gene_gene_interactions: US3 - β-Catenin; β-Catenin - US3 +gene_localizations: β-Catenin - Nuclear; US3 - Hyperphosphorylation +""" + +TEST_PROCESS = BiologicalProcess( + label="autophagosome assembly", + description="The formation of a double membrane-bounded structure, the autophagosome,\ + that occurs when a specialized membrane sac, called the isolation membrane,\ + starts to enclose a portion of the cytoplasm", + subclass_of="GO:0022607", + outputs=["GO:0005776"], +) + +DIRECT_PARSE = { + "genes": ["β-Catenin", "cGAS", "STING", "US3", "IFN", "ISG"], + "gene_organisms": [ + ("β-Catenin", "host"), + ("cGAS", "host"), + ("STING", "host"), + ("US3", "HSV-1"), + ("IFN", "host"), + ("ISG", "host"), + ], +} + + +class TestCore(unittest.TestCase): + """Test annotation.""" + + def setUp(self) -> None: + """Set up.""" + #self.ke = create_engine(TEMPLATE, SPIRESEngine) + self.ke = SPIRES2Engine(template=TEMPLATE) + + def test_setup(self): + """Tests template and module is loaded.""" + ke = self.ke + pyc = ke.template_pyclass + print(pyc) + obj = pyc(genes=["a"], gene_organisms=[{"gene": "a", "organism": "b"}]) + print(yaml.dump(obj.dict())) + self.assertEqual(obj.genes, ["a"]) + self.assertEqual(obj.gene_organisms[0].gene, "a") + self.assertEqual(obj.gene_organisms[0].organism, "b") + slot = ke.schemaview.induced_slot("genes", "GeneOrganismRelationship") + self.assertEqual(slot.name, "genes") + self.assertEqual(slot.multivalued, True) + self.assertEqual(slot.range, "Gene") + + + def test_extract(self): + """Tests end to end knowledge extraction.""" + ke = self.ke + ann = ke.extract_from_text(PAPER) + print(f"RESULTS={ann}") + print(yaml.dump(ann.dict())) + results = ann.extracted_object + if not isinstance(results, GoCamAnnotations): + raise ValueError(f"Expected GoCamAnnotations, got {type(results)}") + self.assertIn("HGNC:2514", results.genes) + From a2e70c84b2b03882f34270e46d58996f4f2bc0e3 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Thu, 15 Jun 2023 14:57:13 -0400 Subject: [PATCH 2/5] Update models.yaml --- src/ontogpt/models.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/ontogpt/models.yaml b/src/ontogpt/models.yaml index 711be6fad..91486e430 100644 --- a/src/ontogpt/models.yaml +++ b/src/ontogpt/models.yaml @@ -8,6 +8,22 @@ models: - OpenAI is_default: true + - name: MODEL_GPT_3_5_TURBO + alternative_names: + - "gpt-3.5-turbo-0613" + - "openai-gpt-3.5-turbo-0613" + provider: OpenAI + creators: + - OpenAI + + - name: MODEL_GPT_3_5_TURBO_16K + alternative_names: + - "gpt-3.5-turbo-16k" + - "openai-gpt-3.5-turbo-16k" + provider: OpenAI + creators: + - OpenAI + - name: MODEL_TEXT_DAVINCI_003 alternative_names: - "text-davinci-003" From 2f936a3f139a45c8a2f259bab1b61ff80f0697a7 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Thu, 15 Jun 2023 17:46:54 -0400 Subject: [PATCH 3/5] Small test fix - for SPIRES new and old --- .../test_knowledge_engines/test_spires2_engine.py | 5 +++-- .../integration/test_knowledge_engines/test_spires_engine.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_knowledge_engines/test_spires2_engine.py b/tests/integration/test_knowledge_engines/test_spires2_engine.py index 67cebcaa0..5435bbbef 100644 --- a/tests/integration/test_knowledge_engines/test_spires2_engine.py +++ b/tests/integration/test_knowledge_engines/test_spires2_engine.py @@ -131,8 +131,8 @@ def test_setup(self): self.assertEqual(obj.genes, ["a"]) self.assertEqual(obj.gene_organisms[0].gene, "a") self.assertEqual(obj.gene_organisms[0].organism, "b") - slot = ke.schemaview.induced_slot("genes", "GeneOrganismRelationship") - self.assertEqual(slot.name, "genes") + slot = ke.schemaview.induced_slot("gene", "GeneOrganismRelationship") + self.assertEqual(slot.name, "gene") self.assertEqual(slot.multivalued, True) self.assertEqual(slot.range, "Gene") @@ -144,6 +144,7 @@ def test_extract(self): print(f"RESULTS={ann}") print(yaml.dump(ann.dict())) results = ann.extracted_object + print(results) if not isinstance(results, GoCamAnnotations): raise ValueError(f"Expected GoCamAnnotations, got {type(results)}") self.assertIn("HGNC:2514", results.genes) diff --git a/tests/integration/test_knowledge_engines/test_spires_engine.py b/tests/integration/test_knowledge_engines/test_spires_engine.py index e3f026bed..4e0fecef4 100644 --- a/tests/integration/test_knowledge_engines/test_spires_engine.py +++ b/tests/integration/test_knowledge_engines/test_spires_engine.py @@ -129,8 +129,8 @@ def test_setup(self): self.assertEqual(obj.genes, ["a"]) self.assertEqual(obj.gene_organisms[0].gene, "a") self.assertEqual(obj.gene_organisms[0].organism, "b") - slot = ke.schemaview.induced_slot("genes", "GeneOrganismRelationship") - self.assertEqual(slot.name, "genes") + slot = ke.schemaview.induced_slot("gene", "GeneOrganismRelationship") + self.assertEqual(slot.name, "gene") self.assertEqual(slot.multivalued, True) self.assertEqual(slot.range, "Gene") From fc4a7e876f4e636b50488d54448121371872282e Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Thu, 15 Jun 2023 17:50:27 -0400 Subject: [PATCH 4/5] Another small test fix --- tests/integration/test_knowledge_engines/test_spires2_engine.py | 2 +- tests/integration/test_knowledge_engines/test_spires_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_knowledge_engines/test_spires2_engine.py b/tests/integration/test_knowledge_engines/test_spires2_engine.py index 5435bbbef..a06a993a7 100644 --- a/tests/integration/test_knowledge_engines/test_spires2_engine.py +++ b/tests/integration/test_knowledge_engines/test_spires2_engine.py @@ -133,7 +133,7 @@ def test_setup(self): self.assertEqual(obj.gene_organisms[0].organism, "b") slot = ke.schemaview.induced_slot("gene", "GeneOrganismRelationship") self.assertEqual(slot.name, "gene") - self.assertEqual(slot.multivalued, True) + self.assertIsNone(slot.multivalued) self.assertEqual(slot.range, "Gene") diff --git a/tests/integration/test_knowledge_engines/test_spires_engine.py b/tests/integration/test_knowledge_engines/test_spires_engine.py index 4e0fecef4..dc0b29d1f 100644 --- a/tests/integration/test_knowledge_engines/test_spires_engine.py +++ b/tests/integration/test_knowledge_engines/test_spires_engine.py @@ -131,7 +131,7 @@ def test_setup(self): self.assertEqual(obj.gene_organisms[0].organism, "b") slot = ke.schemaview.induced_slot("gene", "GeneOrganismRelationship") self.assertEqual(slot.name, "gene") - self.assertEqual(slot.multivalued, True) + self.assertEqual(slot.multivalued, False) self.assertEqual(slot.range, "Gene") def test_chunk_text(self): From 0abdcb626d9e61d3b5be36a5b731f79ea1684e5c Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Thu, 15 Jun 2023 17:51:13 -0400 Subject: [PATCH 5/5] For spires v1 tests too --- tests/integration/test_knowledge_engines/test_spires_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_knowledge_engines/test_spires_engine.py b/tests/integration/test_knowledge_engines/test_spires_engine.py index dc0b29d1f..d5ea844ea 100644 --- a/tests/integration/test_knowledge_engines/test_spires_engine.py +++ b/tests/integration/test_knowledge_engines/test_spires_engine.py @@ -131,7 +131,7 @@ def test_setup(self): self.assertEqual(obj.gene_organisms[0].organism, "b") slot = ke.schemaview.induced_slot("gene", "GeneOrganismRelationship") self.assertEqual(slot.name, "gene") - self.assertEqual(slot.multivalued, False) + self.assertIsNone(slot.multivalued) self.assertEqual(slot.range, "Gene") def test_chunk_text(self):