diff --git a/dsp/modules/dummy_lm.py b/dsp/modules/dummy_lm.py index 1bd6a04a1..49f35fa72 100644 --- a/dsp/modules/dummy_lm.py +++ b/dsp/modules/dummy_lm.py @@ -5,7 +5,7 @@ # This testing module was moved in PR #735 to patch Arize Phoenix logging -class DummyLM(LM): +class DSPDummyLM(LM): """Dummy language model for unit testing purposes.""" def __init__(self, answers: Union[list[str], dict[str, str]], follow_examples: bool = False): @@ -61,7 +61,7 @@ def basic_request(self, prompt, n=1, **kwargs) -> dict[str, list[dict[str, str]] }, ) - RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" + RED, _, RESET = "\033[91m", "\033[92m", "\033[0m" print("=== DummyLM ===") print(prompt, end="") print(f"{RED}{answer}{RESET}") diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py index df073c707..61fc37ae7 100644 --- a/dsp/utils/settings.py +++ b/dsp/utils/settings.py @@ -1,8 +1,30 @@ import threading +from copy import deepcopy from contextlib import contextmanager from dsp.utils.utils import dotdict +DEFAULT_CONFIG = dotdict( + lm=None, + adapter=None, + rm=None, + branch_idx=0, + reranker=None, + compiled_lm=None, + force_reuse_cached_compilation=False, + compiling=False, + skip_logprobs=False, + trace=[], + release=0, + bypass_assert=False, + bypass_suggest=False, + assert_failures=0, + suggest_failures=0, + langchain_history=[], + experimental=False, + backoff_time=10, +) + class Settings: """DSP configuration settings.""" @@ -25,27 +47,9 @@ def __new__(cls): # TODO: remove first-class support for re-ranker and potentially combine with RM to form a pipeline of sorts # eg: RetrieveThenRerankPipeline(RetrievalModel, Reranker) # downstream operations like dsp.retrieve would use configs from the defined pipeline. - config = dotdict( - lm=None, - adapter=None, - rm=None, - branch_idx=0, - reranker=None, - compiled_lm=None, - force_reuse_cached_compilation=False, - compiling=False, # TODO: can probably be removed - skip_logprobs=False, - trace=[], - release=0, - bypass_assert=False, - bypass_suggest=False, - assert_failures=0, - suggest_failures=0, - langchain_history=[], - experimental=False, - backoff_time = 10 - ) - cls._instance.__append(config) + + # make a deepcopy of the default config to avoid modifying the default config + cls._instance.__append(deepcopy(DEFAULT_CONFIG)) return cls._instance diff --git a/dspy/adapters/chat_adapter.py b/dspy/adapters/chat_adapter.py index 7b33dc367..5a20dcff3 100644 --- a/dspy/adapters/chat_adapter.py +++ b/dspy/adapters/chat_adapter.py @@ -1,14 +1,15 @@ -import re import ast import json +import re import textwrap +from typing import get_args, get_origin -from pydantic import TypeAdapter import pydantic +from pydantic import TypeAdapter + from .base import Adapter -from typing import get_origin, get_args -field_header_pattern = re.compile(r'\[\[ ## (\w+) ## \]\]') +field_header_pattern = re.compile(r"\[\[ ## (\w+) ## \]\]") class ChatAdapter(Adapter): @@ -21,9 +22,11 @@ def format(self, signature, demos, inputs): # Extract demos where some of the output_fields are not filled in. 
incomplete_demos = [demo for demo in demos if not all(k in demo for k in signature.fields)] complete_demos = [demo for demo in demos if demo not in incomplete_demos] - incomplete_demos = [demo for demo in incomplete_demos \ - if any(k in demo for k in signature.input_fields) and \ - any(k in demo for k in signature.output_fields)] + incomplete_demos = [ + demo + for demo in incomplete_demos + if any(k in demo for k in signature.input_fields) and any(k in demo for k in signature.output_fields) + ] demos = incomplete_demos + complete_demos @@ -32,20 +35,22 @@ def format(self, signature, demos, inputs): for demo in demos: messages.append(format_turn(signature, demo, role="user", incomplete=demo in incomplete_demos)) messages.append(format_turn(signature, demo, role="assistant", incomplete=demo in incomplete_demos)) - + messages.append(format_turn(signature, inputs, role="user")) return messages - + def parse(self, signature, completion, _parse_values=True): sections = [(None, [])] for line in completion.splitlines(): match = field_header_pattern.match(line.strip()) - if match: sections.append((match.group(1), [])) - else: sections[-1][1].append(line) + if match: + sections.append((match.group(1), [])) + else: + sections[-1][1].append(line) - sections = [(k, '\n'.join(v).strip()) for k, v in sections] + sections = [(k, "\n".join(v).strip()) for k, v in sections] fields = {} for k, v in sections: @@ -53,23 +58,29 @@ def parse(self, signature, completion, _parse_values=True): try: fields[k] = parse_value(v, signature.output_fields[k].annotation) if _parse_values else v except Exception as e: - raise ValueError(f"Error parsing field {k}: {e}.\n\n\t\tOn attempting to parse the value\n```\n{v}\n```") + raise ValueError( + f"Error parsing field {k}: {e}.\n\n\t\tOn attempting to parse the value\n```\n{v}\n```" + ) if fields.keys() != signature.output_fields.keys(): raise ValueError(f"Expected {signature.output_fields.keys()} but got {fields.keys()}") return fields + def format_blob(blob): - if '\n' not in blob and "«" not in blob and "»" not in blob: return f"«{blob}»" + if "\n" not in blob and "«" not in blob and "»" not in blob: + return f"«{blob}»" - modified_blob = blob.replace('\n', '\n ') + modified_blob = blob.replace("\n", "\n ") return f"«««\n {modified_blob}\n»»»" def format_list(items): - if len(items) == 0: return "N/A" - if len(items) == 1: return format_blob(items[0]) + if len(items) == 0: + return "N/A" + if len(items) == 1: + return format_blob(items[0]) return "\n".join([f"[{idx+1}] {format_blob(txt)}" for idx, txt in enumerate(items)]) @@ -89,21 +100,25 @@ def format_fields(fields): v = _format_field_value(v) output.append(f"[[ ## {k} ## ]]\n{v}") - return '\n\n'.join(output).strip() - + return "\n\n".join(output).strip() + def parse_value(value, annotation): - if annotation is str: return str(value) + if annotation is str: + return str(value) parsed_value = value if isinstance(value, str): - try: parsed_value = json.loads(value) + try: + parsed_value = json.loads(value) except json.JSONDecodeError: - try: parsed_value = ast.literal_eval(value) - except (ValueError, SyntaxError): parsed_value = value + try: + parsed_value = ast.literal_eval(value) + except (ValueError, SyntaxError): + parsed_value = value return TypeAdapter(annotation).validate_python(parsed_value) -def format_turn(signature, values, role, incomplete=False): +def format_turn(signature, values, role, incomplete=False): content = [] if role == "user": @@ -111,42 +126,46 @@ def format_turn(signature, values, role, 
incomplete=False): if incomplete: content.append("This is an example of the task, though some input or output fields are not supplied.") else: - field_names, values = list(signature.output_fields.keys()) + ['completed'], {**values, 'completed': ''} + field_names, values = list(signature.output_fields.keys()) + ["completed"], {**values, "completed": ""} if not incomplete: if not set(values).issuperset(set(field_names)): raise ValueError(f"Expected {field_names} but got {values.keys()}") - + content.append(format_fields({k: values.get(k, "Not supplied for this particular example.") for k in field_names})) if role == "user": - content.append("Respond with the corresponding output fields, starting with the field " + - ", then ".join(f"`{f}`" for f in signature.output_fields) + - ", and then ending with the marker for `completed`.") + content.append( + "Respond with the corresponding output fields, starting with the field " + + ", then ".join(f"`{f}`" for f in signature.output_fields) + + ", and then ending with the marker for `completed`." + ) - return {"role": role, "content": '\n\n'.join(content).strip()} + return {"role": role, "content": "\n\n".join(content).strip()} def get_annotation_name(annotation): origin = get_origin(annotation) args = get_args(annotation) if origin is None: - if hasattr(annotation, '__name__'): + if hasattr(annotation, "__name__"): return annotation.__name__ else: return str(annotation) else: - args_str = ', '.join(get_annotation_name(arg) for arg in args) - return f"{origin.__name__}[{args_str}]" + args_str = ", ".join(get_annotation_name(arg) for arg in args) + return f"{get_annotation_name(origin)}[{args_str}]" + def enumerate_fields(fields): parts = [] for idx, (k, v) in enumerate(fields.items()): parts.append(f"{idx+1}. `{k}`") parts[-1] += f" ({get_annotation_name(v.annotation)})" - parts[-1] += f": {v.json_schema_extra['desc']}" if v.json_schema_extra['desc'] != f'${{{k}}}' else '' + parts[-1] += f": {v.json_schema_extra['desc']}" if v.json_schema_extra["desc"] != f"${{{k}}}" else "" + + return "\n".join(parts).strip() - return '\n'.join(parts).strip() def prepare_instructions(signature): parts = [] @@ -154,12 +173,12 @@ def prepare_instructions(signature): parts.append("Your output fields are:\n" + enumerate_fields(signature.output_fields)) parts.append("All interactions will be structured in the following way, with the appropriate values filled in.") - parts.append(format_fields({f : f"{{{f}}}" for f in signature.input_fields})) - parts.append(format_fields({f : f"{{{f}}}" for f in signature.output_fields})) - parts.append(format_fields({'completed' : ""})) + parts.append(format_fields({f: f"{{{f}}}" for f in signature.input_fields})) + parts.append(format_fields({f: f"{{{f}}}" for f in signature.output_fields})) + parts.append(format_fields({"completed": ""})) instructions = textwrap.dedent(signature.instructions) - objective = ('\n' + ' ' * 8).join([''] + instructions.splitlines()) + objective = ("\n" + " " * 8).join([""] + instructions.splitlines()) parts.append(f"In adhering to this structure, your objective is: {objective}") # parts.append("You will receive some input fields in each interaction. 
" + @@ -167,4 +186,4 @@ def prepare_instructions(signature): # ", then ".join(f"`{f}`" for f in signature.output_fields) + # ", and then ending with the marker for `completed`.") - return '\n\n'.join(parts).strip() + return "\n\n".join(parts).strip() diff --git a/dspy/utils/dummies.py b/dspy/utils/dummies.py index d97d59b5a..b26cdda04 100644 --- a/dspy/utils/dummies.py +++ b/dspy/utils/dummies.py @@ -1,15 +1,18 @@ import random import re +from collections import defaultdict from typing import Union import numpy as np -from dsp.modules import LM +from dsp.modules import LM as DSPLM from dsp.utils.utils import dotdict +from dspy.adapters.chat_adapter import field_header_pattern, format_fields +from dspy.clients.lm import LM -class DummyLM(LM): - """Dummy language model for unit testing purposes.""" +class DSPDummyLM(DSPLM): + """Dummy language model for unit testing purposes subclassing DSP LM class.""" def __init__(self, answers: Union[list[str], dict[str, str]], follow_examples: bool = False): """Initializes the dummy language model. @@ -64,7 +67,7 @@ def basic_request(self, prompt, n=1, **kwargs) -> dict[str, list[dict[str, str]] }, ) - RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" + RED, _, RESET = "\033[91m", "\033[92m", "\033[0m" print("=== DummyLM ===") print(prompt, end="") print(f"{RED}{answer}{RESET}") @@ -94,6 +97,111 @@ def get_convo(self, index) -> str: return self.history[index]["prompt"] + " " + self.history[index]["response"]["choices"][0]["text"] +class DummyLM(LM): + """ + Dummy language model for unit testing purposes. + + Three modes of operation: + + ## 1. List of dictionaries + + If a list of dictionaries is provided, the dummy model will return the next dictionary + in the list for each request, formatted according to the `format_fields` function. + from the chat adapter. + + ```python + lm = DummyLM([{"answer": "red"}, {"answer": "blue"}]) + dspy.settings.configure(lm=lm) + predictor("What color is the sky?") + # Output: "[[## answer ##]]\nred" + predictor("What color is the sky?") + # Output: "[[## answer ##]]\nblue" + ``` + + ## 2. Dictionary of dictionaries + + If a dictionary of dictionaries is provided, the dummy model will return the value + corresponding to the key which is contained with the final message of the prompt, + formatted according to the `format_fields` function from the chat adapter. + + ```python + lm = DummyLM({"What color is the sky?": {"answer": "blue"}}) + dspy.settings.configure(lm=lm) + predictor("What color is the sky?") + # Output: "[[## answer ##]]\nblue" + ``` + + ## 3. Follow examples + + If `follow_examples` is set to True, and the prompt contains an example input exactly equal to the prompt, + the dummy model will return the output from that example. 
+ + ```python + lm = DummyLM([{"answer": "red"}], follow_examples=True) + dspy.settings.configure(lm=lm) + predictor("What color is the sky?", demos=dspy.Example(input="What color is the sky?", output="blue")) + # Output: "[[## answer ##]]\nblue" + ``` + + """ + + def __init__(self, answers: Union[list[dict[str, str]], dict[str, dict[str, str]]], follow_examples: bool = False): + super().__init__("dummy", "chat", 0.0, 1000, True) + self.answers = answers + if isinstance(answers, list): + self.answers = iter(answers) + self.follow_examples = follow_examples + + def _use_example(self, messages): + # find all field names + fields = defaultdict(int) + for message in messages: + if "content" in message: + if ma := field_header_pattern.match(message["content"]): + fields[message["content"][ma.start() : ma.end()]] += 1 + # find the fields which are missing from the final turns + max_count = max(fields.values()) + output_fields = [field for field, count in fields.items() if count != max_count] + + # get the output from the last turn that has the output fields as headers + final_input = messages[-1]["content"].split("\n\n")[0] + for input, output in zip(reversed(messages[:-1]), reversed(messages)): + if any(field in output["content"] for field in output_fields) and final_input in input["content"]: + return output["content"] + + def __call__(self, prompt=None, messages=None, **kwargs): + # Build the request. + outputs = [] + for _ in range(kwargs.get("n", 1)): + messages = messages or [{"role": "user", "content": prompt}] + kwargs = {**self.kwargs, **kwargs} + + if self.follow_examples: + outputs.append(self._use_example(messages)) + elif isinstance(self.answers, dict): + outputs.append( + next( + (format_fields(v) for k, v in self.answers.items() if k in messages[-1]["content"]), + "No more responses", + ) + ) + else: + outputs.append(format_fields(next(self.answers, {"answer": "No more responses"}))) + + # Log the request, with any api_* kwargs removed; this dummy records zero usage and cost.
+ kwargs = {k: v for k, v in kwargs.items() if not k.startswith("api_")} + entry = dict(prompt=prompt, messages=messages, kwargs=kwargs) + entry = dict(**entry, outputs=outputs, usage=0) + entry = dict(**entry, cost=0) + self.history.append(entry) + + return outputs + + def get_convo(self, index): + """Get the prompt + anwer from the ith message.""" + return self.history[index]["messages"], self.history[index]["outputs"] + + def dummy_rm(passages=()) -> callable: if not passages: diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..0bb74ac90 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,13 @@ +import pytest + +import dspy +from dsp.utils.settings import DEFAULT_CONFIG + + +@pytest.fixture(autouse=True) +def clear_settings(): + """Ensures that the settings are cleared after each test.""" + + yield + + dspy.settings.configure(**DEFAULT_CONFIG, inherit_config=False) diff --git a/tests/dsp_LM/__init__.py b/tests/dsp_LM/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dsp_LM/evaluate/__init__.py b/tests/dsp_LM/evaluate/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dsp_LM/evaluate/test_evaluate.py b/tests/dsp_LM/evaluate/test_evaluate.py new file mode 100644 index 000000000..5c6c1f82e --- /dev/null +++ b/tests/dsp_LM/evaluate/test_evaluate.py @@ -0,0 +1,100 @@ +import signal +import threading +from unittest.mock import patch + +import pytest + +import dsp +import dspy +from dspy.evaluate.evaluate import Evaluate +from dspy.evaluate.metrics import answer_exact_match +from dspy.predict import Predict +from dspy.utils.dummies import DSPDummyLM + + +def new_example(question, answer): + """Helper function to create a new example.""" + return dspy.Example( + question=question, + answer=answer, + ).with_inputs("question") + + +def test_evaluate_call(): + dspy.settings.configure(lm=DSPDummyLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] + program = Predict("question -> answer") + assert program(question="What is 1+1?").answer == "2" + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + ) + score = ev(program) + assert score == 100.0 + + +def test_multithread_evaluate_call(): + dspy.settings.configure(lm=DSPDummyLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] + program = Predict("question -> answer") + assert program(question="What is 1+1?").answer == "2" + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + num_threads=2, + ) + score = ev(program) + assert score == 100.0 + + +def test_multi_thread_evaluate_call_cancelled(monkeypatch): + # slow LM that sleeps for 1 second before returning the answer + class SlowLM(DSPDummyLM): + def __call__(self, prompt, **kwargs): + import time + + time.sleep(1) + return super().__call__(prompt, **kwargs) + + dspy.settings.configure(lm=SlowLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + + devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] + program = Predict("question -> answer") + assert program(question="What is 1+1?").answer == "2" + + # spawn a thread that will sleep for .1 seconds then send a KeyboardInterrupt + def sleep_then_interrupt(): + import time + + time.sleep(0.1) + import os + + os.kill(os.getpid(), signal.SIGINT) + + input_thread = threading.Thread(target=sleep_then_interrupt) + 
input_thread.start() + + with pytest.raises(KeyboardInterrupt): + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + num_threads=2, + ) + score = ev(program) + assert score == 100.0 + + +def test_evaluate_call_bad(): + dspy.settings.configure(lm=DSPDummyLM({"What is 1+1?": "0", "What is 2+2?": "0"})) + devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] + program = Predict("question -> answer") + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + ) + score = ev(program) + assert score == 0.0 diff --git a/tests/dsp_LM/examples/__init__.py b/tests/dsp_LM/examples/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dsp_LM/functional/__init__.py b/tests/dsp_LM/functional/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dsp_LM/functional/test_functional.py b/tests/dsp_LM/functional/test_functional.py new file mode 100644 index 000000000..31d0416b7 --- /dev/null +++ b/tests/dsp_LM/functional/test_functional.py @@ -0,0 +1,906 @@ +import datetime +import textwrap +from typing import Annotated, Any, Generic, List, Literal, Optional, TypeVar + +import pydantic +import pytest +from pydantic import AfterValidator, BaseModel, Field, ValidationError, field_validator, model_validator + +import dspy +from dspy.functional import FunctionalModule, TypedChainOfThought, TypedPredictor, cot, predictor +from dspy.predict.predict import Predict +from dspy.primitives.example import Example +from dspy.teleprompt.bootstrap import BootstrapFewShot +from dspy.teleprompt.vanilla import LabeledFewShot +from dspy.utils.dummies import DSPDummyLM + + +def test_simple(): + @predictor + def hard_question(topic: str) -> str: + """Think of a hard factual question about a topic.""" + + expected = "What is the speed of light?" + lm = DSPDummyLM([expected]) + dspy.settings.configure(lm=lm) + + question = hard_question(topic="Physics") + lm.inspect_history(n=2) + + assert question == expected + + +def test_list_output(): + @predictor + def hard_questions(topics: List[str]) -> List[str]: + pass + + expected = ["What is the speed of light?", "What is the speed of sound?"] + lm = DSPDummyLM(['["What is the speed of light?", "What is the speed of sound?"]']) + dspy.settings.configure(lm=lm) + + question = hard_questions(topics=["Physics", "Music"]) + lm.inspect_history(n=2) + + assert question == expected + + +def test_simple_type(): + class Question(pydantic.BaseModel): + value: str + + @predictor + def hard_question(topic: str) -> Question: + """Think of a hard factual question about a topic.""" + + expected = "What is the speed of light?" 
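+ # The typed predictor parses the completion into the pydantic model, so the dummy reply is supplied as JSON.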
+ lm = DSPDummyLM([f'{{"value": "{expected}"}}']) + dspy.settings.configure(lm=lm) + + question = hard_question(topic="Physics") + + assert isinstance(question, Question) + assert question.value == expected + + +def test_simple_type_input(): + class Question(pydantic.BaseModel): + value: str + + class Answer(pydantic.BaseModel): + value: str + + @predictor + def answer(question: Question) -> Answer: + pass + + question = Question(value="What is the speed of light?") + lm = DSPDummyLM([f'{{"value": "3e8"}}']) + dspy.settings.configure(lm=lm) + + result = answer(question=question) + + assert result == Answer(value="3e8") + + +def test_simple_class(): + class Answer(pydantic.BaseModel): + value: float + certainty: float + comments: List[str] = pydantic.Field(description="At least two comments about the answer") + + class QA(FunctionalModule): + @predictor + def hard_question(self, topic: str) -> str: + """Think of a hard factual question about a topic. It should be answerable with a number.""" + + @cot + def answer(self, question: Annotated[str, "Question to answer"]) -> Answer: + pass + + def forward(self, **kwargs): + question = self.hard_question(**kwargs) + return (question, self.answer(question=question)) + + expected = Answer( + value=3e8, + certainty=0.9, + comments=["It is the speed of light", "It is a constant"], + ) + + lm = DSPDummyLM( + [ + "What is the speed of light?", + "Some bad reasoning, 3e8 m/s.", + "3e8", # Bad answer 1 + "{...}", # Model is asked to create an example + "Some good reasoning...", + expected.model_dump_json(), # Good answer + ] + ) + dspy.settings.configure(lm=lm) + + qa = QA() + assert isinstance(qa, FunctionalModule) + assert isinstance(qa.answer, dspy.Module) + + question, answer = qa(topic="Physics") + + print(qa.answer) + + assert question == "What is the speed of light?" + assert answer == expected + + +def test_simple_oop(): + class Question(pydantic.BaseModel): + value: str + + class MySignature(dspy.Signature): + topic: str = dspy.InputField() + output: Question = dspy.OutputField() + + # Run the signature + program = TypedPredictor(MySignature) + expected = "What is the speed of light?" 
+ lm = DSPDummyLM( + [ + Question(value=expected).model_dump_json(), + ] + ) + dspy.settings.configure(lm=lm) + + question = program(topic="Physics").output + + assert isinstance(question, Question) + assert question.value == expected + + +def test_bootstrap_effectiveness(): + class SimpleModule(FunctionalModule): + @predictor + def output(self, input: str) -> str: + pass + + def forward(self, **kwargs): + return self.output(**kwargs) + + def simple_metric(example, prediction, trace=None): + return example.output == prediction.output + + examples = [ + ex.with_inputs("input") + for ex in ( + Example(input="What is the color of the sky?", output="blue"), + Example( + input="What does the fox say?", + output="Ring-ding-ding-ding-dingeringeding!", + ), + ) + ] + trainset = [examples[0]] + valset = [examples[1]] + + # This test verifies if the bootstrapping process improves the student's predictions + student = SimpleModule() + teacher = SimpleModule() + assert student.output.predictor.signature.equals(teacher.output.predictor.signature) + + lm = DSPDummyLM(["blue", "Ring-ding-ding-ding-dingeringeding!"], follow_examples=True) + dspy.settings.configure(lm=lm, trace=[]) + + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) + + lm.inspect_history(n=2) + + # Check that the compiled student has the correct demos + _, predict = next(compiled_student.named_sub_modules(Predict, skip_compiled=False)) + demos = predict.demos + assert len(demos) == 1 + assert demos[0].input == trainset[0].input + assert demos[0].output == trainset[0].output + + # Test the compiled student's prediction. + # We are using a DSPDummyLM with follow_examples=True, which means that + # even though it would normally reply with "Ring-ding-ding-ding-dingeringeding!" + # on the second output, if it seems an example that perfectly matches the + # prompt, it will use that instead. That is why we expect "blue" here. + prediction = compiled_student(input=trainset[0].input) + assert prediction == trainset[0].output + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `input`, produce the fields `output`. + + --- + + Follow the following format. + + Input: ${input} + Output: ${output} + + --- + + Input: What is the color of the sky? + Output: blue + + --- + + Input: What is the color of the sky? + Output: blue""" + ) + + +def test_regex(): + class TravelInformation(BaseModel): + origin: str = Field(pattern=r"^[A-Z]{3}$") + destination: str = Field(pattern=r"^[A-Z]{3}$") + date: datetime.date + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + email = textwrap.dedent( + """\ + We're excited to welcome you aboard your upcoming flight from + John F. Kennedy International Airport (JFK) to Los Angeles International Airport (LAX) + on December 25, 2022. Here's everything you need to know before you take off: ... + """ + ) + lm = DSPDummyLM( + [ + # Example with a bad origin code. 
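+ # ("JF0" fails the ^[A-Z]{3}$ pattern, so the validation error is fed back and the predictor retries.)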
+ '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + # Example to help the model understand + "{...}", + # Fixed + '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + ] + ) + dspy.settings.configure(lm=lm) + + assert flight_information(email=email) == TravelInformation( + origin="JFK", destination="LAX", date=datetime.date(2022, 12, 25) + ) + + +def test_custom_model_validate_json(): + class Airport(BaseModel): + code: str = Field(pattern=r"^[A-Z]{3}$") + lat: float + lon: float + + class TravelInformation(BaseModel): + origin: Airport + destination: Airport + date: datetime.date + + @classmethod + def model_validate_json( + cls, json_data: str, *, strict: Optional[bool] = None, context: Optional[dict[str, Any]] = None + ) -> "TravelInformation": + try: + __tracebackhide__ = True + return cls.__pydantic_validator__.validate_json(json_data, strict=strict, context=context) + except ValidationError: + for substring_length in range(len(json_data), 1, -1): + for start in range(len(json_data) - substring_length + 1): + substring = json_data[start : start + substring_length] + try: + __tracebackhide__ = True + res = cls.__pydantic_validator__.validate_json(substring, strict=strict, context=context) + return res + except ValidationError as exc: + last_exc = exc + pass + raise ValueError("Could not find valid json") from last_exc + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + email = textwrap.dedent( + """\ + We're excited to welcome you aboard your upcoming flight from + John F. Kennedy International Airport (JFK) to Los Angeles International Airport (LAX) + on December 25, 2022. Here's everything you need to know before you take off: ... + """ + ) + lm = DSPDummyLM( + [ + # Example with a bad origin code. 
+ ( + "Here is your json: " + "{" + '"origin": {"code":"JFK", "lat":40.6446, "lon":-73.7797}, ' + '"destination": {"code":"LAX", "lat":33.942791, "lon":-118.410042}, ' + '"date": "2022-12-25"}' + ), + ] + ) + dspy.settings.configure(lm=lm) + + assert flight_information(email=email) == TravelInformation( + origin={"code": "JFK", "lat": 40.6446, "lon": -73.7797}, + destination={"code": "LAX", "lat": 33.942791, "lon": -118.410042}, + date=datetime.date(2022, 12, 25), + ) + + +def test_raises(): + class TravelInformation(BaseModel): + origin: str = Field(pattern=r"^[A-Z]{3}$") + destination: str = Field(pattern=r"^[A-Z]{3}$") + date: datetime.date + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + lm = DSPDummyLM( + [ + "A list of bad inputs", + '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + '{"origin": "JFK", "destination": "LAX", "date": "bad date"}', + ] + ) + dspy.settings.configure(lm=lm) + + with pytest.raises(ValueError): + flight_information(email="Some email") + + +def test_multi_errors(): + class TravelInformation(BaseModel): + origin: str = Field(pattern=r"^[A-Z]{3}$") + destination: str = Field(pattern=r"^[A-Z]{3}$") + date: datetime.date + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + lm = DSPDummyLM( + [ + # First origin is wrong, then destination, then all is good + '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + "{...}", # Example to help the model understand + '{"origin": "JFK", "destination": "LA0", "date": "2022-12-25"}', + "{...}", # Example to help the model understand + '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + ] + ) + dspy.settings.configure(lm=lm) + + assert flight_information(email="Some email") == TravelInformation( + origin="JFK", destination="LAX", date=datetime.date(2022, 12, 25) + ) + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `email`, produce the fields `flight_information`. + + --- + + Follow the following format. + + Email: ${email} + + Past Error in Flight Information: An error to avoid in the future + + Past Error (2) in Flight Information: An error to avoid in the future + + Flight Information: ${flight_information}. Respond with a single JSON object. JSON Schema: {"properties": {"origin": {"pattern": "^[A-Z]{3}$", "title": "Origin", "type": "string"}, "destination": {"pattern": "^[A-Z]{3}$", "title": "Destination", "type": "string"}, "date": {"format": "date", "title": "Date", "type": "string"}}, "required": ["origin", "destination", "date"], "title": "TravelInformation", "type": "object"} + + --- + + Email: Some email + + Past Error in Flight Information: String should match pattern '^[A-Z]{3}$': origin (error type: string_pattern_mismatch) + + Past Error (2) in Flight Information: String should match pattern '^[A-Z]{3}$': destination (error type: string_pattern_mismatch) + + Flight Information: {"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}""" + ) + + +def test_field_validator(): + class UserDetails(BaseModel): + name: str + age: int + + @field_validator("name") + @classmethod + def validate_name(cls, v): + if v.upper() != v: + raise ValueError("Name must be in uppercase.") + return v + + @predictor + def get_user_details() -> UserDetails: + pass + + # Keep making the mistake (lower case name) until we run + # out of retries. 
+ lm = DSPDummyLM( + [ + '{"name": "lower case name", "age": 25}', + ] + * 10 + ) + dspy.settings.configure(lm=lm) + + with pytest.raises(ValueError): + get_user_details() + + print(lm.get_convo(-1)) + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields , produce the fields `get_user_details`. + + --- + + Follow the following format. + + Past Error in Get User Details: An error to avoid in the future + Past Error (2) in Get User Details: An error to avoid in the future + Get User Details: ${get_user_details}. Respond with a single JSON object. JSON Schema: {"properties": {"name": {"title": "Name", "type": "string"}, "age": {"title": "Age", "type": "integer"}}, "required": ["name", "age"], "title": "UserDetails", "type": "object"} + + --- + + Past Error in Get User Details: Value error, Name must be in uppercase.: name (error type: value_error) + Past Error (2) in Get User Details: Value error, Name must be in uppercase.: name (error type: value_error) + Get User Details: {"name": "lower case name", "age": 25}""" + ) + + +def test_annotated_field(): + @predictor + def test(input: Annotated[str, Field(description="description")]) -> Annotated[float, Field(gt=0, lt=1)]: + pass + + # First try 0, which fails, then try 0.5, which passes + lm = DSPDummyLM(["0", "0.5"]) + dspy.settings.configure(lm=lm) + + output = test(input="input") + + assert output == 0.5 + + +def test_multiple_outputs(): + lm = DSPDummyLM([str(i) for i in range(100)]) + dspy.settings.configure(lm=lm) + + test = TypedPredictor("input -> output") + output = test(input="input", config=dict(n=3)).completions.output + assert output == ["0", "1", "2"] + + +def test_multiple_outputs_int(): + lm = DSPDummyLM([str(i) for i in range(100)]) + dspy.settings.configure(lm=lm) + + class TestSignature(dspy.Signature): + input: int = dspy.InputField() + output: int = dspy.OutputField() + + test = TypedPredictor(TestSignature) + + output = test(input=8, config=dict(n=3)).completions.output + assert output == [0, 1, 2] + + +def test_multiple_outputs_int_cot(): + # Note: Multiple outputs only work when the language model "speculatively" generates all the outputs in one go. 
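+ # Each dummy completion therefore bundles the rationale and the final "Output:" value in one string.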
+ lm = DSPDummyLM( + [ + "thoughts 0\nOutput: 0\n", + "thoughts 1\nOutput: 1\n", + "thoughts 2\nOutput: 2\n", + ] + ) + dspy.settings.configure(lm=lm) + + test = TypedChainOfThought("input:str -> output:int") + + output = test(input="8", config=dict(n=3)).completions.output + assert output == [0, 1, 2] + + +def test_parse_type_string(): + lm = DSPDummyLM([str(i) for i in range(100)]) + dspy.settings.configure(lm=lm) + + test = TypedPredictor("input:int -> output:int") + + output = test(input=8, config=dict(n=3)).completions.output + assert output == [0, 1, 2] + + +def test_literal(): + lm = DSPDummyLM(['"2"', '"3"']) + dspy.settings.configure(lm=lm) + + @predictor + def f() -> Literal["2", "3"]: + pass + + assert f() == "2" + + +def test_literal_missmatch(): + lm = DSPDummyLM([f'"{i}"' for i in range(5, 100)]) + dspy.settings.configure(lm=lm) + + @predictor(max_retries=1) + def f() -> Literal["2", "3"]: + pass + + with pytest.raises(Exception) as e_info: + f() + + assert e_info.value.args[1]["f"] == "Input should be '2' or '3': (error type: literal_error)" + + +def test_literal_int(): + lm = DSPDummyLM(["2", "3"]) + dspy.settings.configure(lm=lm) + + @predictor + def f() -> Literal[2, 3]: + pass + + assert f() == 2 + + +def test_literal_int_missmatch(): + lm = DSPDummyLM([f"{i}" for i in range(5, 100)]) + dspy.settings.configure(lm=lm) + + @predictor(max_retries=1) + def f() -> Literal[2, 3]: + pass + + with pytest.raises(Exception) as e_info: + f() + + assert e_info.value.args[1]["f"] == "Input should be 2 or 3: (error type: literal_error)" + + +def test_fields_on_base_signature(): + class SimpleOutput(dspy.Signature): + output: float = dspy.OutputField(gt=0, lt=1) + + lm = DSPDummyLM( + [ + "2.1", # Bad output + "0.5", # Good output + ] + ) + dspy.settings.configure(lm=lm) + + predictor = TypedPredictor(SimpleOutput) + + assert predictor().output == 0.5 + + +def test_synthetic_data_gen(): + class SyntheticFact(BaseModel): + fact: str = Field(..., description="a statement") + varacity: bool = Field(..., description="is the statement true or false") + + class ExampleSignature(dspy.Signature): + """Generate an example of a synthetic fact.""" + + fact: SyntheticFact = dspy.OutputField() + + lm = DSPDummyLM( + [ + '{"fact": "The sky is blue", "varacity": true}', + '{"fact": "The sky is green", "varacity": false}', + '{"fact": "The sky is red", "varacity": true}', + '{"fact": "The earth is flat", "varacity": false}', + '{"fact": "The earth is round", "varacity": true}', + '{"fact": "The earth is a cube", "varacity": false}', + ] + ) + dspy.settings.configure(lm=lm) + + generator = TypedPredictor(ExampleSignature) + examples = generator(config=dict(n=3)) + for ex in examples.completions.fact: + assert isinstance(ex, SyntheticFact) + assert examples.completions.fact[0] == SyntheticFact(fact="The sky is blue", varacity=True) + + # If you have examples and want more + existing_examples = [ + dspy.Example(fact="The sky is blue", varacity=True), + dspy.Example(fact="The sky is green", varacity=False), + ] + trained = LabeledFewShot().compile(student=generator, trainset=existing_examples) + + augmented_examples = trained(config=dict(n=3)) + for ex in augmented_examples.completions.fact: + assert isinstance(ex, SyntheticFact) + + +def test_list_input2(): + # Inspired by the Signature Optimizer + + class ScoredString(pydantic.BaseModel): + string: str + score: float + + class ScoredSignature(dspy.Signature): + attempted_signatures: list[ScoredString] = dspy.InputField() + proposed_signature: str = 
dspy.OutputField() + + program = TypedChainOfThought(ScoredSignature) + + lm = DSPDummyLM(["Thoughts", "Output"]) + dspy.settings.configure(lm=lm) + + output = program( + attempted_signatures=[ + ScoredString(string="string 1", score=0.5), + ScoredString(string="string 2", score=0.4), + ScoredString(string="string 3", score=0.3), + ] + ).proposed_signature + + print(lm.get_convo(-1)) + + assert output == "Output" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `attempted_signatures`, produce the fields `proposed_signature`. + + --- + + Follow the following format. + + Attempted Signatures: ${attempted_signatures} + Reasoning: Let's think step by step in order to ${produce the proposed_signature}. We ... + Proposed Signature: ${proposed_signature} + + --- + + Attempted Signatures: [{"string":"string 1","score":0.5},{"string":"string 2","score":0.4},{"string":"string 3","score":0.3}] + Reasoning: Let's think step by step in order to Thoughts + Proposed Signature: Output""" + ) + + +def test_custom_reasoning_field(): + class Question(pydantic.BaseModel): + value: str + + class QuestionSignature(dspy.Signature): + topic: str = dspy.InputField() + question: Question = dspy.OutputField() + + reasoning = dspy.OutputField( + prefix="Custom Reasoning: Let's break this down. To generate a question about", + desc="${topic}, we should ...", + ) + + program = TypedChainOfThought(QuestionSignature, reasoning=reasoning) + + expected = "What is the speed of light?" + lm = DSPDummyLM(["Thoughts", f'{{"value": "{expected}"}}']) + dspy.settings.configure(lm=lm) + + output = program(topic="Physics") + + assert isinstance(output.question, Question) + assert output.question.value == expected + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `topic`, produce the fields `question`. + + --- + + Follow the following format. + + Topic: ${topic} + Custom Reasoning: Let's break this down. To generate a question about ${topic}, we should ... + Question: ${question}. Respond with a single JSON object. JSON Schema: {"properties": {"value": {"title": "Value", "type": "string"}}, "required": ["value"], "title": "Question", "type": "object"} + + --- + + Topic: Physics + Custom Reasoning: Let's break this down. 
To generate a question about Thoughts + Question: {"value": "What is the speed of light?"}""" + ) + + +def test_generic_signature(): + T = TypeVar("T") + + class GenericSignature(dspy.Signature, Generic[T]): + """My signature""" + + output: T = dspy.OutputField() + + predictor = TypedPredictor(GenericSignature[int]) + assert predictor.signature.instructions == "My signature" + + lm = DSPDummyLM(["23"]) + dspy.settings.configure(lm=lm) + + assert predictor().output == 23 + + +def test_lm_as_validator(): + @predictor + def is_square(n: int) -> bool: + """Is n a square number?""" + + def check_square(n): + assert is_square(n=n) + return n + + @predictor + def next_square(n: int) -> Annotated[int, AfterValidator(check_square)]: + """What is the next square number after n?""" + + lm = DSPDummyLM(["3", "False", "4", "True"]) + dspy.settings.configure(lm=lm) + + m = next_square(n=2) + lm.inspect_history(n=2) + + assert m == 4 + + +def test_annotated_validator(): + def is_square(n: int) -> int: + root = n**0.5 + if not root.is_integer(): + raise ValueError(f"{n} is not a square") + return n + + class MySignature(dspy.Signature): + """What is the next square number after n?""" + + n: int = dspy.InputField() + next_square: Annotated[int, AfterValidator(is_square)] = dspy.OutputField() + + lm = DSPDummyLM(["3", "4"]) + dspy.settings.configure(lm=lm) + + m = TypedPredictor(MySignature)(n=2).next_square + lm.inspect_history(n=2) + + assert m == 4 + + +def test_annotated_validator_functional(): + def is_square(n: int) -> int: + if not (n**0.5).is_integer(): + raise ValueError(f"{n} is not a square") + return n + + @predictor + def next_square(n: int) -> Annotated[int, AfterValidator(is_square)]: + """What is the next square number after n?""" + + lm = DSPDummyLM(["3", "4"]) + dspy.settings.configure(lm=lm) + + m = next_square(n=2) + lm.inspect_history(n=2) + + assert m == 4 + + +def test_demos(): + demos = [ + dspy.Example(input="What is the speed of light?", output="3e8"), + ] + program = LabeledFewShot(k=len(demos)).compile( + student=dspy.TypedPredictor("input -> output"), + trainset=[ex.with_inputs("input") for ex in demos], + ) + + lm = DSPDummyLM(["Paris"]) + dspy.settings.configure(lm=lm) + + assert program(input="What is the capital of France?").output == "Paris" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `input`, produce the fields `output`. + + --- + + Follow the following format. + + Input: ${input} + Output: ${output} + + --- + + Input: What is the speed of light? + Output: 3e8 + + --- + + Input: What is the capital of France? + Output: Paris""" + ) + + +def _test_demos_missing_input(): + demos = [dspy.Example(input="What is the speed of light?", output="3e8")] + program = LabeledFewShot(k=len(demos)).compile( + student=dspy.TypedPredictor("input -> output, thoughts"), + trainset=[ex.with_inputs("input") for ex in demos], + ) + dspy.settings.configure(lm=DSPDummyLM(["My thoughts", "Paris"])) + assert program(input="What is the capital of France?").output == "Paris" + + assert dspy.settings.lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `input`, produce the fields `output`. + + --- + + Follow the following format. + + Input: ${input} + Thoughts: ${thoughts} + Output: ${output} + + --- + + Input: What is the speed of light? + Output: 3e8 + + --- + + Input: What is the capital of France? 
+ Thoughts: My thoughts + Output: Paris""" + ) + + +def test_conlist(): + dspy.settings.configure(lm=DSPDummyLM(["[]", "[1]", "[1, 2]", "[1, 2, 3]"])) + + @predictor + def make_numbers(input: str) -> Annotated[list[int], Field(min_items=2)]: + pass + + assert make_numbers(input="What are the first two numbers?") == [1, 2] + + +def test_conlist2(): + dspy.settings.configure(lm=DSPDummyLM(["[]", "[1]", "[1, 2]", "[1, 2, 3]"])) + + make_numbers = TypedPredictor("input:str -> output:Annotated[List[int], Field(min_items=2)]") + assert make_numbers(input="What are the first two numbers?").output == [1, 2] + + +def test_model_validator(): + class MySignature(dspy.Signature): + input_data: str = dspy.InputField() + allowed_categories: list[str] = dspy.InputField() + category: str = dspy.OutputField() + + @model_validator(mode="after") + def check_cateogry(self): + if self.category not in self.allowed_categories: + raise ValueError(f"category not in {self.allowed_categories}") + return self + + lm = DSPDummyLM(["horse", "dog"]) + dspy.settings.configure(lm=lm) + predictor = TypedPredictor(MySignature) + + pred = predictor(input_data="What is the best animal?", allowed_categories=["cat", "dog"]) + assert pred.category == "dog" diff --git a/tests/dsp_LM/functional/test_signature_opt_typed.py b/tests/dsp_LM/functional/test_signature_opt_typed.py new file mode 100644 index 000000000..3533926b6 --- /dev/null +++ b/tests/dsp_LM/functional/test_signature_opt_typed.py @@ -0,0 +1,187 @@ +import json +from typing import Generic, TypeVar + +import pydantic +from pydantic_core import to_jsonable_python + +import dspy +from dspy.evaluate import Evaluate +from dspy.evaluate.metrics import answer_exact_match +from dspy.functional import TypedPredictor +from dspy.teleprompt.signature_opt_typed import make_info, optimize_signature +from dspy.utils import DSPDummyLM + +hotpotqa = [ + ex.with_inputs("question") + for ex in [ + dspy.Example( + question="At My Window was released by which American singer-songwriter?", + answer="John Townes Van Zandt", + ), + dspy.Example( + question="which American actor was Candace Kita guest starred with ", + answer="Bill Murray", + ), + dspy.Example( + question="Which of these publications was most recently published, Who Put the Bomp or Self?", + answer="Self", + ), + dspy.Example( + question="The Victorians - Their Story In Pictures is a documentary series written by an author born in what year?", + answer="1950", + ), + dspy.Example( + question="Which magazine has published articles by Scott Shaw, Tae Kwon Do Times or Southwest Art?", + answer="Tae Kwon Do Times", + ), + dspy.Example( + question="In what year was the club founded that played Manchester City in the 1972 FA Charity Shield", + answer="1874", + ), + dspy.Example( + question="Which is taller, the Empire State Building or the Bank of America Tower?", + answer="The Empire State Building", + ), + dspy.Example( + question='Which American actress who made their film debut in the 1995 teen drama "Kids" was the co-founder of Voto Latino?', + answer="Rosario Dawson", + ), + dspy.Example( + question="Tombstone stared an actor born May 17, 1955 known as who?", + answer="Bill Paxton", + ), + dspy.Example( + question="What is the code name for the German offensive that started this Second World War engagement on the Eastern Front (a few hundred kilometers from Moscow) between Soviet and German forces, which included 102nd Infantry Division?", + answer="Operation Citadel", + ), + dspy.Example( + question='Who acted in the shot 
film The Shore and is also the youngest actress ever to play Ophelia in a Royal Shakespeare Company production of "Hamlet." ?', + answer="Kerry Condon", + ), + dspy.Example( + question="Which company distributed this 1977 American animated film produced by Walt Disney Productions for which Sherman Brothers wrote songs?", + answer="Buena Vista Distribution", + ), + dspy.Example( + question="Samantha Cristoforetti and Mark Shuttleworth are both best known for being first in their field to go where? ", + answer="space", + ), + dspy.Example( + question="Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers? ", + answer="Outfield of Dreams", + ), + dspy.Example( + question="Which Pakistani cricket umpire who won 3 consecutive ICC umpire of the year awards in 2009, 2010, and 2011 will be in the ICC World Twenty20?", + answer="Aleem Sarwar Dar", + ), + dspy.Example( + question="The Organisation that allows a community to influence their operation or use and to enjoy the benefits arisingwas founded in what year?", + answer="2010", + ), + dspy.Example( + question='"Everything Has Changed" is a song from an album released under which record label ?', + answer="Big Machine Records", + ), + dspy.Example( + question="Who is older, Aleksandr Danilovich Aleksandrov or Anatoly Fomenko?", + answer="Aleksandr Danilovich Aleksandrov", + ), + dspy.Example( + question="On the coast of what ocean is the birthplace of Diogal Sakho?", + answer="Atlantic", + ), + dspy.Example( + question="This American guitarist best known for her work with the Iron Maidens is an ancestor of a composer who was known as what?", + answer="The Waltz King", + ), + ] +] + + +def test_opt(): + class BasicQA(dspy.Signature): + question: str = dspy.InputField() + answer: str = dspy.OutputField() + + qa_model = DSPDummyLM([]) + prompt_model = DSPDummyLM( + [ + # Seed prompts + "some thoughts", + '[{"instructions": "I", "question_desc": "$q", "question_prefix": "Q:", "answer_desc": "$a", "answer_prefix": "A:"}]', + ] + ) + dspy.settings.configure(lm=qa_model) + + result = optimize_signature( + student=TypedPredictor(BasicQA), + evaluator=Evaluate(devset=hotpotqa, metric=answer_exact_match, num_threads=1), + initial_prompts=1, + n_iterations=2, + verbose=True, + prompt_model=prompt_model, + strategy="last", + ) + + # Since we are requesting the last signature, it doesn't matter that our qa_model is + # bad, and gets 0 score. We should still get the last signature. 
+ class ExpectedSignature(dspy.Signature): + "I" + + question: str = dspy.InputField(desc="$q", prefix="Q:") + answer: str = dspy.OutputField(desc="$a", prefix="A:") + + assert result.program.signature.equals(ExpectedSignature) + + assert result.scores == [0, 0] + + +def test_opt_composed(): + class MyModule(dspy.Module): + def __init__(self): + self.p1 = TypedPredictor("question:str -> considerations:list[str]", max_retries=1) + self.p2 = TypedPredictor("considerations:list[str] -> answer:str", max_retries=1) + + def forward(self, question): + considerations = self.p1(question=question).considerations + return self.p2(considerations=considerations) + + class ExpectedSignature1(dspy.Signature): + "I1" + + question: str = dspy.InputField(desc="$q", prefix="Q:") + considerations: list[str] = dspy.OutputField(desc="$c", prefix="C:") + + info1 = make_info(ExpectedSignature1) + + class ExpectedSignature2(dspy.Signature): + "I2" + + considerations: list[str] = dspy.InputField(desc="$c", prefix="C:") + answer: str = dspy.OutputField(desc="$a", prefix="A:") + + info2 = make_info(ExpectedSignature2) + + qa_model = DSPDummyLM([]) + prompt_model = DSPDummyLM( + [ + "some thoughts", + json.dumps([to_jsonable_python(info1)]), + "some thoughts", + json.dumps([to_jsonable_python(info2)]), + ] + ) + dspy.settings.configure(lm=qa_model) + + result = optimize_signature( + student=MyModule(), + evaluator=lambda x: 0, # We don't care about the evaluator here + initial_prompts=1, + n_iterations=2, + verbose=True, + prompt_model=prompt_model, + strategy="last", + ) + + assert result.program.p1.signature.equals(ExpectedSignature1) + assert result.program.p2.signature.equals(ExpectedSignature2) diff --git a/tests/dsp_LM/modules/__init__.py b/tests/dsp_LM/modules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dsp_LM/predict/__init__.py b/tests/dsp_LM/predict/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dsp_LM/predict/test_chain_of_thought.py b/tests/dsp_LM/predict/test_chain_of_thought.py new file mode 100644 index 000000000..2567ae032 --- /dev/null +++ b/tests/dsp_LM/predict/test_chain_of_thought.py @@ -0,0 +1,36 @@ +import textwrap + +import dspy +from dspy import ChainOfThought +from dspy.utils import DSPDummyLM + + +def test_initialization_with_string_signature(): + lm = DSPDummyLM(["find the number after 1", "2"]) + dspy.settings.configure(lm=lm) + predict = ChainOfThought("question -> answer") + assert list(predict.extended_signature.output_fields.keys()) == [ + "rationale", + "answer", + ] + assert predict(question="What is 1+1?").answer == "2" + + print(lm.get_convo(-1)) + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `question`, produce the fields `answer`. + + --- + + Follow the following format. + + Question: ${question} + Reasoning: Let's think step by step in order to ${produce the answer}. We ... + Answer: ${answer} + + --- + + Question: What is 1+1? 
+ Reasoning: Let's think step by step in order to find the number after 1 + Answer: 2""" + ) diff --git a/tests/dsp_LM/predict/test_chain_of_thought_with_hint.py b/tests/dsp_LM/predict/test_chain_of_thought_with_hint.py new file mode 100644 index 000000000..0afa06b1c --- /dev/null +++ b/tests/dsp_LM/predict/test_chain_of_thought_with_hint.py @@ -0,0 +1,43 @@ +import dspy +from dspy import ChainOfThoughtWithHint +from dspy.utils import DSPDummyLM + + +def test_cot_with_no_hint(): + lm = DSPDummyLM(["find the number after 1", "2"]) + dspy.settings.configure(lm=lm) + predict = ChainOfThoughtWithHint("question -> answer") + # Check output fields have the right order + assert list(predict.extended_signature2.output_fields.keys()) == [ + "rationale", + "hint", + "answer", + ] + assert predict(question="What is 1+1?").answer == "2" + + final_convo = lm.get_convo(-1) + assert final_convo.endswith( + "Question: What is 1+1?\n" + "Reasoning: Let's think step by step in order to find the number after 1\n" + "Answer: 2" + ) + + +def test_cot_with_hint(): + lm = DSPDummyLM(["find the number after 1", "2"]) + dspy.settings.configure(lm=lm) + predict = ChainOfThoughtWithHint("question -> answer") + assert list(predict.extended_signature2.output_fields.keys()) == [ + "rationale", + "hint", + "answer", + ] + assert predict(question="What is 1+1?", hint="think small").answer == "2" + + final_convo = lm.get_convo(-1) + assert final_convo.endswith( + "Question: What is 1+1?\n\n" + "Reasoning: Let's think step by step in order to find the number after 1\n\n" + "Hint: think small\n\n" + "Answer: 2" + ) diff --git a/tests/dsp_LM/predict/test_multi_chain_comparison.py b/tests/dsp_LM/predict/test_multi_chain_comparison.py new file mode 100644 index 000000000..e97c3dfbd --- /dev/null +++ b/tests/dsp_LM/predict/test_multi_chain_comparison.py @@ -0,0 +1,40 @@ +import pytest + +import dspy +from dspy.utils.dummies import DSPDummyLM + + +def test_basic_example(): + class BasicQA(dspy.Signature): + """Answer questions with short factoid answers.""" + + question = dspy.InputField() + answer = dspy.OutputField(desc="often between 1 and 5 words") + + # Example completions generated by a model for reference + completions = [ + dspy.Prediction( + rationale="I recall that during clear days, the sky often appears this color.", + answer="blue", + ), + dspy.Prediction( + rationale="Based on common knowledge, I believe the sky is typically seen as this color.", + answer="green", + ), + dspy.Prediction( + rationale="From images and depictions in media, the sky is frequently represented with this hue.", + answer="blue", + ), + ] + + # Pass signature to MultiChainComparison module + compare_answers = dspy.MultiChainComparison(BasicQA) + + # Call the MultiChainComparison on the completions + question = "What is the color of the sky?" 
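+ # The dummy supplies the final rationale and answer that MultiChainComparison produces after comparing the completions.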
+ lm = DSPDummyLM(["my rationale", "blue"]) + dspy.settings.configure(lm=lm) + final_pred = compare_answers(completions, question=question) + + assert final_pred.rationale == "my rationale" + assert final_pred.answer == "blue" diff --git a/tests/dsp_LM/predict/test_predict.py b/tests/dsp_LM/predict/test_predict.py new file mode 100644 index 000000000..9158987e3 --- /dev/null +++ b/tests/dsp_LM/predict/test_predict.py @@ -0,0 +1,101 @@ +import copy +import textwrap + +import pydantic +import pytest +import ujson + +import dspy +from dspy import Predict, Signature, TypedPredictor +from dspy.utils.dummies import DSPDummyLM + + +def test_call_method(): + predict_instance = Predict("input -> output") + lm = DSPDummyLM(["test output"]) + dspy.settings.configure(lm=lm) + result = predict_instance(input="test input") + assert result.output == "test output" + assert lm.get_convo(-1) == ( + "Given the fields `input`, produce the fields `output`.\n" + "\n---\n\n" + "Follow the following format.\n\n" + "Input: ${input}\n" + "Output: ${output}\n" + "\n---\n\n" + "Input: test input\n" + "Output: test output" + ) + + +def test_forward_method(): + program = Predict("question -> answer") + dspy.settings.configure(lm=DSPDummyLM([])) + result = program(question="What is 1+1?").answer + assert result == "No more responses" + + +def test_forward_method2(): + program = Predict("question -> answer1, answer2") + dspy.settings.configure(lm=DSPDummyLM(["my first answer", "my second answer"])) + result = program(question="What is 1+1?") + assert result.answer1 == "my first answer" + assert result.answer2 == "my second answer" + + +def test_config_management(): + predict_instance = Predict("input -> output") + predict_instance.update_config(new_key="value") + config = predict_instance.get_config() + assert "new_key" in config and config["new_key"] == "value" + + +def test_multi_output(): + program = Predict("question -> answer", n=2) + dspy.settings.configure(lm=DSPDummyLM(["my first answer", "my second answer"])) + results = program(question="What is 1+1?") + assert results.completions.answer[0] == "my first answer" + assert results.completions.answer[1] == "my second answer" + + +def test_multi_output2(): + program = Predict("question -> answer1, answer2", n=2) + dspy.settings.configure( + lm=DSPDummyLM( + [ + "my 0 answer\nAnswer 2: my 2 answer", + "my 1 answer\nAnswer 2: my 3 answer", + ], + ) + ) + results = program(question="What is 1+1?") + assert results.completions.answer1[0] == "my 0 answer" + assert results.completions.answer1[1] == "my 1 answer" + assert results.completions.answer2[0] == "my 2 answer" + assert results.completions.answer2[1] == "my 3 answer" + + +def test_output_only(): + class OutputOnlySignature(dspy.Signature): + output = dspy.OutputField() + + predictor = Predict(OutputOnlySignature) + + lm = DSPDummyLM(["short answer"]) + dspy.settings.configure(lm=lm) + assert predictor().output == "short answer" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields , produce the fields `output`. + + --- + + Follow the following format. 
+ + Output: ${output} + + --- + + Output: short answer""" + ) diff --git a/tests/dsp_LM/predict/test_program_of_thought.py b/tests/dsp_LM/predict/test_program_of_thought.py new file mode 100644 index 000000000..e5522a847 --- /dev/null +++ b/tests/dsp_LM/predict/test_program_of_thought.py @@ -0,0 +1,135 @@ +import textwrap + +import dspy +from dspy import ProgramOfThought, Signature +from dspy.utils import DSPDummyLM + + +class BasicQA(Signature): + question = dspy.InputField() + answer = dspy.OutputField(desc="often between 1 and 5 words") + + +def test_pot_code_generation(): + pot = ProgramOfThought(BasicQA) + lm = DSPDummyLM( + [ + "Reason_A", + "```python\nresult = 1+1\n```", + "Reason_B", + "2", + ] + ) + dspy.settings.configure(lm=lm) + res = pot(question="What is 1+1?") + assert res.answer == "2" + assert lm.get_convo(index=-1) == textwrap.dedent( + """\ + Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`. + + --- + + Follow the following format. + + Question: ${question} + + Code: python code that answers the question + + Code Output: output of previously-generated python code + + Reasoning: Let's think step by step in order to ${produce the answer}. We ... + + Answer: often between 1 and 5 words + + --- + + Question: What is 1+1? + + Code: result = 1+1 + + Code Output: 2 + + Reasoning: Let's think step by step in order to Reason_B + + Answer: 2""" + ) + + +def test_pot_code_generation_with_error(): + pot = ProgramOfThought(BasicQA) + lm = DSPDummyLM( + [ + "Reason_A", + "```python\nresult = 1+0/0\n```", + "Reason_B", # Error: division by zero + "```python\nresult = 1+1\n```", + "Reason_C", + "2", + ] + ) + dspy.settings.configure(lm=lm) + res = pot(question="What is 1+1?") + assert res.answer == "2" + + # The first code example failed + assert lm.get_convo(index=2) == textwrap.dedent( + """\ + You are given `question`, `previous_code`, `error` due to an error in previous code. + Your task is to correct the error and provide the new `generated_code`. + + --- + + Follow the following format. + + Question: ${question} + + Previous Code: previously-generated python code that errored + + Error: error message from previously-generated python code + + Reasoning: Let's think step by step in order to ${produce the generated_code}. We ... + + Code: python code that answers the question + + --- + + Question: What is 1+1? + + Previous Code: result = 1+0/0 + + Error: division by zero + + Reasoning: Let's think step by step in order to Reason_B""" + ) + + # The second code example succeeded + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`. + + --- + + Follow the following format. + + Question: ${question} + + Code: python code that answers the question + + Code Output: output of previously-generated python code + + Reasoning: Let's think step by step in order to ${produce the answer}. We ... + + Answer: often between 1 and 5 words + + --- + + Question: What is 1+1? 
+ + Code: result = 1+1 + + Code Output: 2 + + Reasoning: Let's think step by step in order to Reason_C + + Answer: 2""" + ) diff --git a/tests/dsp_LM/predict/test_react.py b/tests/dsp_LM/predict/test_react.py new file mode 100644 index 000000000..37979ddbc --- /dev/null +++ b/tests/dsp_LM/predict/test_react.py @@ -0,0 +1,154 @@ +from dataclasses import dataclass + +import dspy +from dspy.utils.dummies import DSPDummyLM, dummy_rm + + +def test_example_no_tools(): + # Create a simple dataset which the model will use with the Retrieve tool. + lm = DSPDummyLM( + [ + "Initial thoughts", # Thought_1 + "Finish[blue]", # Action_1 + ] + ) + dspy.settings.configure(lm=lm, rm=dummy_rm()) + + program = dspy.ReAct("question -> answer") + + # Check default tools + assert isinstance(program.tools["Finish"], dspy.Example) + + # Call the ReAct module on a particular input + question = "What is the color of the sky?" + result = program(question=question) + assert result.answer == "blue" + + # For debugging + print("---") + for row in lm.history: + print(row["prompt"]) + print("Response:", row["response"]["choices"][0]["text"]) + print("---") + + assert lm.get_convo(-1).endswith( + "Question: What is the color of the sky?\n" "Thought 1: Initial thoughts\n" "Action 1: Finish[blue]" + ) + + +def test_example_search(): + # Create a simple dataset which the model will use with the Retrieve tool. + lm = DSPDummyLM( + [ + "Initial thoughts", # Thought_1 + "Search[the color of the sky]", # Action_1 + "More thoughts", # Thought_2 + "Finish[blue]", # Action_2 + ] + ) + rm = dummy_rm( + [ + "We all know the color of the sky is blue.", + "Something about the sky colors", + "This sentence is completely irrelevant to answer the question.", + "Let's add some more sentences to act as dummy passages.", + "Let's add some more sentences to act as dummy passages.", + "Let's add some more sentences to act as dummy passages.", + ] + ) + dspy.settings.configure(lm=lm, rm=rm) + + program = dspy.ReAct("question -> answer") + + # Check default tools + assert len(program.tools) == 2 + assert isinstance(program.tools["Search"], dspy.Retrieve) + assert isinstance(program.tools["Finish"], dspy.Example) + + # Call the ReAct module on a particular input + question = "What is the color of the sky?"
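+ # Expect one full ReAct trajectory: a Search action, an Observation built from the dummy passages, then Finish[blue].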
+ result = program(question=question) + assert result.answer == "blue" + + # For debugging + print(lm.get_convo(-1)) + + assert lm.get_convo(-1).endswith( + "Question: What is the color of the sky?\n\n" + "Thought 1: Initial thoughts\n\n" + "Action 1: Search[the color of the sky]\n\n" + "Observation 1:\n" + "[1] «We all know the color of the sky is blue.»\n" + "[2] «Something about the sky colors»\n" + "[3] «This sentence is completely irrelevant to answer the question.»\n\n" + "Thought 2: More thoughts\n\n" + "Action 2: Finish[blue]" + ) + + +class DummyTool1: + name = "Tool1" + input_variable = "query" + desc = "" + num_calls = 0 + + def __call__(self, *args, **kwargs): + # test case with no passages attribute + assert args[0] == "foo" + self.num_calls += 1 + return "tool 1 output" + + +@dataclass +class DummyOutput: + passages: str + + +class DummyTool2: + name = "Tool2" + input_variable = "query" + desc = "" + num_calls = 0 + + def __call__(self, *args, **kwargs): + # test case with passages attribute + assert args[0] == "bar" + self.num_calls += 1 + return DummyOutput(passages="tool 2 output") + + +def test_custom_tools(): + lm = DSPDummyLM( + [ + "Initial thoughts", + "Tool1[foo]", + "More thoughts", + "Tool2[bar]", + "Even more thoughts", + "Finish[baz]", + ] + ) + dspy.settings.configure(lm=lm) + + tool1 = DummyTool1() + tool2 = DummyTool2() + program = dspy.ReAct("question -> answer", tools=[tool1, tool2]) + + question = "What is the color of the sky?" + result = program(question=question) + assert result.answer == "baz" + + # each tool should be called only once + assert tool1.num_calls == 1 + assert tool2.num_calls == 1 + assert lm.get_convo(-1).endswith( + "Question: What is the color of the sky?\n\n" + "Thought 1: Initial thoughts\n\n" + "Action 1: Tool1[foo]\n\n" + "Observation 1: tool 1 output\n\n" + "Thought 2: More thoughts\n\n" + "Action 2: Tool2[bar]\n\n" + "Observation 2: tool 2 output\n\n" + "Thought 3: Even more thoughts\n\n" + "Action 3: Finish[baz]" + ) diff --git a/tests/dsp_LM/predict/test_retry.py b/tests/dsp_LM/predict/test_retry.py new file mode 100644 index 000000000..bd22984d4 --- /dev/null +++ b/tests/dsp_LM/predict/test_retry.py @@ -0,0 +1,110 @@ +import functools + +import pydantic + +import dspy +from dspy.primitives.assertions import assert_transform_module, backtrack_handler +from dspy.utils import DSPDummyLM + + +def test_retry_simple(): + predict = dspy.Predict("question -> answer") + retry_module = dspy.Retry(predict) + + # Test Retry has created the correct new signature + for field in predict.signature.output_fields: + assert f"past_{field}" in retry_module.new_signature.input_fields + assert "feedback" in retry_module.new_signature.input_fields + + lm = DSPDummyLM(["blue"]) + dspy.settings.configure(lm=lm) + result = retry_module.forward( + question="What color is the sky?", + past_outputs={"answer": "red"}, + feedback="Try harder", + ) + assert result.answer == "blue" + + print(lm.get_convo(-1)) + assert lm.get_convo(-1).endswith( + "Question: What color is the sky?\n\n" "Previous Answer: red\n\n" "Instructions: Try harder\n\n" "Answer: blue" + ) + + +def test_retry_forward_with_feedback(): + # First we make a mistake, then we fix it + lm = DSPDummyLM(["red", "blue"]) + dspy.settings.configure(lm=lm, trace=[]) + + class SimpleModule(dspy.Module): + def __init__(self): + super().__init__() + self.predictor = dspy.Predict("question -> answer") + + def forward(self, **kwargs): + result = self.predictor(**kwargs) + print(f"SimpleModule got 
{result.answer=}") + dspy.Suggest(result.answer == "blue", "Please think harder") + return result + + program = SimpleModule() + program = assert_transform_module( + program.map_named_predictors(dspy.Retry), + functools.partial(backtrack_handler, max_backtracks=1), + ) + + result = program(question="What color is the sky?") + + assert result.answer == "blue" + + print(lm.get_convo(-1)) + assert lm.get_convo(-1).endswith( + "Question: What color is the sky?\n\n" + "Previous Answer: red\n\n" + "Instructions: Please think harder\n\n" + "Answer: blue" + ) + + +def test_retry_forward_with_typed_predictor(): + # First we make a mistake, then we fix it + lm = DSPDummyLM(['{"answer":"red"}', '{"answer":"blue"}']) + dspy.settings.configure(lm=lm, trace=[]) + + class AnswerQuestion(dspy.Signature): + """Answer questions with succint responses.""" + + class Input(pydantic.BaseModel): + question: str + + class Output(pydantic.BaseModel): + answer: str + + input: Input = dspy.InputField() + output: Output = dspy.OutputField() + + class QuestionAnswerer(dspy.Module): + def __init__(self): + super().__init__() + self.answer_question = dspy.TypedPredictor(AnswerQuestion) + + def forward(self, **kwargs): + result = self.answer_question(input=AnswerQuestion.Input(**kwargs)).output + dspy.Suggest(result.answer == "blue", "Please think harder") + return result + + program = QuestionAnswerer() + program = assert_transform_module( + program.map_named_predictors(dspy.Retry), + functools.partial(backtrack_handler, max_backtracks=1), + ) + + result = program(question="What color is the sky?") + + assert result.answer == "blue" + assert lm.get_convo(-1).endswith( + 'Input: {"question":"What color is the sky?"}\n\n' + 'Previous Output: {"answer":"red"}\n\n' + "Instructions: Please think harder\n\n" + 'Output: {"answer":"blue"}' + ) diff --git a/tests/dsp_LM/primitives/__init__.py b/tests/dsp_LM/primitives/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dsp_LM/primitives/test_program.py b/tests/dsp_LM/primitives/test_program.py new file mode 100644 index 000000000..dc817882a --- /dev/null +++ b/tests/dsp_LM/primitives/test_program.py @@ -0,0 +1,21 @@ +import dspy +from dspy.primitives.program import Module, set_attribute_by_name # Adjust the import based on your file structure +from dspy.utils import DSPDummyLM + + +class HopModule(dspy.Module): + def __init__(self): + super().__init__() + self.predict1 = dspy.Predict("question -> query") + self.predict2 = dspy.Predict("query -> answer") + + def forward(self, question): + query = self.predict1(question=question).query + return self.predict2(query=query) + + +def test_forward(): + program = HopModule() + dspy.settings.configure(lm=DSPDummyLM({"What is 1+1?": "let me check", "let me check": "2"})) + result = program(question="What is 1+1?").answer + assert result == "2" diff --git a/tests/dsp_LM/retrieve/__init__.py b/tests/dsp_LM/retrieve/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dsp_LM/retrieve/test_llama_index_rm.py b/tests/dsp_LM/retrieve/test_llama_index_rm.py new file mode 100644 index 000000000..735c1a940 --- /dev/null +++ b/tests/dsp_LM/retrieve/test_llama_index_rm.py @@ -0,0 +1,61 @@ +import logging + +import pytest + +import dspy +from dsp.modules.dummy_lm import DSPDummyLM +from dspy.datasets import HotPotQA + +try: + from llama_index.core import Settings, VectorStoreIndex + from llama_index.core.base.base_retriever import BaseRetriever + from llama_index.core.embeddings.mock_embed_model 
import MockEmbedding + from llama_index.core.readers.string_iterable import StringIterableReader + + from dspy.retrieve.llama_index_rm import LlamaIndexRM + +except ImportError: + logging.info("Optional dependency llama-index is not installed - skipping LlamaIndexRM tests.") + + +@pytest.fixture() +def rag_setup() -> dict: + """Builds the necessary fixtures to test LI""" + pytest.importorskip("llamaindex") + dataset = HotPotQA(train_seed=1, train_size=8, eval_seed=2023, dev_size=4, test_size=0) + trainset = [x.with_inputs("question") for x in dataset.train] + devset = [x.with_inputs("question") for x in dataset.dev] + ragset = [f"Question: {x.question} Answer: {x.answer}" for x in dataset.train] + dummyset = {x.question: x.answer for x in dataset.train} + + Settings.embed_model = MockEmbedding(8) + docs = StringIterableReader().load_data(texts=ragset) + index = VectorStoreIndex.from_documents(documents=docs) + retriever = index.as_retriever() + rm = LlamaIndexRM(retriever) + + return { + "index": index, + "retriever": retriever, + "rm": rm, + "lm": DSPDummyLM(answers=dummyset), + "trainset": trainset, + "devset": devset, + } + + +def test_lirm_as_rm(rag_setup): + """Test the retriever as retriever method""" + pytest.importorskip("llamaindex") + retriever = rag_setup.get("retriever") + test_res_li = retriever.retrieve("At My Window was released by which American singer-songwriter?") + rm = rag_setup.get("rm") + test_res_dspy = rm.forward("At My Window was released by which American singer-songwriter?") + + assert isinstance(retriever, BaseRetriever), "Ensuring that the retriever is a LI Retriever object" + assert isinstance(test_res_li, list), "Ensuring results are a list from LI Retriever" + + assert isinstance(rm, dspy.Retrieve), "Ensuring the RM is a retriever object from dspy" + assert isinstance(test_res_dspy, list), "Ensuring the results are a list from the DSPy retriever" + + assert len(test_res_li) == len(test_res_dspy), "Rough equality check of the results" diff --git a/tests/dsp_LM/signatures/__init__.py b/tests/dsp_LM/signatures/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dsp_LM/signatures/test_signature.py b/tests/dsp_LM/signatures/test_signature.py new file mode 100644 index 000000000..fffa58ab2 --- /dev/null +++ b/tests/dsp_LM/signatures/test_signature.py @@ -0,0 +1,41 @@ +import textwrap +from typing import List + +import pydantic +import pytest + +import dspy +from dspy import InputField, OutputField, Signature, infer_prefix +from dspy.utils.dummies import DSPDummyLM + + +def test_multiline_instructions(): + class MySignature(Signature): + """First line + Second line + Third line""" + + output = OutputField() + + predictor = dspy.Predict(MySignature) + + lm = DSPDummyLM(["short answer"]) + dspy.settings.configure(lm=lm) + assert predictor().output == "short answer" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + First line + Second line + Third line + + --- + + Follow the following format. 
+ + Output: ${output} + + --- + + Output: short answer""" + ) diff --git a/tests/dsp_LM/teleprompt/__init__.py b/tests/dsp_LM/teleprompt/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dsp_LM/teleprompt/test_bootstrap.py b/tests/dsp_LM/teleprompt/test_bootstrap.py new file mode 100644 index 000000000..936daf8e4 --- /dev/null +++ b/tests/dsp_LM/teleprompt/test_bootstrap.py @@ -0,0 +1,156 @@ +import textwrap + +import pytest + +import dspy +from dspy import Example +from dspy.predict import Predict +from dspy.teleprompt import BootstrapFewShot +from dspy.utils.dummies import DSPDummyLM + + +# Define a simple metric function for testing +def simple_metric(example, prediction, trace=None): + # Simplified metric for testing: true if prediction matches expected output + return example.output == prediction.output + + +examples = [ + Example(input="What is the color of the sky?", output="blue").with_inputs("input"), + Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!"), +] +trainset = [examples[0]] +valset = [examples[1]] + + +class SimpleModule(dspy.Module): + def __init__(self, signature): + super().__init__() + self.predictor = Predict(signature) + + def forward(self, **kwargs): + return self.predictor(**kwargs) + + +def test_compile_with_predict_instances(): + # Create Predict instances for student and teacher + # Note that dspy.Predict is not itself a module, so we can't use it directly here + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") + + lm = DSPDummyLM(["Initial thoughts", "Finish[blue]"]) + dspy.settings.configure(lm=lm) + + # Initialize BootstrapFewShot and compile the student + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) + + assert compiled_student is not None, "Failed to compile student" + assert hasattr(compiled_student, "_compiled") and compiled_student._compiled, "Student compilation flag not set" + + +def test_bootstrap_effectiveness(): + # This test verifies whether the bootstrapping process improves the student's predictions + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") + lm = DSPDummyLM(["blue", "Ring-ding-ding-ding-dingeringeding!"], follow_examples=True) + dspy.settings.configure(lm=lm, trace=[]) + + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) + + # Check that the compiled student has the correct demos + assert len(compiled_student.predictor.demos) == 1 + assert compiled_student.predictor.demos[0].input == trainset[0].input + assert compiled_student.predictor.demos[0].output == trainset[0].output + + # Test the compiled student's prediction. + # We are using a DSPDummyLM with follow_examples=True, which means that + # even though it would normally reply with "Ring-ding-ding-ding-dingeringeding!" + # on the second output, if it sees an example that perfectly matches the + # prompt, it will use that instead. That is why we expect "blue" here. + prediction = compiled_student(input=trainset[0].input) + assert prediction.output == trainset[0].output + + # For debugging + print("Convo") + print(lm.get_convo(-1)) + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `input`, produce the fields `output`. + + --- + + Follow the following format.
+ + Input: ${input} + Output: ${output} + + --- + + Input: What is the color of the sky? + Output: blue + + --- + + Input: What is the color of the sky? + Output: blue""" + ) + + +def test_error_handling_during_bootstrap(): + """ + Test to verify error handling during the bootstrapping process + """ + + class BuggyModule(dspy.Module): + def __init__(self, signature): + super().__init__() + self.predictor = Predict(signature) + + def forward(self, **kwargs): + raise RuntimeError("Simulated error") + + student = SimpleModule("input -> output") + teacher = BuggyModule("input -> output") + + # Setup DSPDummyLM to simulate an error scenario + lm = DSPDummyLM( + [ + "Initial thoughts", # Simulate initial teacher's prediction + ] + ) + dspy.settings.configure(lm=lm) + + bootstrap = BootstrapFewShot( + metric=simple_metric, + max_bootstrapped_demos=1, + max_labeled_demos=1, + max_errors=1, + ) + + with pytest.raises(RuntimeError, match="Simulated error"): + bootstrap.compile(student, teacher=teacher, trainset=trainset) + + +def test_validation_set_usage(): + """ + Test to ensure the validation set is correctly used during bootstrapping + """ + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") + + lm = DSPDummyLM( + [ + "Initial thoughts", + "Finish[blue]", # Expected output for both training and validation + ] + ) + dspy.settings.configure(lm=lm) + + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) + + # Check that validation examples are part of student's demos after compilation + assert len(compiled_student.predictor.demos) >= len(valset), "Validation set not used in compiled student demos" diff --git a/tests/dsp_LM/teleprompt/test_copro_optimizer.py b/tests/dsp_LM/teleprompt/test_copro_optimizer.py new file mode 100644 index 000000000..50011eecc --- /dev/null +++ b/tests/dsp_LM/teleprompt/test_copro_optimizer.py @@ -0,0 +1,149 @@ +import textwrap + +import pytest + +import dspy +from dspy import Example +from dspy.teleprompt.signature_opt import COPRO +from dspy.utils.dummies import DSPDummyLM + + +# Define a simple metric function for testing +def simple_metric(example, prediction): + # Simplified metric for testing: true if prediction matches expected output + return example.output == prediction.output + + +# Example training and validation sets +trainset = [ + Example(input="Question: What is the color of the sky?", output="blue").with_inputs("input"), + Example(input="Question: What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs( + "input" + ), +] + + +def test_signature_optimizer_initialization(): + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + assert optimizer.metric == simple_metric, "Metric not correctly initialized" + assert optimizer.breadth == 2, "Breadth not correctly initialized" + assert optimizer.depth == 1, "Depth not correctly initialized" + assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" + + +class SimpleModule(dspy.Module): + def __init__(self, signature): + super().__init__() + # COPRO doesn't work with dspy.Predict + self.predictor = dspy.ChainOfThought(signature) + + def forward(self, **kwargs): + return self.predictor(**kwargs) + + +def test_signature_optimizer_optimization_process(): + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + 
dspy.settings.configure(lm=DSPDummyLM(["Optimized instruction 1", "Optimized instruction 2"])) + + student = SimpleModule("input -> output") + + # Assuming the compile method of COPRO requires a student module, a development set, and evaluation kwargs + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + + # Check that the optimized student has been modified from the original + # This check can be more specific based on how the optimization modifies the student + assert optimized_student is not student, "Optimization did not modify the student" + + # Further tests can be added to verify the specifics of the optimization process, + # such as checking the instructions of the optimized student's predictors. + + +def test_signature_optimizer_statistics_tracking(): + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + optimizer.track_stats = True # Enable statistics tracking + + dspy.settings.configure(lm=DSPDummyLM(["Optimized instruction"])) + student = SimpleModule("input -> output") + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + + # Verify that statistics have been tracked and attached to the optimized student + assert hasattr(optimized_student, "total_calls"), "Total calls statistic not tracked" + assert hasattr(optimized_student, "results_best"), "Best results statistics not tracked" + + +# Assuming the setup_signature_optimizer fixture and simple_metric function are defined as before + + +def test_optimization_and_output_verification(): + lm = DSPDummyLM( + [ + "Optimized Prompt", + "Optimized Prefix", + ] + ) + dspy.settings.configure(lm=lm) + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + + student = SimpleModule("input -> output") + + # Compile the student with the optimizer + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + + # Simulate calling the optimized student with a new input + test_input = "What is the capital of France?" + prediction = optimized_student(input=test_input) + + print(lm.get_convo(-1)) + + assert prediction.output == "No more responses" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Optimized Prompt + + --- + + Follow the following format. + + Input: ${input} + Reasoning: Let's think step by step in order to ${produce the output}. We ... + Optimized Prefix ${output} + + --- + + Input: What is the capital of France? 
+ Reasoning: Let's think step by step in order to No more responses + Optimized Prefix No more responses""" + ) + + +def test_statistics_tracking_during_optimization(): + dspy.settings.configure(lm=DSPDummyLM(["Optimized instruction for stats tracking"])) + + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + optimizer.track_stats = True # Enable statistics tracking + + student = SimpleModule("input -> output") + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + + # Verify that statistics have been tracked + assert hasattr(optimized_student, "total_calls"), "Optimizer did not track total metric calls" + assert optimized_student.total_calls > 0, "Optimizer reported no metric calls" + + # Check if the results_best and results_latest contain valid statistics + assert "results_best" in optimized_student.__dict__, "Optimizer did not track the best results" + assert "results_latest" in optimized_student.__dict__, "Optimizer did not track the latest results" + assert len(optimized_student.results_best) > 0, "Optimizer did not properly populate the best results statistics" + assert ( + len(optimized_student.results_latest) > 0 + ), "Optimizer did not properly populate the latest results statistics" + + # Additional detailed checks can be added here to verify the contents of the tracked statistics diff --git a/tests/dsp_LM/teleprompt/test_knn_fewshot.py b/tests/dsp_LM/teleprompt/test_knn_fewshot.py new file mode 100644 index 000000000..97c2dbbe3 --- /dev/null +++ b/tests/dsp_LM/teleprompt/test_knn_fewshot.py @@ -0,0 +1,65 @@ +import pytest + +import dsp +import dspy +from dspy.teleprompt.knn_fewshot import KNNFewShot +from dspy.utils.dummies import DSPDummyLM, DummyVectorizer + + +def mock_example(question: str, answer: str) -> dsp.Example: + """Creates a mock DSP example with specified question and answer.""" + return dspy.Example(question=question, answer=answer).with_inputs("question") + + +@pytest.fixture +def setup_knn_few_shot(): + """Sets up a KNNFewShot instance for testing.""" + trainset = [ + mock_example("What is the capital of France?", "Paris"), + mock_example("What is the largest ocean?", "Pacific"), + mock_example("What is 2+2?", "4"), + ] + dsp.SentenceTransformersVectorizer = DummyVectorizer + knn_few_shot = KNNFewShot(k=2, trainset=trainset) + return knn_few_shot + + +class SimpleModule(dspy.Module): + def __init__(self, signature): + super().__init__() + self.predictor = dspy.Predict(signature) + + def forward(self, *args, **kwargs): + return self.predictor(**kwargs) + + def reset_copy(self): + # Creates a new instance of SimpleModule with the same predictor + return SimpleModule(self.predictor.signature) + + +# TODO: Test not working yet +def _test_knn_few_shot_compile(setup_knn_few_shot): + """Tests the compile method of KNNFewShot with SimpleModule as student.""" + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") # Assuming teacher uses the same module type + + # Setup DSPDummyLM with a response for a query similar to one of the training examples + lm = DSPDummyLM(["Madrid", "10"]) + dspy.settings.configure(lm=lm) # Responses for the capital of Spain and the result of 5+5) + + knn_few_shot = setup_knn_few_shot + trainset = knn_few_shot.KNN.trainset + compiled_student = knn_few_shot.compile(student, teacher=teacher, trainset=trainset, valset=None) + + assert len(compiled_student.predictor.demos) == 1 + assert 
compiled_student.predictor.demos[0].input == trainset[0].input + assert compiled_student.predictor.demos[0].output == trainset[0].output + # Simulate a query that is similar to one of the training examples + output = compiled_student.forward(input="What is the capital of Spain?").output + + print("CONVO") + print(lm.get_convo(-1)) + + # Validate that the output corresponds to one of the expected DSPDummyLM responses + # This assumes the compiled_student's forward method will execute the predictor with the given query + assert output in ["Madrid", "10"], "The compiled student did not return the correct output based on the query" diff --git a/tests/teleprompt/test_mipro_optimizer.py b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py similarity index 88% rename from tests/teleprompt/test_mipro_optimizer.py rename to tests/dsp_LM/teleprompt/test_mipro_optimizer.py index 17e94a580..86d8c00d0 100644 --- a/tests/teleprompt/test_mipro_optimizer.py +++ b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py @@ -1,11 +1,13 @@ +import re import textwrap + import pytest -import re + import dspy from dsp.modules import LM -from dspy.teleprompt.signature_opt_bayesian import MIPRO -from dspy.utils.dummies import DummyLM from dspy import Example +from dspy.teleprompt.signature_opt_bayesian import MIPRO +from dspy.utils.dummies import DSPDummyLM # Define a simple metric function for testing @@ -13,6 +15,7 @@ def simple_metric(example, prediction, trace=None): # Simplified metric for testing: true if prediction matches expected output return example.output == prediction.output + # Some example data capitals = { "Germany": "Berlin", @@ -31,10 +34,11 @@ def simple_metric(example, prediction, trace=None): # Example training and validation sets trainset = [ Example(input="What is the color of the sky?", output="blue").with_inputs("input"), - Example( - input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!" - ).with_inputs("input"), -] + [Example(input=f"What is the capital of {country}?", output=capital).with_inputs("input") for country, capital in capitals.items()] + Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"), +] + [ + Example(input=f"What is the capital of {country}?", output=capital).with_inputs("input") + for country, capital in capitals.items() +] class ConditionalLM(LM): @@ -73,11 +77,11 @@ def basic_request(self, prompt, num_candidates=1, **kwargs): # For other questions, the model will answer with the last word of the question. 
else: answer = current_question.split()[-1] - + answer = "think deeply.\nOutput: " + answer - RED, GREEN, RESET = '\033[91m', '\033[92m', '\033[0m' - print("=== DummyLM ===") + RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" + print("=== DSPDummyLM ===") print(prompt, end="") print(f"{RED}{answer}{RESET}") print("===") @@ -108,20 +112,14 @@ def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs): def get_convo(self, index): """get the prompt + anwer from the ith message""" - return self.history[index]['prompt'] \ - + " " \ - + self.history[index]['response']['choices'][0]['text'] + return self.history[index]["prompt"] + " " + self.history[index]["response"]["choices"][0]["text"] def test_bayesian_signature_optimizer_initialization(): - optimizer = MIPRO( - metric=simple_metric, num_candidates=10, init_temperature=1.4, verbose=True, track_stats=True - ) + optimizer = MIPRO(metric=simple_metric, num_candidates=10, init_temperature=1.4, verbose=True, track_stats=True) assert optimizer.metric == simple_metric, "Metric not correctly initialized" assert optimizer.num_candidates == 10, "Incorrect 'num_candidates' parameter initialization" - assert ( - optimizer.init_temperature == 1.4 - ), "Initial temperature not correctly initialized" + assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" assert optimizer.verbose is True, "Verbose flag not correctly initialized" assert optimizer.track_stats is True, "Track stats flag not correctly initialized" @@ -165,9 +163,7 @@ def test_signature_optimizer_optimization_process(): def test_signature_optimizer_bad_lm(): - dspy.settings.configure( - lm=DummyLM([f"Optimized instruction {i}" for i in range(30)]) - ) + dspy.settings.configure(lm=DSPDummyLM([f"Optimized instruction {i}" for i in range(30)])) student = SimpleModule(signature="input -> output") optimizer = MIPRO( metric=simple_metric, @@ -177,7 +173,7 @@ def test_signature_optimizer_bad_lm(): track_stats=False, ) - # Krista: when the code tries to generate bootstrapped examples, the examples are generated using DummyLM, + # Krista: when the code tries to generate bootstrapped examples, the examples are generated using DSPDummyLM, # which only outputs "Optimized instruction i" this means that none of the bootstrapped examples are successful, # and therefore the set of examples that we're using to generate new prompts is empty with pytest.raises(ValueError): @@ -233,9 +229,9 @@ def test_optimization_and_output_verification(): Input: --- - + Follow the following format. - + Input: ${input} Reasoning: Let's think step by step in order to ${produce the output}. We ... 
Output: ${output} @@ -264,4 +260,4 @@ def test_optimization_and_output_verification(): Output: Madrid""" ) - assert lm.get_convo(-1) == expected_lm_output \ No newline at end of file + assert lm.get_convo(-1) == expected_lm_output diff --git a/tests/dsp_LM/teleprompt/test_random_search.py b/tests/dsp_LM/teleprompt/test_random_search.py new file mode 100644 index 000000000..9d8e63d23 --- /dev/null +++ b/tests/dsp_LM/teleprompt/test_random_search.py @@ -0,0 +1,39 @@ +import dspy +from dspy import Example +from dspy.predict import Predict +from dspy.teleprompt import BootstrapFewShotWithRandomSearch +from dspy.utils.dummies import DSPDummyLM + + +class SimpleModule(dspy.Module): + def __init__(self, signature): + super().__init__() + self.predictor = Predict(signature) + + def forward(self, **kwargs): + return self.predictor(**kwargs) + + +def simple_metric(example, prediction, trace=None): + return example.output == prediction.output + + +def test_basic_workflow(): + """Test to ensure the basic compile flow runs without errors.""" + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") + + lm = DSPDummyLM( + [ + "Initial thoughts", + "Finish[blue]", # Expected output for both training and validation + ] + ) + dspy.settings.configure(lm=lm) + + optimizer = BootstrapFewShotWithRandomSearch(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + trainset = [ + Example(input="What is the color of the sky?", output="blue").with_inputs("input"), + Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"), + ] + optimizer.compile(student, teacher=teacher, trainset=trainset) diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py index c8034f53a..a8552c7aa 100644 --- a/tests/evaluate/test_evaluate.py +++ b/tests/evaluate/test_evaluate.py @@ -34,7 +34,14 @@ def test_evaluate_initialization(): def test_evaluate_call(): - dspy.settings.configure(lm=DummyLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + dspy.settings.configure( + lm=DummyLM( + { + "What is 1+1?": {"answer": "2"}, + "What is 2+2?": {"answer": "4"}, + } + ) + ) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") assert program(question="What is 1+1?").answer == "2" @@ -48,7 +55,7 @@ def test_evaluate_call(): def test_multithread_evaluate_call(): - dspy.settings.configure(lm=DummyLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + dspy.settings.configure(lm=DummyLM({"What is 1+1?": {"answer": "2"}, "What is 2+2?": {"answer": "4"}})) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") assert program(question="What is 1+1?").answer == "2" @@ -65,13 +72,13 @@ def test_multithread_evaluate_call(): def test_multi_thread_evaluate_call_cancelled(monkeypatch): # slow LM that sleeps for 1 second before returning the answer class SlowLM(DummyLM): - def __call__(self, prompt, **kwargs): + def __call__(self, *args, **kwargs): import time time.sleep(1) - return super().__call__(prompt, **kwargs) + return super().__call__(*args, **kwargs) - dspy.settings.configure(lm=SlowLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + dspy.settings.configure(lm=SlowLM({"What is 1+1?": {"answer": "2"}, "What is 2+2?": {"answer": "4"}})) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") @@ -101,7 +108,7 @@ def sleep_then_interrupt(): def 
test_evaluate_call_bad(): - dspy.settings.configure(lm=DummyLM({"What is 1+1?": "0", "What is 2+2?": "0"})) + dspy.settings.configure(lm=DummyLM({"What is 1+1?": {"answer": "0"}, "What is 2+2?": {"answer": "0"}})) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") ev = Evaluate( diff --git a/tests/functional/test_functional.py b/tests/functional/test_functional.py index 9e84d8393..61ac8c15a 100644 --- a/tests/functional/test_functional.py +++ b/tests/functional/test_functional.py @@ -21,7 +21,7 @@ def hard_question(topic: str) -> str: """Think of a hard factual question about a topic.""" expected = "What is the speed of light?" - lm = DummyLM([expected]) + lm = DummyLM([{"hard_question": expected}]) dspy.settings.configure(lm=lm) question = hard_question(topic="Physics") @@ -36,7 +36,7 @@ def hard_questions(topics: List[str]) -> List[str]: pass expected = ["What is the speed of light?", "What is the speed of sound?"] - lm = DummyLM(['["What is the speed of light?", "What is the speed of sound?"]']) + lm = DummyLM([{"hard_questions": '["What is the speed of light?", "What is the speed of sound?"]'}]) dspy.settings.configure(lm=lm) question = hard_questions(topics=["Physics", "Music"]) @@ -54,7 +54,7 @@ def hard_question(topic: str) -> Question: """Think of a hard factual question about a topic.""" expected = "What is the speed of light?" - lm = DummyLM([f'{{"value": "{expected}"}}']) + lm = DummyLM([{"hard_question": f'{{"value": "{expected}"}}'}]) dspy.settings.configure(lm=lm) question = hard_question(topic="Physics") @@ -75,7 +75,7 @@ def answer(question: Question) -> Answer: pass question = Question(value="What is the speed of light?") - lm = DummyLM([f'{{"value": "3e8"}}']) + lm = DummyLM([{"answer": '{"value": "3e8"}'}]) dspy.settings.configure(lm=lm) result = answer(question=question) @@ -110,12 +110,10 @@ def forward(self, **kwargs): lm = DummyLM( [ - "What is the speed of light?", - "Some bad reasoning, 3e8 m/s.", - "3e8", # Bad answer 1 - "{...}", # Model is asked to create an example - "Some good reasoning...", - expected.model_dump_json(), # Good answer + {"hard_question": "What is the speed of light?"}, + {"reasoning": "Some bad reasoning, 3e8 m/s.", "answer": "3e8"}, # Bad answer 1 + {"json_object": "{...}"}, # Model is asked to create an example + {"reasoning": "Some good reasoning, 3e8 m/s.", "answer": f"{expected.model_dump_json()}"}, # Good answer ] ) dspy.settings.configure(lm=lm) @@ -145,7 +143,7 @@ class MySignature(dspy.Signature): expected = "What is the speed of light?" lm = DummyLM( [ - Question(value=expected).model_dump_json(), + {"output": f"{Question(value=expected).model_dump_json()}"}, ] ) dspy.settings.configure(lm=lm) @@ -229,8 +227,6 @@ def simple_metric(example, prediction, trace=None): bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) - lm.inspect_history(n=2) - # Check that the compiled student has the correct demos _, predict = next(compiled_student.named_sub_modules(Predict, skip_compiled=False)) demos = predict.demos @@ -246,28 +242,6 @@ def simple_metric(example, prediction, trace=None): prediction = compiled_student(input=trainset[0].input) assert prediction == trainset[0].output - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `input`, produce the fields `output`. - - --- - - Follow the following format. 
- - Input: ${input} - Output: ${output} - - --- - - Input: What is the color of the sky? - Output: blue - - --- - - Input: What is the color of the sky? - Output: blue""" - ) - def test_regex(): class TravelInformation(BaseModel): @@ -289,11 +263,11 @@ def flight_information(email: str) -> TravelInformation: lm = DummyLM( [ # Example with a bad origin code. - '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + {"flight_information": '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}'}, # Example to help the model understand - "{...}", + {"json_object": "{...}"}, # Fixed - '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + {"flight_information": '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}'}, ] ) dspy.settings.configure(lm=lm) @@ -349,11 +323,12 @@ def flight_information(email: str) -> TravelInformation: [ # Example with a bad origin code. ( - "Here is your json: " - "{" - '"origin": {"code":"JFK", "lat":40.6446, "lon":-73.7797}, ' - '"destination": {"code":"LAX", "lat":33.942791, "lon":-118.410042}, ' - '"date": "2022-12-25"}' + { + "flight_information": "Here is your json: {" + '"origin": {"code":"JFK", "lat":40.6446, "lon":-73.7797}, ' + '"destination": {"code":"LAX", "lat":33.942791, "lon":-118.410042}, ' + '"date": "2022-12-25"}' + } ), ] ) @@ -378,9 +353,8 @@ def flight_information(email: str) -> TravelInformation: lm = DummyLM( [ - "A list of bad inputs", - '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', - '{"origin": "JFK", "destination": "LAX", "date": "bad date"}', + {"flight_information": '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}'}, + {"flight_information": '{"origin": "JFK", "destination": "LAX", "date": "bad date"}'}, ] ) dspy.settings.configure(lm=lm) @@ -402,11 +376,11 @@ def flight_information(email: str) -> TravelInformation: lm = DummyLM( [ # First origin is wrong, then destination, then all is good - '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', - "{...}", # Example to help the model understand - '{"origin": "JFK", "destination": "LA0", "date": "2022-12-25"}', - "{...}", # Example to help the model understand - '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + {"flight_information": '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}'}, + {"json_object": "{...}"}, # Example to help the model understand + {"flight_information": '{"origin": "JFK", "destination": "LA0", "date": "2022-12-25"}'}, + {"json_object": "{...}"}, # Example to help the model understand + {"flight_information": '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}'}, ] ) dspy.settings.configure(lm=lm) @@ -414,32 +388,6 @@ def flight_information(email: str) -> TravelInformation: assert flight_information(email="Some email") == TravelInformation( origin="JFK", destination="LAX", date=datetime.date(2022, 12, 25) ) - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `email`, produce the fields `flight_information`. - - --- - - Follow the following format. - - Email: ${email} - - Past Error in Flight Information: An error to avoid in the future - - Past Error (2) in Flight Information: An error to avoid in the future - - Flight Information: ${flight_information}. Respond with a single JSON object. 
JSON Schema: {"properties": {"origin": {"pattern": "^[A-Z]{3}$", "title": "Origin", "type": "string"}, "destination": {"pattern": "^[A-Z]{3}$", "title": "Destination", "type": "string"}, "date": {"format": "date", "title": "Date", "type": "string"}}, "required": ["origin", "destination", "date"], "title": "TravelInformation", "type": "object"} - - --- - - Email: Some email - - Past Error in Flight Information: String should match pattern '^[A-Z]{3}$': origin (error type: string_pattern_mismatch) - - Past Error (2) in Flight Information: String should match pattern '^[A-Z]{3}$': destination (error type: string_pattern_mismatch) - - Flight Information: {"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}""" - ) def test_field_validator(): @@ -460,37 +408,12 @@ def get_user_details() -> UserDetails: # Keep making the mistake (lower case name) until we run # out of retries. - lm = DummyLM( - [ - '{"name": "lower case name", "age": 25}', - ] - * 10 - ) + lm = DummyLM([{"get_user_details": '{"name": "lower case name", "age": 25}'}] * 10) dspy.settings.configure(lm=lm) with pytest.raises(ValueError): get_user_details() - print(lm.get_convo(-1)) - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields , produce the fields `get_user_details`. - - --- - - Follow the following format. - - Past Error in Get User Details: An error to avoid in the future - Past Error (2) in Get User Details: An error to avoid in the future - Get User Details: ${get_user_details}. Respond with a single JSON object. JSON Schema: {"properties": {"name": {"title": "Name", "type": "string"}, "age": {"title": "Age", "type": "integer"}}, "required": ["name", "age"], "title": "UserDetails", "type": "object"} - - --- - - Past Error in Get User Details: Value error, Name must be in uppercase.: name (error type: value_error) - Past Error (2) in Get User Details: Value error, Name must be in uppercase.: name (error type: value_error) - Get User Details: {"name": "lower case name", "age": 25}""" - ) - def test_annotated_field(): @predictor @@ -498,7 +421,7 @@ def test(input: Annotated[str, Field(description="description")]) -> Annotated[f pass # First try 0, which fails, then try 0.5, which passes - lm = DummyLM(["0", "0.5"]) + lm = DummyLM([{"test": "0"}, {"test": "0.5"}]) dspy.settings.configure(lm=lm) output = test(input="input") @@ -507,7 +430,7 @@ def test(input: Annotated[str, Field(description="description")]) -> Annotated[f def test_multiple_outputs(): - lm = DummyLM([str(i) for i in range(100)]) + lm = DummyLM([{"output": f"{i}"} for i in range(100)]) dspy.settings.configure(lm=lm) test = TypedPredictor("input -> output") @@ -516,7 +439,7 @@ def test_multiple_outputs(): def test_multiple_outputs_int(): - lm = DummyLM([str(i) for i in range(100)]) + lm = DummyLM([{"output": f"{i}"} for i in range(100)]) dspy.settings.configure(lm=lm) class TestSignature(dspy.Signature): @@ -533,9 +456,9 @@ def test_multiple_outputs_int_cot(): # Note: Multiple outputs only work when the language model "speculatively" generates all the outputs in one go. 
lm = DummyLM( [ - "thoughts 0\nOutput: 0\n", - "thoughts 1\nOutput: 1\n", - "thoughts 2\nOutput: 2\n", + {"reasoning": "thoughts 0", "output": "0"}, + {"reasoning": "thoughts 1", "output": "1"}, + {"reasoning": "thoughts 2", "output": "2"}, ] ) dspy.settings.configure(lm=lm) @@ -547,7 +470,7 @@ def test_multiple_outputs_int_cot(): def test_parse_type_string(): - lm = DummyLM([str(i) for i in range(100)]) + lm = DummyLM([{"output": f"{i}"} for i in range(100)]) dspy.settings.configure(lm=lm) test = TypedPredictor("input:int -> output:int") @@ -557,7 +480,7 @@ def test_parse_type_string(): def test_literal(): - lm = DummyLM(['"2"', '"3"']) + lm = DummyLM([{"f": '"2"'}, {"f": '"3"'}]) dspy.settings.configure(lm=lm) @predictor @@ -568,7 +491,7 @@ def f() -> Literal["2", "3"]: def test_literal_missmatch(): - lm = DummyLM([f'"{i}"' for i in range(5, 100)]) + lm = DummyLM([{"f": f"{i}"} for i in range(5, 100)]) dspy.settings.configure(lm=lm) @predictor(max_retries=1) @@ -582,7 +505,7 @@ def f() -> Literal["2", "3"]: def test_literal_int(): - lm = DummyLM(["2", "3"]) + lm = DummyLM([{"f": "2"}, {"f": "3"}]) dspy.settings.configure(lm=lm) @predictor @@ -593,7 +516,7 @@ def f() -> Literal[2, 3]: def test_literal_int_missmatch(): - lm = DummyLM([f"{i}" for i in range(5, 100)]) + lm = DummyLM([{"f": f"{i}"} for i in range(5, 100)]) dspy.settings.configure(lm=lm) @predictor(max_retries=1) @@ -612,8 +535,8 @@ class SimpleOutput(dspy.Signature): lm = DummyLM( [ - "2.1", # Bad output - "0.5", # Good output + {"output": "2.1"}, # Bad output + {"output": "0.5"}, # Good output ] ) dspy.settings.configure(lm=lm) @@ -635,12 +558,12 @@ class ExampleSignature(dspy.Signature): lm = DummyLM( [ - '{"fact": "The sky is blue", "varacity": true}', - '{"fact": "The sky is green", "varacity": false}', - '{"fact": "The sky is red", "varacity": true}', - '{"fact": "The earth is flat", "varacity": false}', - '{"fact": "The earth is round", "varacity": true}', - '{"fact": "The earth is a cube", "varacity": false}', + {"fact": '{"fact": "The sky is blue", "varacity": true}'}, + {"fact": '{"fact": "The sky is green", "varacity": false}'}, + {"fact": '{"fact": "The sky is red", "varacity": true}'}, + {"fact": '{"fact": "The earth is flat", "varacity": false}'}, + {"fact": '{"fact": "The earth is round", "varacity": true}'}, + {"fact": '{"fact": "The earth is a cube", "varacity": false}'}, ] ) dspy.settings.configure(lm=lm) @@ -676,7 +599,7 @@ class ScoredSignature(dspy.Signature): program = TypedChainOfThought(ScoredSignature) - lm = DummyLM(["Thoughts", "Output"]) + lm = DummyLM([{"reasoning": "Thoughts", "proposed_signature": "Output"}]) dspy.settings.configure(lm=lm) output = program( @@ -691,25 +614,6 @@ class ScoredSignature(dspy.Signature): assert output == "Output" - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `attempted_signatures`, produce the fields `proposed_signature`. - - --- - - Follow the following format. - - Attempted Signatures: ${attempted_signatures} - Reasoning: Let's think step by step in order to ${produce the proposed_signature}. We ... 
- Proposed Signature: ${proposed_signature} - - --- - - Attempted Signatures: [{"string":"string 1","score":0.5},{"string":"string 2","score":0.4},{"string":"string 3","score":0.3}] - Reasoning: Let's think step by step in order to Thoughts - Proposed Signature: Output""" - ) - def test_custom_reasoning_field(): class Question(pydantic.BaseModel): @@ -727,7 +631,7 @@ class QuestionSignature(dspy.Signature): program = TypedChainOfThought(QuestionSignature, reasoning=reasoning) expected = "What is the speed of light?" - lm = DummyLM(["Thoughts", f'{{"value": "{expected}"}}']) + lm = DummyLM([{"reasoning": "Thoughts", "question": f'{{"value": "{expected}"}}'}]) dspy.settings.configure(lm=lm) output = program(topic="Physics") @@ -735,25 +639,6 @@ class QuestionSignature(dspy.Signature): assert isinstance(output.question, Question) assert output.question.value == expected - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `topic`, produce the fields `question`. - - --- - - Follow the following format. - - Topic: ${topic} - Custom Reasoning: Let's break this down. To generate a question about ${topic}, we should ... - Question: ${question}. Respond with a single JSON object. JSON Schema: {"properties": {"value": {"title": "Value", "type": "string"}}, "required": ["value"], "title": "Question", "type": "object"} - - --- - - Topic: Physics - Custom Reasoning: Let's break this down. To generate a question about Thoughts - Question: {"value": "What is the speed of light?"}""" - ) - def test_generic_signature(): T = TypeVar("T") @@ -766,7 +651,7 @@ class GenericSignature(dspy.Signature, Generic[T]): predictor = TypedPredictor(GenericSignature[int]) assert predictor.signature.instructions == "My signature" - lm = DummyLM(["23"]) + lm = DummyLM([{"output": "23"}]) dspy.settings.configure(lm=lm) assert predictor().output == 23 @@ -779,7 +664,7 @@ class ValidatedSignature(dspy.Signature): @pydantic.field_validator("a") @classmethod def space_in_a(cls, a: str) -> str: - if not " " in a: + if " " not in a: raise ValueError("a must contain a space") return a @@ -802,11 +687,17 @@ def check_square(n): def next_square(n: int) -> Annotated[int, AfterValidator(check_square)]: """What is the next square number after n?""" - lm = DummyLM(["3", "False", "4", "True"]) + lm = DummyLM( + [ + {"next_square": "3"}, + {"is_square": "False"}, + {"next_square": "4"}, + {"is_square": "True"}, + ] + ) dspy.settings.configure(lm=lm) m = next_square(n=2) - lm.inspect_history(n=2) assert m == 4 @@ -824,7 +715,7 @@ class MySignature(dspy.Signature): n: int = dspy.InputField() next_square: Annotated[int, AfterValidator(is_square)] = dspy.OutputField() - lm = DummyLM(["3", "4"]) + lm = DummyLM([{"next_square": "3"}, {"next_square": "4"}]) dspy.settings.configure(lm=lm) m = TypedPredictor(MySignature)(n=2).next_square @@ -843,7 +734,7 @@ def is_square(n: int) -> int: def next_square(n: int) -> Annotated[int, AfterValidator(is_square)]: """What is the next square number after n?""" - lm = DummyLM(["3", "4"]) + lm = DummyLM([{"next_square": "3"}, {"next_square": "4"}]) dspy.settings.configure(lm=lm) m = next_square(n=2) @@ -861,70 +752,34 @@ def test_demos(): trainset=[ex.with_inputs("input") for ex in demos], ) - lm = DummyLM(["Paris"]) + lm = DummyLM([{"output": "Paris"}]) dspy.settings.configure(lm=lm) assert program(input="What is the capital of France?").output == "Paris" - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `input`, produce the fields `output`. 
- - --- - - Follow the following format. - - Input: ${input} - Output: ${output} - --- - - Input: What is the speed of light? - Output: 3e8 - - --- - - Input: What is the capital of France? - Output: Paris""" - ) - - -def _test_demos_missing_input(): +def test_demos_missing_input_in_demo(): demos = [dspy.Example(input="What is the speed of light?", output="3e8")] program = LabeledFewShot(k=len(demos)).compile( student=dspy.TypedPredictor("input -> output, thoughts"), trainset=[ex.with_inputs("input") for ex in demos], ) - dspy.settings.configure(lm=DummyLM(["My thoughts", "Paris"])) + lm = DummyLM([{"thoughts": "My thoughts", "output": "Paris"}]) + dspy.settings.configure(lm=lm) assert program(input="What is the capital of France?").output == "Paris" - assert dspy.settings.lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `input`, produce the fields `output`. - - --- - - Follow the following format. - - Input: ${input} - Thoughts: ${thoughts} - Output: ${output} - - --- - - Input: What is the speed of light? - Output: 3e8 - - --- - - Input: What is the capital of France? - Thoughts: My thoughts - Output: Paris""" - ) - def test_conlist(): - dspy.settings.configure(lm=DummyLM(["[]", "[1]", "[1, 2]", "[1, 2, 3]"])) + dspy.settings.configure( + lm=DummyLM( + [ + {"make_numbers": "[]"}, + {"make_numbers": "[1]"}, + {"make_numbers": "[1, 2]"}, + {"make_numbers": "[1, 2, 3]"}, + ] + ) + ) @predictor def make_numbers(input: str) -> Annotated[list[int], Field(min_items=2)]: @@ -934,7 +789,16 @@ def make_numbers(input: str) -> Annotated[list[int], Field(min_items=2)]: def test_conlist2(): - dspy.settings.configure(lm=DummyLM(["[]", "[1]", "[1, 2]", "[1, 2, 3]"])) + dspy.settings.configure( + lm=DummyLM( + [ + {"output": "[]"}, + {"output": "[1]"}, + {"output": "[1, 2]"}, + {"output": "[1, 2, 3]"}, + ] + ) + ) make_numbers = TypedPredictor("input:str -> output:Annotated[List[int], Field(min_items=2)]") assert make_numbers(input="What are the first two numbers?").output == [1, 2] @@ -952,7 +816,7 @@ def check_cateogry(self): raise ValueError(f"category not in {self.allowed_categories}") return self - lm = DummyLM(["horse", "dog"]) + lm = DummyLM([{"category": "horse"}, {"category": "dog"}]) dspy.settings.configure(lm=lm) predictor = TypedPredictor(MySignature) diff --git a/tests/functional/test_signature_opt_typed.py b/tests/functional/test_signature_opt_typed.py index 59a610e6b..6778f6b69 100644 --- a/tests/functional/test_signature_opt_typed.py +++ b/tests/functional/test_signature_opt_typed.py @@ -1,17 +1,13 @@ -from typing import Generic, TypeVar +import json -import pydantic -import dspy -from dspy.evaluate import Evaluate -from dspy.functional import TypedPredictor -from dspy.teleprompt.signature_opt_typed import optimize_signature, make_info -from dspy.utils import DummyLM +from pydantic_core import to_jsonable_python +import dspy from dspy.evaluate import Evaluate from dspy.evaluate.metrics import answer_exact_match from dspy.functional import TypedPredictor -import json -from pydantic_core import to_jsonable_python +from dspy.teleprompt.signature_opt_typed import make_info, optimize_signature +from dspy.utils import DummyLM hotpotqa = [ ex.with_inputs("question") @@ -105,12 +101,14 @@ class BasicQA(dspy.Signature): question: str = dspy.InputField() answer: str = dspy.OutputField() - qa_model = DummyLM([]) + qa_model = DummyLM([{"answer": "foo"}] * 100) prompt_model = DummyLM( [ + { + "reasoning": "some thoughts", + "proposed_signatures": '[{"instructions": "I", "question_desc": 
"$q", "question_prefix": "Q:", "answer_desc": "$a", "answer_prefix": "A:"}]', + } # Seed prompts - "some thoughts", - '[{"instructions": "I", "question_desc": "$q", "question_prefix": "Q:", "answer_desc": "$a", "answer_prefix": "A:"}]', ] ) dspy.settings.configure(lm=qa_model) @@ -167,10 +165,8 @@ class ExpectedSignature2(dspy.Signature): qa_model = DummyLM([]) prompt_model = DummyLM( [ - "some thoughts", - json.dumps([to_jsonable_python(info1)]), - "some thoughts", - json.dumps([to_jsonable_python(info2)]), + {"reasoning": "some thoughts", "proposed_signatures": json.dumps([to_jsonable_python(info1)])}, + {"reasoning": "some thoughts", "proposed_signatures": json.dumps([to_jsonable_python(info2)])}, ] ) dspy.settings.configure(lm=qa_model) diff --git a/tests/predict/test_chain_of_thought.py b/tests/predict/test_chain_of_thought.py index c1d08e729..2451bef66 100644 --- a/tests/predict/test_chain_of_thought.py +++ b/tests/predict/test_chain_of_thought.py @@ -1,35 +1,16 @@ import textwrap + import dspy from dspy import ChainOfThought from dspy.utils import DummyLM def test_initialization_with_string_signature(): - lm = DummyLM(["find the number after 1", "2"]) + lm = DummyLM([{"reasoning": "find the number after 1", "answer": "2"}]) dspy.settings.configure(lm=lm) predict = ChainOfThought("question -> answer") assert list(predict.extended_signature.output_fields.keys()) == [ - "rationale", + "reasoning", "answer", ] assert predict(question="What is 1+1?").answer == "2" - - print(lm.get_convo(-1)) - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `question`, produce the fields `answer`. - - --- - - Follow the following format. - - Question: ${question} - Reasoning: Let's think step by step in order to ${produce the answer}. We ... - Answer: ${answer} - - --- - - Question: What is 1+1? 
- Reasoning: Let's think step by step in order to find the number after 1 - Answer: 2""" - ) diff --git a/tests/predict/test_chain_of_thought_with_hint.py b/tests/predict/test_chain_of_thought_with_hint.py index c28fb375f..77c4f9ac2 100644 --- a/tests/predict/test_chain_of_thought_with_hint.py +++ b/tests/predict/test_chain_of_thought_with_hint.py @@ -1,10 +1,12 @@ +import textwrap + import dspy from dspy import ChainOfThoughtWithHint from dspy.utils import DummyLM def test_cot_with_no_hint(): - lm = DummyLM(["find the number after 1", "2"]) + lm = DummyLM([{"rationale": "find the number after 1", "answer": "2"}]) dspy.settings.configure(lm=lm) predict = ChainOfThoughtWithHint("question -> answer") # Check output fields have the right order @@ -15,16 +17,9 @@ def test_cot_with_no_hint(): ] assert predict(question="What is 1+1?").answer == "2" - final_convo = lm.get_convo(-1) - assert final_convo.endswith( - "Question: What is 1+1?\n" - "Reasoning: Let's think step by step in order to find the number after 1\n" - "Answer: 2" - ) - def test_cot_with_hint(): - lm = DummyLM(["find the number after 1", "2"]) + lm = DummyLM([{"rationale": "find the number after 1", "hint": "Is it helicopter?", "answer": "2"}]) dspy.settings.configure(lm=lm) predict = ChainOfThoughtWithHint("question -> answer") assert list(predict.extended_signature2.output_fields.keys()) == [ @@ -33,11 +28,3 @@ def test_cot_with_hint(): "answer", ] assert predict(question="What is 1+1?", hint="think small").answer == "2" - - final_convo = lm.get_convo(-1) - assert final_convo.endswith( - "Question: What is 1+1?\n\n" - "Reasoning: Let's think step by step in order to find the number after 1\n\n" - "Hint: think small\n\n" - "Answer: 2" - ) diff --git a/tests/predict/test_multi_chain_comparison.py b/tests/predict/test_multi_chain_comparison.py index 8c936a2d8..8354f64d0 100644 --- a/tests/predict/test_multi_chain_comparison.py +++ b/tests/predict/test_multi_chain_comparison.py @@ -30,7 +30,7 @@ class BasicQA(dspy.Signature): # Call the MultiChainComparison on the completions question = "What is the color of the sky?" 
- lm = DummyLM(["my rationale", "blue"]) + lm = DummyLM([{"rationale": "my rationale", "answer": "blue"}]) dspy.settings.configure(lm=lm) final_pred = compare_answers(completions, question=question) diff --git a/tests/predict/test_predict.py b/tests/predict/test_predict.py index c701b380a..b04e087e8 100644 --- a/tests/predict/test_predict.py +++ b/tests/predict/test_predict.py @@ -1,11 +1,13 @@ -import dspy -from dspy import Predict, Signature, TypedPredictor -from dspy.utils.dummies import DummyLM import copy import textwrap + import pydantic import ujson +import dspy +from dspy import Predict, Signature, TypedPredictor +from dspy.utils.dummies import DummyLM + def test_initialization_with_string_signature(): signature_string = "input1, input2 -> output" @@ -39,20 +41,10 @@ def test_lm_after_dump_and_load_state(): def test_call_method(): predict_instance = Predict("input -> output") - lm = DummyLM(["test output"]) + lm = DummyLM([{"output": "test output"}]) dspy.settings.configure(lm=lm) result = predict_instance(input="test input") assert result.output == "test output" - assert lm.get_convo(-1) == ( - "Given the fields `input`, produce the fields `output`.\n" - "\n---\n\n" - "Follow the following format.\n\n" - "Input: ${input}\n" - "Output: ${output}\n" - "\n---\n\n" - "Input: test input\n" - "Output: test output" - ) def test_instructions_after_dump_and_load_state(): @@ -137,14 +129,14 @@ class Output(pydantic.BaseModel): def test_forward_method(): program = Predict("question -> answer") - dspy.settings.configure(lm=DummyLM([])) + dspy.settings.configure(lm=DummyLM([{"answer": "No more responses"}])) result = program(question="What is 1+1?").answer assert result == "No more responses" def test_forward_method2(): program = Predict("question -> answer1, answer2") - dspy.settings.configure(lm=DummyLM(["my first answer", "my second answer"])) + dspy.settings.configure(lm=DummyLM([{"answer1": "my first answer", "answer2": "my second answer"}])) result = program(question="What is 1+1?") assert result.answer1 == "my first answer" assert result.answer2 == "my second answer" @@ -159,7 +151,7 @@ def test_config_management(): def test_multi_output(): program = Predict("question -> answer", n=2) - dspy.settings.configure(lm=DummyLM(["my first answer", "my second answer"])) + dspy.settings.configure(lm=DummyLM([{"answer": "my first answer"}, {"answer": "my second answer"}])) results = program(question="What is 1+1?") assert results.completions.answer[0] == "my first answer" assert results.completions.answer[1] == "my second answer" @@ -170,8 +162,8 @@ def test_multi_output2(): dspy.settings.configure( lm=DummyLM( [ - "my 0 answer\nAnswer 2: my 2 answer", - "my 1 answer\nAnswer 2: my 3 answer", + {"answer1": "my 0 answer", "answer2": "my 2 answer"}, + {"answer1": "my 1 answer", "answer2": "my 3 answer"}, ], ) ) @@ -202,21 +194,6 @@ class OutputOnlySignature(dspy.Signature): predictor = Predict(OutputOnlySignature) - lm = DummyLM(["short answer"]) + lm = DummyLM([{"output": "short answer"}]) dspy.settings.configure(lm=lm) assert predictor().output == "short answer" - - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields , produce the fields `output`. - - --- - - Follow the following format. 
- - Output: ${output} - - --- - - Output: short answer""" - ) diff --git a/tests/predict/test_program_of_thought.py b/tests/predict/test_program_of_thought.py index 2aa153a1d..007c4e664 100644 --- a/tests/predict/test_program_of_thought.py +++ b/tests/predict/test_program_of_thought.py @@ -1,121 +1,38 @@ -from dspy import Signature, ProgramOfThought +import textwrap + import dspy +from dspy import ProgramOfThought, Signature from dspy.utils import DummyLM -import textwrap + class BasicQA(Signature): question = dspy.InputField() answer = dspy.OutputField(desc="often between 1 and 5 words") + def test_pot_code_generation(): - pot = ProgramOfThought(BasicQA) - lm = DummyLM([ - "Reason_A", - "```python\nresult = 1+1\n```", - "Reason_B", - "2", - ]) + lm = DummyLM( + [ + {"reasoning": "Reason_A", "generated_code": "```python\nresult = 1+1\n```"}, + {"reasoning": "Reason_B", "answer": "2"}, + ] + ) dspy.settings.configure(lm=lm) + pot = ProgramOfThought(BasicQA) res = pot(question="What is 1+1?") assert res.answer == "2" - assert lm.get_convo(index=-1) == textwrap.dedent("""\ - Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`. - - --- - - Follow the following format. - - Question: ${question} - - Code: python code that answers the question - - Code Output: output of previously-generated python code - - Reasoning: Let's think step by step in order to ${produce the answer}. We ... - Answer: often between 1 and 5 words - - --- - - Question: What is 1+1? - - Code: result = 1+1 - - Code Output: 2 - - Reasoning: Let's think step by step in order to Reason_B - - Answer: 2""") def test_pot_code_generation_with_error(): - pot = ProgramOfThought(BasicQA) - lm = DummyLM([ - "Reason_A", - "```python\nresult = 1+0/0\n```", - "Reason_B", # Error: division by zero - "```python\nresult = 1+1\n```", - "Reason_C", - "2", - ]) + lm = DummyLM( + [ + {"reasoning": "Reason_A", "generated_code": "```python\nresult = 1+0/0\n```"}, + {"reasoning": "Reason_B", "generated_code": "```python\nresult = 1+1\n```"}, + {"reasoning": "Reason_C", "answer": "2"}, + ] + ) dspy.settings.configure(lm=lm) + + pot = ProgramOfThought(BasicQA) res = pot(question="What is 1+1?") assert res.answer == "2" - - # The first code example failed - assert lm.get_convo(index=2) == textwrap.dedent("""\ - You are given `question`, `previous_code`, `error` due to an error in previous code. - Your task is to correct the error and provide the new `generated_code`. - - --- - - Follow the following format. - - Question: ${question} - - Previous Code: previously-generated python code that errored - - Error: error message from previously-generated python code - - Reasoning: Let's think step by step in order to ${produce the generated_code}. We ... - - Code: python code that answers the question - - --- - - Question: What is 1+1? - - Previous Code: result = 1+0/0 - - Error: division by zero - - Reasoning: Let's think step by step in order to Reason_B""") - - # The second code example succeeded - assert lm.get_convo(-1) == textwrap.dedent("""\ - Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`. - - --- - - Follow the following format. - - Question: ${question} - - Code: python code that answers the question - - Code Output: output of previously-generated python code - - Reasoning: Let's think step by step in order to ${produce the answer}. We ... - - Answer: often between 1 and 5 words - - --- - - Question: What is 1+1? 
- - Code: result = 1+1 - - Code Output: 2 - - Reasoning: Let's think step by step in order to Reason_C - - Answer: 2""") diff --git a/tests/predict/test_react.py b/tests/predict/test_react.py index 9b187471b..2573abcb8 100644 --- a/tests/predict/test_react.py +++ b/tests/predict/test_react.py @@ -8,8 +8,7 @@ def test_example_no_tools(): # Createa a simple dataset which the model will use with the Retrieve tool. lm = dspy.utils.DummyLM( [ - "Initial thoughts", # Thought_1 - "Finish[blue]", # Action_1 + {"Thought_1": "Initial thoughts", "Action_1": "Finish[blue]"}, ] ) dspy.settings.configure(lm=lm, rm=dummy_rm()) @@ -24,28 +23,13 @@ def test_example_no_tools(): result = program(question=question) assert result.answer == "blue" - # For debugging - print("---") - for row in lm.history: - print(row["prompt"]) - print("Response:", row["response"]["choices"][0]["text"]) - print("---") - - assert lm.get_convo(-1).endswith( - "Question: What is the color of the sky?\n" - "Thought 1: Initial thoughts\n" - "Action 1: Finish[blue]" - ) - def test_example_search(): # Createa a simple dataset which the model will use with the Retrieve tool. lm = dspy.utils.DummyLM( [ - "Initial thoughts", # Thought_1 - "Search[the color of the sky]", # Thought_1 - "More thoughts", # Thought_2 - "Finish[blue]", # Action_2 + {"Thought_1": "Initial thoughts", "Action_1": "Search[the color of the sky]"}, + {"Thought_2": "More thoughts", "Action_2": "Finish[blue]\n\n"}, ] ) rm = dummy_rm( @@ -72,21 +56,6 @@ def test_example_search(): result = program(question=question) assert result.answer == "blue" - # For debugging - print(lm.get_convo(-1)) - - assert lm.get_convo(-1).endswith( - "Question: What is the color of the sky?\n\n" - "Thought 1: Initial thoughts\n\n" - "Action 1: Search[the color of the sky]\n\n" - "Observation 1:\n" - "[1] «We all know the color of the sky is blue.»\n" - "[2] «Somethng about the sky colors»\n" - "[3] «This sentence is completely irellevant to answer the question.»\n\n" - "Thought 2: More thoughts\n\n" - "Action 2: Finish[blue]" - ) - class DummyTool1: name = "Tool1" @@ -122,12 +91,9 @@ def __call__(self, *args, **kwargs): def test_custom_tools(): lm = dspy.utils.DummyLM( [ - "Initial thoughts", - "Tool1[foo]", - "More thoughts", - "Tool2[bar]", - "Even more thoughts", - "Finish[baz]", + {"Thought_1": "Initial thoughts", "Action_1": "Tool1[foo]"}, + {"Thought_2": "More thoughts", "Action_2": "Tool2[bar]"}, + {"Thought_3": "Even more thoughts", "Action_3": "Finish[baz]"}, ] ) dspy.settings.configure(lm=lm) @@ -143,17 +109,6 @@ def test_custom_tools(): # each tool should be called only once assert tool1.num_calls == 1 assert tool2.num_calls == 1 - assert lm.get_convo(-1).endswith( - "Question: What is the color of the sky?\n\n" - "Thought 1: Initial thoughts\n\n" - "Action 1: Tool1[foo]\n\n" - "Observation 1: tool 1 output\n\n" - "Thought 2: More thoughts\n\n" - "Action 2: Tool2[bar]\n\n" - "Observation 2: tool 2 output\n\n" - "Thought 3: Even more thoughts\n\n" - "Action 3: Finish[baz]" - ) def test_signature_instructions(): diff --git a/tests/predict/test_retry.py b/tests/predict/test_retry.py index 5ceee8a62..687a18dbf 100644 --- a/tests/predict/test_retry.py +++ b/tests/predict/test_retry.py @@ -1,8 +1,10 @@ import functools + +import pydantic + import dspy -from dspy.utils import DummyLM from dspy.primitives.assertions import assert_transform_module, backtrack_handler -import pydantic +from dspy.utils import DummyLM def test_retry_simple(): @@ -14,7 +16,7 @@ def test_retry_simple(): assert 
f"past_{field}" in retry_module.new_signature.input_fields assert "feedback" in retry_module.new_signature.input_fields - lm = DummyLM(["blue"]) + lm = DummyLM([{"answer": "blue"}]) dspy.settings.configure(lm=lm) result = retry_module.forward( question="What color is the sky?", @@ -23,18 +25,10 @@ def test_retry_simple(): ) assert result.answer == "blue" - print(lm.get_convo(-1)) - assert lm.get_convo(-1).endswith( - "Question: What color is the sky?\n\n" - "Previous Answer: red\n\n" - "Instructions: Try harder\n\n" - "Answer: blue" - ) - def test_retry_forward_with_feedback(): # First we make a mistake, then we fix it - lm = DummyLM(["red", "blue"]) + lm = DummyLM([{"answer": "red"}, {"answer": "blue"}]) dspy.settings.configure(lm=lm, trace=[]) class SimpleModule(dspy.Module): @@ -58,18 +52,10 @@ def forward(self, **kwargs): assert result.answer == "blue" - print(lm.get_convo(-1)) - assert lm.get_convo(-1).endswith( - "Question: What color is the sky?\n\n" - "Previous Answer: red\n\n" - "Instructions: Please think harder\n\n" - "Answer: blue" - ) - def test_retry_forward_with_typed_predictor(): # First we make a mistake, then we fix it - lm = DummyLM(['{"answer":"red"}', '{"answer":"blue"}']) + lm = DummyLM([{"output": '{"answer":"red"}'}, {"output": '{"answer":"blue"}'}]) dspy.settings.configure(lm=lm, trace=[]) class AnswerQuestion(dspy.Signature): @@ -103,9 +89,3 @@ def forward(self, **kwargs): result = program(question="What color is the sky?") assert result.answer == "blue" - assert lm.get_convo(-1).endswith( - 'Input: {"question":"What color is the sky?"}\n\n' - 'Previous Output: {"answer":"red"}\n\n' - 'Instructions: Please think harder\n\n' - 'Output: {"answer":"blue"}' - ) diff --git a/tests/primitives/test_program.py b/tests/primitives/test_program.py index ecbbf06ec..ea6633682 100644 --- a/tests/primitives/test_program.py +++ b/tests/primitives/test_program.py @@ -1,8 +1,5 @@ import dspy -from dspy.primitives.program import ( - Module, - set_attribute_by_name, -) # Adjust the import based on your file structure +from dspy.primitives.program import Module, set_attribute_by_name # Adjust the import based on your file structure from dspy.utils import DummyLM @@ -39,7 +36,14 @@ def test_predictors(): def test_forward(): program = HopModule() - dspy.settings.configure(lm=DummyLM({"What is 1+1?": "let me check", "let me check": "2"})) + dspy.settings.configure( + lm=DummyLM( + { + "What is 1+1?": {"query": "let me check"}, + "let me check": {"answer": "2"}, + } + ) + ) result = program(question="What is 1+1?").answer assert result == "2" @@ -166,8 +170,9 @@ def __init__(self): self.p0 = dspy.Predict("question -> answer") self.p1 = self.p0 + def test_named_parameters_duplicate_references(): - module = DuplicateModule() - # Only testing for whether exceptions are thrown or not - # As Module.named_parameters() is recursive, this is mainly for catching infinite recursion - module.named_parameters() + module = DuplicateModule() + # Only testing for whether exceptions are thrown or not + # As Module.named_parameters() is recursive, this is mainly for catching infinite recursion + module.named_parameters() diff --git a/tests/retrieve/test_llama_index_rm.py b/tests/retrieve/test_llama_index_rm.py index 35711087e..f06f96388 100644 --- a/tests/retrieve/test_llama_index_rm.py +++ b/tests/retrieve/test_llama_index_rm.py @@ -3,8 +3,8 @@ import pytest import dspy -from dsp.modules.dummy_lm import DummyLM from dspy.datasets import HotPotQA +from dspy.utils.dummies import DummyLM try: from 
llama_index.core import Settings, VectorStoreIndex diff --git a/tests/signatures/test_signature.py b/tests/signatures/test_signature.py index c99d1bdd0..8fb9c5ff2 100644 --- a/tests/signatures/test_signature.py +++ b/tests/signatures/test_signature.py @@ -182,6 +182,9 @@ class SubSignature(Signature): def test_multiline_instructions(): + lm = DummyLM([{"output": "short answer"}]) + dspy.settings.configure(lm=lm) + class MySignature(Signature): """First line Second line @@ -190,28 +193,8 @@ class MySignature(Signature): output = OutputField() predictor = dspy.Predict(MySignature) - - lm = DummyLM(["short answer"]) - dspy.settings.configure(lm=lm) assert predictor().output == "short answer" - assert lm.get_convo(-1) == textwrap.dedent( - """\ - First line - Second line - Third line - - --- - - Follow the following format. - - Output: ${output} - - --- - - Output: short answer""" - ) - def test_replaced_by_replace_context_manager(): class SignatureOne(Signature): diff --git a/tests/teleprompt/test_bootstrap.py b/tests/teleprompt/test_bootstrap.py index ba2acd556..a112947a4 100644 --- a/tests/teleprompt/test_bootstrap.py +++ b/tests/teleprompt/test_bootstrap.py @@ -1,10 +1,12 @@ +import textwrap + import pytest + import dspy -from dspy.predict import Predict -from dspy.utils.dummies import DummyLM from dspy import Example +from dspy.predict import Predict from dspy.teleprompt import BootstrapFewShot -import textwrap +from dspy.utils.dummies import DummyLM # Define a simple metric function for testing @@ -15,9 +17,7 @@ def simple_metric(example, prediction, trace=None): examples = [ Example(input="What is the color of the sky?", output="blue").with_inputs("input"), - Example( - input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!" - ), + Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!"), ] trainset = [examples[0]] valset = [examples[1]] @@ -25,9 +25,7 @@ def simple_metric(example, prediction, trace=None): def test_bootstrap_initialization(): # Initialize BootstrapFewShot with a dummy metric and minimal setup - bootstrap = BootstrapFewShot( - metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1 - ) + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) assert bootstrap.metric == simple_metric, "Metric not correctly initialized" @@ -50,32 +48,22 @@ def test_compile_with_predict_instances(): dspy.settings.configure(lm=lm) # Initialize BootstrapFewShot and compile the student - bootstrap = BootstrapFewShot( - metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1 - ) - compiled_student = bootstrap.compile( - student, teacher=teacher, trainset=trainset - ) + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) assert compiled_student is not None, "Failed to compile student" - assert ( - hasattr(compiled_student, "_compiled") and compiled_student._compiled - ), "Student compilation flag not set" + assert hasattr(compiled_student, "_compiled") and compiled_student._compiled, "Student compilation flag not set" def test_bootstrap_effectiveness(): # This test verifies if the bootstrapping process improves the student's predictions student = SimpleModule("input -> output") teacher = SimpleModule("input -> output") - lm = DummyLM(["blue", "Ring-ding-ding-ding-dingeringeding!"], follow_examples=True) + lm = DummyLM([{"output": "blue"}, {"output": 
"Ring-ding-ding-ding-dingeringeding!"}], follow_examples=True) dspy.settings.configure(lm=lm, trace=[]) - bootstrap = BootstrapFewShot( - metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1 - ) - compiled_student = bootstrap.compile( - student, teacher=teacher, trainset=trainset - ) + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) # Check that the compiled student has the correct demos assert len(compiled_student.predictor.demos) == 1 @@ -90,32 +78,6 @@ def test_bootstrap_effectiveness(): prediction = compiled_student(input=trainset[0].input) assert prediction.output == trainset[0].output - # For debugging - print("Convo") - print(lm.get_convo(-1)) - - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `input`, produce the fields `output`. - - --- - - Follow the following format. - - Input: ${input} - Output: ${output} - - --- - - Input: What is the color of the sky? - Output: blue - - --- - - Input: What is the color of the sky? - Output: blue""" - ) - def test_error_handling_during_bootstrap(): """ @@ -136,7 +98,7 @@ def forward(self, **kwargs): # Setup DummyLM to simulate an error scenario lm = DummyLM( [ - "Initial thoughts", # Simulate initial teacher's prediction + {"output": "Initial thoughts"}, # Simulate initial teacher's prediction ] ) dspy.settings.configure(lm=lm) @@ -161,20 +123,14 @@ def test_validation_set_usage(): lm = DummyLM( [ - "Initial thoughts", - "Finish[blue]", # Expected output for both training and validation + {"output": "Initial thoughts"}, + {"output": "Finish[blue]"}, # Expected output for both training and validation ] ) dspy.settings.configure(lm=lm) - bootstrap = BootstrapFewShot( - metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1 - ) - compiled_student = bootstrap.compile( - student, teacher=teacher, trainset=trainset - ) + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) # Check that validation examples are part of student's demos after compilation - assert len(compiled_student.predictor.demos) >= len( - valset - ), "Validation set not used in compiled student demos" + assert len(compiled_student.predictor.demos) >= len(valset), "Validation set not used in compiled student demos" diff --git a/tests/teleprompt/test_copro_optimizer.py b/tests/teleprompt/test_copro_optimizer.py index c0fe712bc..c031d3c7a 100644 --- a/tests/teleprompt/test_copro_optimizer.py +++ b/tests/teleprompt/test_copro_optimizer.py @@ -1,20 +1,24 @@ -import textwrap import dspy +from dspy import Example from dspy.teleprompt.signature_opt import COPRO from dspy.utils.dummies import DummyLM -from dspy import Example + # Define a simple metric function for testing def simple_metric(example, prediction): # Simplified metric for testing: true if prediction matches expected output return example.output == prediction.output + # Example training and validation sets trainset = [ Example(input="Question: What is the color of the sky?", output="blue").with_inputs("input"), - Example(input="Question: What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"), + Example(input="Question: What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs( + "input" + ), ] + def test_signature_optimizer_initialization(): optimizer = 
COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) assert optimizer.metric == simple_metric, "Metric not correctly initialized" @@ -22,6 +26,7 @@ def test_signature_optimizer_initialization(): assert optimizer.depth == 1, "Depth not correctly initialized" assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" + class SimpleModule(dspy.Module): def __init__(self, signature): super().__init__() @@ -31,15 +36,27 @@ def __init__(self, signature): def forward(self, **kwargs): return self.predictor(**kwargs) + def test_signature_optimizer_optimization_process(): optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) - dspy.settings.configure(lm=DummyLM(["Optimized instruction 1", "Optimized instruction 2"])) - + dspy.settings.configure( + lm=DummyLM( + [ + { + "proposed_instruction": "Optimized instruction 1", + "proposed_prefix_for_output_field": "Optimized instruction 2", + }, + ] + ) + ) + student = SimpleModule("input -> output") - + # Assuming the compile method of COPRO requires a student module, a development set, and evaluation kwargs - optimized_student = optimizer.compile(student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) - + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + # Check that the optimized student has been modified from the original # This check can be more specific based on how the optimization modifies the student assert optimized_student is not student, "Optimization did not modify the student" @@ -47,75 +64,93 @@ def test_signature_optimizer_optimization_process(): # Further tests can be added to verify the specifics of the optimization process, # such as checking the instructions of the optimized student's predictors. 
+ def test_signature_optimizer_statistics_tracking(): optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) optimizer.track_stats = True # Enable statistics tracking - dspy.settings.configure(lm=DummyLM(["Optimized instruction"])) + dspy.settings.configure( + lm=DummyLM( + [ + { + "proposed_instruction": "Optimized instruction 1", + "proposed_prefix_for_output_field": "Optimized instruction 2", + }, + ] + ) + ) student = SimpleModule("input -> output") - optimized_student = optimizer.compile(student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) # Verify that statistics have been tracked and attached to the optimized student - assert hasattr(optimized_student, 'total_calls'), "Total calls statistic not tracked" - assert hasattr(optimized_student, 'results_best'), "Best results statistics not tracked" + assert hasattr(optimized_student, "total_calls"), "Total calls statistic not tracked" + assert hasattr(optimized_student, "results_best"), "Best results statistics not tracked" + # Assuming the setup_signature_optimizer fixture and simple_metric function are defined as before + def test_optimization_and_output_verification(): - lm = DummyLM([ - "Optimized Prompt", - "Optimized Prefix", - ]) + lm = DummyLM( + [ + {"proposed_instruction": "Optimized Prompt", "proposed_prefix_for_output_field": "Optimized Prefix"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + ] + ) dspy.settings.configure(lm=lm) optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) - + student = SimpleModule("input -> output") - + # Compile the student with the optimizer - optimized_student = optimizer.compile(student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) - + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + # Simulate calling the optimized student with a new input test_input = "What is the capital of France?" prediction = optimized_student(input=test_input) print(lm.get_convo(-1)) - - assert prediction.output == "No more responses" - - assert lm.get_convo(-1) == textwrap.dedent("""\ - Optimized Prompt - - --- - Follow the following format. + assert prediction.output == "Paris" - Input: ${input} - Reasoning: Let's think step by step in order to ${produce the output}. We ... - Optimized Prefix ${output} - - --- - - Input: What is the capital of France? 
- Reasoning: Let's think step by step in order to No more responses - Optimized Prefix No more responses""") def test_statistics_tracking_during_optimization(): - dspy.settings.configure(lm=DummyLM(["Optimized instruction for stats tracking"])) - + dspy.settings.configure( + lm=DummyLM( + [ + {"proposed_instruction": "Optimized Prompt", "proposed_prefix_for_output_field": "Optimized Prefix"}, + ] + ) + ) + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) optimizer.track_stats = True # Enable statistics tracking - + student = SimpleModule("input -> output") - optimized_student = optimizer.compile(student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) # Verify that statistics have been tracked - assert hasattr(optimized_student, 'total_calls'), "Optimizer did not track total metric calls" + assert hasattr(optimized_student, "total_calls"), "Optimizer did not track total metric calls" assert optimized_student.total_calls > 0, "Optimizer reported no metric calls" - + # Check if the results_best and results_latest contain valid statistics - assert 'results_best' in optimized_student.__dict__, "Optimizer did not track the best results" - assert 'results_latest' in optimized_student.__dict__, "Optimizer did not track the latest results" + assert "results_best" in optimized_student.__dict__, "Optimizer did not track the best results" + assert "results_latest" in optimized_student.__dict__, "Optimizer did not track the latest results" assert len(optimized_student.results_best) > 0, "Optimizer did not properly populate the best results statistics" - assert len(optimized_student.results_latest) > 0, "Optimizer did not properly populate the latest results statistics" + assert ( + len(optimized_student.results_latest) > 0 + ), "Optimizer did not properly populate the latest results statistics" # Additional detailed checks can be added here to verify the contents of the tracked statistics diff --git a/tests/teleprompt/test_random_search.py b/tests/teleprompt/test_random_search.py index e76178c80..7da4069aa 100644 --- a/tests/teleprompt/test_random_search.py +++ b/tests/teleprompt/test_random_search.py @@ -1,8 +1,9 @@ import dspy -from dspy.predict import Predict -from dspy.utils.dummies import DummyLM from dspy import Example +from dspy.predict import Predict from dspy.teleprompt import BootstrapFewShotWithRandomSearch +from dspy.utils.dummies import DummyLM + class SimpleModule(dspy.Module): def __init__(self, signature): @@ -12,9 +13,11 @@ def __init__(self, signature): def forward(self, **kwargs): return self.predictor(**kwargs) + def simple_metric(example, prediction, trace=None): return example.output == prediction.output + def test_basic_workflow(): """Test to ensure the basic compile flow runs without errors.""" student = SimpleModule("input -> output") @@ -34,4 +37,3 @@ def test_basic_workflow(): Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"), ] optimizer.compile(student, teacher=teacher, trainset=trainset) -
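# Usage sketch of the dict-based DummyLM response queue that the test updates above
# migrate to. This is a minimal illustration that mirrors the patterns exercised in the
# tests (dspy.Predict, dspy.ChainOfThought, dspy.utils.dummies.DummyLM); the signatures
# and canned values here are examples, not part of the patch itself.
import dspy
from dspy.utils.dummies import DummyLM

# Each dict in the list answers one LM call, keyed by the signature's output field names.
lm = DummyLM(
    [
        {"answer": "blue"},                                      # consumed by the Predict call
        {"reasoning": "find the number after 1", "answer": "2"}, # consumed by the ChainOfThought call
    ]
)
dspy.settings.configure(lm=lm)

predict = dspy.Predict("question -> answer")
assert predict(question="What is the color of the sky?").answer == "blue"

cot = dspy.ChainOfThought("question -> answer")
assert cot(question="What is 1+1?").answer == "2"

# A dict keyed by prompt text (as in tests/primitives/test_program.py) selects responses
# by lookup instead of consuming them in order, e.g.:
# DummyLM({"What is 1+1?": {"query": "let me check"}, "let me check": {"answer": "2"}})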