Iterate bedrock testing approach

This applies some ideas introduced in [1] to set up testing for Claude with AWS Bedrock. It replaces the existing Bedrock/Claude helper files with a single StubBedrock set of helpers, which contains a general helper to stub the Bedrock converse endpoint plus high- and low-level helpers for stubbing the Claude tool call responses. I've adapted the code where we test Bedrock to make use of these helpers. [1] https://github.com/alphagov/govuk-chat/pull/27/files#r1935884789
alphagov · Feb 5, 2025 · 6b5c3a7 · 6b5c3a7
1 parent f166e5f
commit 6b5c3a7
Show file tree

Hide file tree

Showing 8 changed files with 138 additions and 108 deletions.
diff --git a/spec/lib/answer_composition/pipeline/claude/structured_answer_composer_spec.rb b/spec/lib/answer_composition/pipeline/claude/structured_answer_composer_spec.rb
@@ -2,58 +2,50 @@
   describe ".call" do
     let(:question) { build :question }
     let(:context) { build(:answer_pipeline_context, question:) }
-    let(:response) do
-      stub_claude_structured_answer_reponse("VAT (Value Added Tax) is a tax applied to most goods and services in the UK.")
-    end
 
     it "uses Bedrock converse endpoint to assign the correct values to the context's answer" do
-      stub_bedrock_request(:converse, response)
-      described_class.call(context)
+      answer = "VAT (Value Added Tax) is a tax applied to most goods and services in the UK."
 
-      expect(context.answer.message.squish).to eq(
-        "VAT (Value Added Tax) is a tax applied to most goods and services in the UK.",
+      stub_bedrock_converse(
+        bedrock_claude_structured_answer_response(question.message, answer),
       )
+
+      described_class.call(context)
+
+      expect(context.answer.message).to eq(answer)
       expect(context.answer.status).to eq("answered")
+    end
 
-      expect(context.answer.llm_responses["structured_answer"]).to match(
-        a_hash_including(
-          output: {
-            message: {
-              role: "assistant",
-              content: [
-                {
-                  tool_use: {
-                    input: { "answer" => "VAT (Value Added Tax) is a tax applied to most goods and services in the UK." },
-                    tool_use_id: "tool_id",
-                    name: "tool_name",
-                  },
-                },
-              ],
-            },
-          },
-          stop_reason: "end_turn",
-          usage: {
-            input_tokens: 10,
-            output_tokens: 20,
-            total_tokens: 30,
-          },
-          metrics: {
-            latency_ms: 999,
-          },
-        ),
+    it "stores the LLM response" do
+      response = bedrock_claude_tool_response(
+        { "answer" => "answer", "confidence" => 0.9 },
+        tool_name: "answer_confidence",
       )
+
+      stub_bedrock_converse(response)
+
+      described_class.call(context)
+      expect(context.answer.llm_responses["structured_answer"]).to match(response)
     end
 
     it "assigns metrics to the answer" do
       allow(Clock).to receive(:monotonic_time).and_return(100.0, 101.5)
-      stub_bedrock_request(:converse, response)
+
+      stub_bedrock_converse(
+        bedrock_claude_tool_response(
+          { "answer" => "answer", "confidence" => 0.9 },
+          tool_name: "answer_confidence",
+          input_tokens: 15,
+          output_tokens: 25,
+        ),
+      )
 
       described_class.call(context)
 
       expect(context.answer.metrics["structured_answer"]).to eq({
         duration: 1.5,
-        llm_prompt_tokens: 10,
-        llm_completion_tokens: 20,
+        llm_prompt_tokens: 15,
+        llm_completion_tokens: 25,
       })
     end
   end

diff --git a/spec/lib/answer_composition/pipeline_runner_spec.rb b/spec/lib/answer_composition/pipeline_runner_spec.rb
@@ -120,17 +120,14 @@
       end
     end
 
-    context "when the step raises an Aws::BedrockRuntime::Errors::ValidationException" do
-      let(:error) do
-        Aws::BedrockRuntime::Errors::ValidationException.new(
-          Seahorse::Client::RequestContext.new,
-          "The model returned the following errors: Input is too long for requested model.",
-        )
+    context "when the step raises an Aws::Errors::ServiceError" do
+      let(:pipeline_step) do
+        client = stub_bedrock_converse("ServerError")
+        ->(_context) { client.converse(model_id: "just-generating-an-error") }
       end
-      let(:pipeline_step) { ->(_context) { raise error } }
 
       it "notifies sentry" do
-        expect(GovukError).to receive(:notify).with(error)
+        expect(GovukError).to receive(:notify).with(kind_of(Aws::Errors::ServiceError))
         described_class.call(question:, pipeline: [pipeline_step])
       end
 
@@ -143,7 +140,7 @@
             question:,
             status: "error_answer_service_error",
             message: Answer::CannedResponses::ANSWER_SERVICE_ERROR_RESPONSE,
-            error_message: "The model returned the following errors: Input is too long for requested model.",
+            error_message: "stubbed-response-error-message",
           )
       end
     end

diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -35,8 +35,7 @@
   config.include FactoryBot::Syntax::Methods
   config.include MailerExamples
   config.include StubOpenAIChat
-  config.include StubBedrockRequest
-  config.include StubClaudeConverse
+  config.include StubBedrock
   config.include PasswordlessRequestHelpers, type: :request
   config.include StubOpenAIEmbedding
   config.include SidekiqHelpers

diff --git a/spec/support/stub_bedrock.rb b/spec/support/stub_bedrock.rb
@@ -0,0 +1,76 @@
+module StubBedrock
+  # Usage scenarios
+  #
+  # ## Stub a specific answer for an expected question
+  #
+  # client = stub_bedrock_converse(
+  #   bedrock_claude_structured_answer_response("Expected question", "Expected answer")
+  # )
+  #
+  # ## Simulate a basic tool response
+  #
+  # client = stub_bedrock_converse(
+  #   bedrock_claude_tool_response({ "key" => "value" }, tool_name: "tool_name")
+  # )
+  #
+  # ## Simulate an error
+  # ```
+  # client = stub_bedrock_converse("ServerError")
+  # client.converse(model_id: "just-generating-an-error")
+  # => Aws::BedrockRuntime::Errors::ServerError: stubbed-response-error-message
+  # ```
+  #
+  # ---
+  #
+  # Stubs the Bedrock converse endpoint.
+  #
+  # Builds an Aws::BedrockRuntime::Client with `stub_responses: true`, makes
+  # `Aws::BedrockRuntime::Client.new` return that instance, and registers
+  # `responses` as the stubbed responses for the `:converse` operation.
+  #
+  # Each response is handed to the SDK's `stub_responses` unchanged; the usage
+  # scenarios above show a hash, a lambda and an error name string being used.
+  # NOTE(review): when several responses are given the SDK presumably serves
+  # them in order across successive calls - confirm against Aws::ClientStubs.
+  #
+  # Returns the stubbed client so a spec can call it directly.
+  def stub_bedrock_converse(*responses)
+    bedrock_client = Aws::BedrockRuntime::Client.new(stub_responses: true)
+    allow(Aws::BedrockRuntime::Client).to receive(:new).and_return(bedrock_client)
+    bedrock_client.stub_responses(:converse, responses)
+    bedrock_client
+  end
+
+  # Returns a lambda, suitable for passing to stub_bedrock_converse, that
+  # checks the question sent in the request before producing an answer.
+  #
+  # When called with the request context, the lambda reads the text of the
+  # first content item of the last message in `context.params`. If `question`
+  # is truthy and differs from that text it raises a RuntimeError; passing a
+  # nil `question` skips the check. Otherwise it returns a tool response for
+  # the "answer_confidence" tool carrying `answer` and a confidence of 0.9.
+  def bedrock_claude_structured_answer_response(question, answer)
+    lambda do |context|
+      given_question = context.params.dig(:messages, -1, :content, 0, :text)
+
+      if question && given_question != question
+        raise "Unexpected question received: \"#{given_question}\". Expected \"#{question}\"."
+      end
+
+      bedrock_claude_tool_response(
+        { "answer" => answer, "confidence" => 0.9 },
+        tool_name: "answer_confidence",
+      )
+    end
+  end
+
+  # Builds a converse response hash whose assistant message contains a single
+  # tool_use content item.
+  #
+  # tool_input    - value placed in the tool_use `input` field.
+  # tool_name     - value placed in the tool_use `name` field (required).
+  # tool_use_id   - tool_use identifier; defaults to a random hex string.
+  # input_tokens  - reported input token usage; defaults to 10.
+  # output_tokens - reported output token usage; defaults to 20.
+  #
+  # `total_tokens` is the sum of the two token counts; `stop_reason` is always
+  # "end_turn" and `latency_ms` is always 999.
+  def bedrock_claude_tool_response(tool_input,
+                                   tool_name:,
+                                   tool_use_id: SecureRandom.hex,
+                                   input_tokens: 10,
+                                   output_tokens: 20)
+    {
+      output: {
+        message: {
+          role: "assistant",
+          content: [
+            {
+              tool_use: {
+                input: tool_input,
+                tool_use_id:,
+                name: tool_name,
+              },
+            },
+          ],
+        },
+      },
+      stop_reason: "end_turn",
+      usage: {
+        input_tokens:,
+        output_tokens:,
+        total_tokens: input_tokens + output_tokens,
+      },
+      metrics: {
+        latency_ms: 999,
+      },
+    }
+  end
+end
diff --git a/spec/support/stub_bedrock_request.rb b/spec/support/stub_bedrock_request.rb
diff --git a/spec/support/stub_claude_converse.rb b/spec/support/stub_claude_converse.rb
diff --git a/spec/support/system_spec_helpers.rb b/spec/support/system_spec_helpers.rb
@@ -1,4 +1,10 @@
 module SystemSpecHelpers
+  def given_i_am_using_the_claude_structured_answer_strategy
+    allow(Rails.configuration)
+      .to receive(:answer_strategy)
+      .and_return("claude_structured_answer")
+  end
+
   def given_i_have_confirmed_i_understand_chat_risks
     visit onboarding_limitations_path
 

diff --git a/spec/system/conversation_with_claude_structured_answer_spec.rb b/spec/system/conversation_with_claude_structured_answer_spec.rb
@@ -1,7 +1,9 @@
 RSpec.describe "Conversation with Claude with a structured answer" do
   scenario do
-    given_i_am_a_signed_in_early_access_user
+    given_i_am_using_the_claude_structured_answer_strategy
+    and_i_am_a_signed_in_early_access_user
     and_i_have_confirmed_i_understand_chat_risks
+
     when_i_visit_the_conversation_page
     and_i_enter_a_question
     then_i_see_the_answer_is_pending
@@ -36,21 +38,9 @@ def then_i_see_the_answer_is_pending
 
   def when_the_first_answer_is_generated
     @first_answer = "Lots of tax."
-    stub_bedrock_request(
-      :converse,
-      stub_claude_structured_answer_reponse(@first_answer),
-    )
-
-    execute_queued_sidekiq_jobs
-  end
-
-  def when_the_second_answer_is_generated
-    @second_answer = "Even more tax."
-    stub_bedrock_request(
-      :converse,
-      stub_claude_structured_answer_reponse(@second_answer),
+    stub_bedrock_converse(
+      bedrock_claude_structured_answer_response(@first_question, @first_answer),
     )
-
     execute_queued_sidekiq_jobs
   end
 
@@ -62,21 +52,29 @@ def then_i_see_my_question_on_the_page
     expect(page).to have_content(@first_question)
   end
 
-  def then_i_see_my_second_question_on_the_page
-    expect(page).to have_content(@second_question)
-  end
-
   def and_i_can_see_the_first_answer
     expect(page).to have_content(@first_answer)
   end
 
-  def and_i_can_see_the_second_answer
-    expect(page).to have_content(@second_answer)
-  end
-
   def when_i_enter_a_second_question
     @second_question = "Are you sure?"
     fill_in "Message", with: @second_question
     click_on "Send"
   end
+
+  def when_the_second_answer_is_generated
+    @second_answer = "Even more tax."
+    stub_bedrock_converse(
+      bedrock_claude_structured_answer_response(@second_question, @second_answer),
+    )
+    execute_queued_sidekiq_jobs
+  end
+
+  def then_i_see_my_second_question_on_the_page
+    expect(page).to have_content(@second_question)
+  end
+
+  def and_i_can_see_the_second_answer
+    expect(page).to have_content(@second_answer)
+  end
 end