Skip to content

Commit

Permalink
Iterate bedrock testing approach
Browse files Browse the repository at this point in the history
This applies some ideas introduced in [1] to set up testing for Claude
with AWS Bedrock.

This replaces the existing Bedrock/Claude helper files with a single
StubBedrock set of helpers. It contains a general helper for stubbing
the Bedrock converse endpoint, plus high-level and low-level helpers
for stubbing the Claude tool call responses.

I've adapted the code where we test Bedrock to make use of these
helpers.

[1] https://github.com/alphagov/govuk-chat/pull/27/files#r1935884789
  • Loading branch information
kevindew committed Feb 5, 2025
1 parent f166e5f commit 6b5c3a7
Show file tree
Hide file tree
Showing 8 changed files with 138 additions and 108 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,58 +2,50 @@
describe ".call" do
let(:question) { build :question }
let(:context) { build(:answer_pipeline_context, question:) }
let(:response) do
stub_claude_structured_answer_reponse("VAT (Value Added Tax) is a tax applied to most goods and services in the UK.")
end

it "uses Bedrock converse endpoint to assign the correct values to the context's answer" do
stub_bedrock_request(:converse, response)
described_class.call(context)
answer = "VAT (Value Added Tax) is a tax applied to most goods and services in the UK."

expect(context.answer.message.squish).to eq(
"VAT (Value Added Tax) is a tax applied to most goods and services in the UK.",
stub_bedrock_converse(
bedrock_claude_structured_answer_response(question.message, answer),
)

described_class.call(context)

expect(context.answer.message).to eq(answer)
expect(context.answer.status).to eq("answered")
end

expect(context.answer.llm_responses["structured_answer"]).to match(
a_hash_including(
output: {
message: {
role: "assistant",
content: [
{
tool_use: {
input: { "answer" => "VAT (Value Added Tax) is a tax applied to most goods and services in the UK." },
tool_use_id: "tool_id",
name: "tool_name",
},
},
],
},
},
stop_reason: "end_turn",
usage: {
input_tokens: 10,
output_tokens: 20,
total_tokens: 30,
},
metrics: {
latency_ms: 999,
},
),
it "stores the LLM response" do
response = bedrock_claude_tool_response(
{ "answer" => "answer", "confidence" => 0.9 },
tool_name: "answer_confidence",
)

stub_bedrock_converse(response)

described_class.call(context)
expect(context.answer.llm_responses["structured_answer"]).to match(response)
end

it "assigns metrics to the answer" do
allow(Clock).to receive(:monotonic_time).and_return(100.0, 101.5)
stub_bedrock_request(:converse, response)

stub_bedrock_converse(
bedrock_claude_tool_response(
{ "answer" => "answer", "confidence" => 0.9 },
tool_name: "answer_confidence",
input_tokens: 15,
output_tokens: 25,
),
)

described_class.call(context)

expect(context.answer.metrics["structured_answer"]).to eq({
duration: 1.5,
llm_prompt_tokens: 10,
llm_completion_tokens: 20,
llm_prompt_tokens: 15,
llm_completion_tokens: 25,
})
end
end
Expand Down
15 changes: 6 additions & 9 deletions spec/lib/answer_composition/pipeline_runner_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -120,17 +120,14 @@
end
end

context "when the step raises an Aws::BedrockRuntime::Errors::ValidationException" do
let(:error) do
Aws::BedrockRuntime::Errors::ValidationException.new(
Seahorse::Client::RequestContext.new,
"The model returned the following errors: Input is too long for requested model.",
)
context "when the step raises an Aws::Errors::ServiceError" do
let(:pipeline_step) do
client = stub_bedrock_converse("ServerError")
->(_context) { client.converse(model_id: "just-generating-an-error") }
end
let(:pipeline_step) { ->(_context) { raise error } }

it "notifies sentry" do
expect(GovukError).to receive(:notify).with(error)
expect(GovukError).to receive(:notify).with(kind_of(Aws::Errors::ServiceError))
described_class.call(question:, pipeline: [pipeline_step])
end

Expand All @@ -143,7 +140,7 @@
question:,
status: "error_answer_service_error",
message: Answer::CannedResponses::ANSWER_SERVICE_ERROR_RESPONSE,
error_message: "The model returned the following errors: Input is too long for requested model.",
error_message: "stubbed-response-error-message",
)
end
end
Expand Down
3 changes: 1 addition & 2 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@
config.include FactoryBot::Syntax::Methods
config.include MailerExamples
config.include StubOpenAIChat
config.include StubBedrockRequest
config.include StubClaudeConverse
config.include StubBedrock
config.include PasswordlessRequestHelpers, type: :request
config.include StubOpenAIEmbedding
config.include SidekiqHelpers
Expand Down
76 changes: 76 additions & 0 deletions spec/support/stub_bedrock.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
module StubBedrock
  # Spec helpers for stubbing the AWS Bedrock converse endpoint and for
  # building Claude tool call responses to feed into that stub.
  #
  # ## Answer an expected question with a specific answer
  #
  #   client = stub_bedrock_converse(
  #     bedrock_claude_structured_answer_response("Expected question", "Expected answer"),
  #   )
  #
  # ## Simulate a basic tool response
  #
  #   client = stub_bedrock_converse(
  #     bedrock_claude_tool_response({ "key" => "value" }, tool_name: "tool_name"),
  #   )
  #
  # ## Simulate an error
  #
  #   client = stub_bedrock_converse("ServerError")
  #   client.converse(model_id: "just-generating-an-error")
  #   # => Aws::BedrockRuntime::Errors::ServerError: stubbed-response-error-message

  # Builds a Bedrock client with stubbed responses, makes
  # Aws::BedrockRuntime::Client.new return it, and registers the given
  # responses for the :converse operation.
  #
  # @param responses [Array] stubs handed to the client's stub_responses; the
  #   helpers in this module pass response hashes, lambdas that receive the
  #   request context, or an error name such as "ServerError"
  # @return [Aws::BedrockRuntime::Client] the stubbed client
  def stub_bedrock_converse(*responses)
    Aws::BedrockRuntime::Client.new(stub_responses: true).tap do |stubbed_client|
      allow(Aws::BedrockRuntime::Client).to receive(:new).and_return(stubbed_client)
      stubbed_client.stub_responses(:converse, responses)
    end
  end

  # Builds a lambda stub that checks the text of the last message in the
  # request against +question+ and replies with an "answer_confidence" tool
  # call containing +answer+ and a confidence of 0.9.
  #
  # @param question [String, nil] expected question text; the check is skipped
  #   when this is nil or false
  # @param answer [String] answer to place in the tool call input
  # @return [Proc] lambda taking the request context
  # @raise [RuntimeError] (when the lambda is called) if the received question
  #   differs from +question+
  def bedrock_claude_structured_answer_response(question, answer)
    lambda do |request_context|
      received_question = request_context.params.dig(:messages, -1, :content, 0, :text)
      question_acceptable = !question || received_question == question

      unless question_acceptable
        raise "Unexpected question received: \"#{received_question}\". Expected \"#{question}\"."
      end

      tool_input = { "answer" => answer, "confidence" => 0.9 }
      bedrock_claude_tool_response(tool_input, tool_name: "answer_confidence")
    end
  end

  # Builds a converse response hash in which the assistant makes a single
  # tool call.
  #
  # @param tool_input [Hash] input payload of the tool call
  # @param tool_name [String] name of the tool being called
  # @param tool_use_id [String] id of the tool call; random hex by default
  # @param input_tokens [Integer] reported input token usage
  # @param output_tokens [Integer] reported output token usage
  # @return [Hash] response with output, stop_reason, usage and metrics keys;
  #   total_tokens is the sum of input_tokens and output_tokens
  def bedrock_claude_tool_response(tool_input,
                                   tool_name:,
                                   tool_use_id: SecureRandom.hex,
                                   input_tokens: 10,
                                   output_tokens: 20)
    tool_use = { input: tool_input, tool_use_id:, name: tool_name }
    assistant_message = { role: "assistant", content: [{ tool_use: }] }
    usage = {
      input_tokens:,
      output_tokens:,
      total_tokens: input_tokens + output_tokens,
    }

    {
      output: { message: assistant_message },
      stop_reason: "end_turn",
      usage:,
      metrics: { latency_ms: 999 },
    }
  end
end
7 changes: 0 additions & 7 deletions spec/support/stub_bedrock_request.rb

This file was deleted.

31 changes: 0 additions & 31 deletions spec/support/stub_claude_converse.rb

This file was deleted.

6 changes: 6 additions & 0 deletions spec/support/system_spec_helpers.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
module SystemSpecHelpers
# Stubs Rails.configuration.answer_strategy to return
# "claude_structured_answer".
def given_i_am_using_the_claude_structured_answer_strategy
  rails_configuration = Rails.configuration
  allow(rails_configuration).to receive(:answer_strategy).and_return("claude_structured_answer")
end

def given_i_have_confirmed_i_understand_chat_risks
visit onboarding_limitations_path

Expand Down
44 changes: 21 additions & 23 deletions spec/system/conversation_with_claude_structured_answer_spec.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
RSpec.describe "Conversation with Claude with a structured answer" do
scenario do
given_i_am_a_signed_in_early_access_user
given_i_am_using_the_claude_structured_answer_strategy
and_i_am_a_signed_in_early_access_user
and_i_have_confirmed_i_understand_chat_risks

when_i_visit_the_conversation_page
and_i_enter_a_question
then_i_see_the_answer_is_pending
Expand Down Expand Up @@ -36,21 +38,9 @@ def then_i_see_the_answer_is_pending

def when_the_first_answer_is_generated
@first_answer = "Lots of tax."
stub_bedrock_request(
:converse,
stub_claude_structured_answer_reponse(@first_answer),
)

execute_queued_sidekiq_jobs
end

def when_the_second_answer_is_generated
@second_answer = "Even more tax."
stub_bedrock_request(
:converse,
stub_claude_structured_answer_reponse(@second_answer),
stub_bedrock_converse(
bedrock_claude_structured_answer_response(@first_question, @first_answer),
)

execute_queued_sidekiq_jobs
end

Expand All @@ -62,21 +52,29 @@ def then_i_see_my_question_on_the_page
expect(page).to have_content(@first_question)
end

def then_i_see_my_second_question_on_the_page
expect(page).to have_content(@second_question)
end

def and_i_can_see_the_first_answer
expect(page).to have_content(@first_answer)
end

def and_i_can_see_the_second_answer
expect(page).to have_content(@second_answer)
end

# Records the follow-up question, fills the "Message" field with it and
# clicks "Send".
def when_i_enter_a_second_question
  @second_question = "Are you sure?"
  fill_in("Message", with: @second_question)
  click_on("Send")
end

# Stubs the Bedrock converse endpoint so that the second question receives
# the second answer, then calls execute_queued_sidekiq_jobs.
def when_the_second_answer_is_generated
  @second_answer = "Even more tax."
  second_response = bedrock_claude_structured_answer_response(@second_question, @second_answer)
  stub_bedrock_converse(second_response)
  execute_queued_sidekiq_jobs
end

# Expects the page to contain the text of the second question.
def then_i_see_my_second_question_on_the_page
expect(page).to have_content(@second_question)
end

# Expects the page to contain the text of the second stubbed answer.
def and_i_can_see_the_second_answer
expect(page).to have_content(@second_answer)
end
end

0 comments on commit 6b5c3a7

Please sign in to comment.