Skip to content

Commit

Permalink
Remove PR trigger from workflow,
Browse files Browse the repository at this point in the history
Add more logging and a little cleanup in sglang_frontend_test
  • Loading branch information
stbaione authored and renxida committed Nov 19, 2024
1 parent 5cac718 commit 627af2d
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 42 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/ci-sglang-integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ name: SGLang Llama Integration Tests

on:
workflow_dispatch:
# TODO: Remove after validating action
pull_request:
schedule:
# Run periodically, every 4 hours. This is run periodically with the
# intent of catching regressions early, and allowing for those
Expand Down
92 changes: 52 additions & 40 deletions app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,46 +75,49 @@ def tip_suggestion(s):
def test_multi_turn_qa(load_comparison_model, start_server, register_shortfin_backend):
model = load_comparison_model

logger.debug("Starting Multi-Turn Question Test...")
question_1 = "Name the capital city of the USA."
question_2 = "The Smithsonian is in this location."

answer_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
answer_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"

logger.info("Testing multi-turn Q&A run...")
state = multi_turn_question.run(
question_1="Name the capital city of the USA.",
question_2="The Smithsonian is in this location.",
question_1=question_1,
question_2=question_2,
)

logger.debug("Obtaining messages...")
messages = state.messages()
logger.debug("Messages Obtained...")
logger.info("Received messages from multi-turn call.")

logger.debug("Checking first Q&A turn...")
assert messages[0] == {
"role": "user",
"content": "Name the capital city of the USA.",
"content": question_1,
}
assert messages[1]["role"] == "assistant"

ideal_answer = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
logger.info("Computing similarity between first question and first answer...")
first_q_answer = messages[1]["content"]
logger.debug("Computing Similarity Score...")

score = compute_similarity(model, ideal_answer, first_q_answer)
score = compute_similarity(model, answer_1, first_q_answer)
if not score > ACCEPTED_THRESHOLD:
raise AccuracyValidationException(
f"Accuracy error between {expected_answer} and {first_q_answer}:\n SCORE: {score}"
f"Accuracy error between {answer_1} and {first_q_answer}:\n SCORE: {score}"
)
logger.info("Similarity passed")

assert messages[2] == {
"role": "user",
"content": "The Smithsonian is in this location.",
"content": question_2,
}
assert messages[3]["role"] == "assistant"

expected_answer = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
logger.info("Testing similarity between second question and second answer...")
second_q_answer = messages[3]["content"]
score = compute_similarity(model, expected_answer, second_q_answer)
score = compute_similarity(model, answer_2, second_q_answer)
if not score > ACCEPTED_THRESHOLD:
raise AccuracyValidationException(
f"Accuracy error between {expected_answer} and {second_q_answer}:\n SCORE: {score}"
f"Accuracy error between {answer_2} and {second_q_answer}:\n SCORE: {score}"
)
logger.info("Similarity passed.")


@pytest.mark.parametrize(
Expand Down Expand Up @@ -148,27 +151,26 @@ def clean_message(message: str):
expected_answer_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
expected_answer_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"

logger.debug("Starting Multi-Turn Question Test...")
logger.info("Testing multi-turn Q&A run w/ stream...")
state = multi_turn_question.run(
question_1=question_1,
question_2=question_2,
stream=True,
)

logger.debug("Obtaining messages...")
messages = ""
for chunk in state.text_iter():
messages += chunk
logger.debug("Messages Obtained...")
logger.info("Received messages from multi-turn call.")

logger.info("Computing similarity between expectation and result")
expected_result = f"user: {question_1}\nassistant: {expected_answer_1}\nuser: {question_2}\nassistant: {expected_answer_2}"
cleaned_messages = clean_message(messages)
score = compute_similarity(model, cleaned_messages, expected_result)
if not score > ACCEPTED_THRESHOLD:
raise AccuracyValidationException(
f"Accuracy error between {expected_result} and {messages}:\n SCORE: {score}"
)

logger.debug(f"Stream Messages: {messages}")
logger.info("Similarity passed.")


@pytest.mark.parametrize(
Expand Down Expand Up @@ -196,6 +198,7 @@ def test_batch_multi_turn_qa(
expected_answer_2_1 = "The largest city in the USA is New York City, with a population of over 8.4 million people, according to the United States Census Bureau (2020 estimates).assistant\n\nHowever, I should note that the largest city in the"
expected_answer_2_2 = "That's correct, the iconic Empire State Building is located in Midtown Manhattan, New York City. It's one of the most recognizable landmarks in the world and a symbol of the city's grandeur and history.assistant\n\nAnd, by"

logger.info("Testing batch multi-turn Q&A run...")
states = multi_turn_question.run_batch(
[
{
Expand All @@ -212,62 +215,66 @@ def test_batch_multi_turn_qa(
first_qa = states[0]
second_qa = states[1]

first_messages = first_qa.messages()
second_messages = second_qa.messages()
assert first_messages[0] == {
first_qa_messages = first_qa.messages()
second_qa_messages = second_qa.messages()

logger.info("Testing first batch of messages...")
assert first_qa_messages[0] == {
"role": "user",
"content": question_1_1,
}

assert first_messages[1]["role"] == "assistant"
first_answer = first_messages[1]["content"]
assert first_qa_messages[1]["role"] == "assistant"
first_answer = first_qa_messages[1]["content"]
expected_answer = expected_answer_1_1
score = compute_similarity(model, expected_answer, first_answer.lower())
score = compute_similarity(model, expected_answer, first_answer)
if not score > ACCEPTED_THRESHOLD:
raise AccuracyValidationException(
f"Accuracy error between {expected_answer} and {first_answer}:\n SCORE: {score}"
)

assert first_messages[2] == {
assert first_qa_messages[2] == {
"role": "user",
"content": question_1_2,
}
first_messages[3]["role"] = "assistant"
second_answer = first_messages[3]["content"]
first_qa_messages[3]["role"] = "assistant"
second_answer = first_qa_messages[3]["content"]
expected_answer = expected_answer_1_2
score = compute_similarity(model, expected_answer, second_answer)
if not score > ACCEPTED_THRESHOLD:
raise AccuracyValidationException(
f"Accuracy error between {expected_answer} and {second_answer}:\n SCORE: {score}"
)
logger.info("First batch passed.")

second_messages = second_qa.messages()
assert second_messages[0] == {
logger.info("Testing second batch of messages...")
assert second_qa_messages[0] == {
"role": "user",
"content": question_2_1,
}

assert second_messages[1]["role"] == "assistant"
first_answer = second_messages[1]["content"]
assert second_qa_messages[1]["role"] == "assistant"
first_answer = second_qa_messages[1]["content"]
expected_answer = expected_answer_2_1
score = compute_similarity(model, expected_answer, first_answer.lower())
score = compute_similarity(model, expected_answer, first_answer)
if not score > ACCEPTED_THRESHOLD:
raise AccuracyValidationException(
f"Accuracy error between {expected_answer} and {first_answer}:\n SCORE: {score}"
)

assert second_messages[2] == {
assert second_qa_messages[2] == {
"role": "user",
"content": question_2_2,
}
second_messages[3]["role"] = "assistant"
second_answer = second_messages[3]["content"]
second_qa_messages[3]["role"] = "assistant"
second_answer = second_qa_messages[3]["content"]
expected_answer = expected_answer_2_2
score = compute_similarity(model, expected_answer, second_answer)
if not score > ACCEPTED_THRESHOLD:
raise AccuracyValidationException(
f"Accuracy error between {expected_answer} and {second_answer}:\n SCORE: {score}"
)
logger.info("Second batch passed.")


@pytest.mark.parametrize(
Expand All @@ -283,8 +290,12 @@ def test_batch_multi_turn_qa(
def test_fork(load_comparison_model, start_server, register_shortfin_backend):
model = load_comparison_model

logger.info("Testing fork...")
state = tip_suggestion.run()
result = state.text()
logger.info("Fork response received.")

logger.info("Computing similarity...")
expected_answer = """Here are two tips for staying healthy: 1. Balanced Diet. 2. Regular Exercise.
Tip 1:A balanced diet is essential for maintaining good health. It involves consuming a variety of foods from different food groups, including fruits, vegetables, whole grains, lean proteins, and healthy fats. A balanced diet provides the body with the necessary nutrients, vitamins, and
Tip 2:Regular exercise is essential for maintaining a healthy body. It helps to improve cardiovascular health, increase strength and flexibility, and boost the immune system. Regular physical activity can also reduce the risk of chronic diseases such as heart disease, diabetes, and certain types of cancer
Expand All @@ -295,3 +306,4 @@ def test_fork(load_comparison_model, start_server, register_shortfin_backend):
raise AccuracyValidationException(
f"Accuracy error between {expected_answer} and {result}:\n SCORE: {score}"
)
logger.info("Similarity passed.")

0 comments on commit 627af2d

Please sign in to comment.