diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml
index ee68ba322..1c382617d 100644
--- a/.github/workflows/ci-sglang-integration-tests.yml
+++ b/.github/workflows/ci-sglang-integration-tests.yml
@@ -8,8 +8,6 @@ name: SGLang Llama Integration Tests
 on:
   workflow_dispatch:
-  # TODO: Remove after validating action
-  pull_request:
   schedule:
     # Run periodically, every 4 hours. This is ran periodically with the
     # intent of catching regressions early, and allowing for those
diff --git a/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py b/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
index 07e7828f0..efab14ea7 100644
--- a/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
+++ b/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
@@ -75,46 +75,49 @@ def tip_suggestion(s):
 def test_multi_turn_qa(load_comparison_model, start_server, register_shortfin_backend):
     model = load_comparison_model
 
-    logger.debug("Starting Multi-Turn Question Test...")
+    question_1 = "Name the capital city of the USA."
+    question_2 = "The Smithsonian is in this location."
+
+    answer_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
+    answer_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
+
+    logger.info("Testing multi-turn Q&A run...")
     state = multi_turn_question.run(
-        question_1="Name the capital city of the USA.",
-        question_2="The Smithsonian is in this location.",
+        question_1=question_1,
+        question_2=question_2,
     )
-
-    logger.debug("Obtaining messages...")
     messages = state.messages()
-    logger.debug("Messages Obtained...")
+    logger.info("Received messages from multi-turn call.")
 
-    logger.debug("Checking first Q&A turn...")
     assert messages[0] == {
         "role": "user",
-        "content": "Name the capital city of the USA.",
+        "content": question_1,
     }
     assert messages[1]["role"] == "assistant"
 
-    ideal_answer = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
+    logger.info("Computing similarity between first question and first answer...")
     first_q_answer = messages[1]["content"]
-    logger.debug("Computing Similarity Score...")
-
-    score = compute_similarity(model, ideal_answer, first_q_answer)
+    score = compute_similarity(model, answer_1, first_q_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
-            f"Accuracy error between {expected_answer} and {first_q_answer}:\n SCORE: {score}"
+            f"Accuracy error between {answer_1} and {first_q_answer}:\n SCORE: {score}"
         )
+    logger.info("Similarity passed.")
 
     assert messages[2] == {
         "role": "user",
-        "content": "The Smithsonian is in this location.",
+        "content": question_2,
     }
     assert messages[3]["role"] == "assistant"
 
-    expected_answer = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
+    logger.info("Computing similarity between second question and second answer...")
     second_q_answer = messages[3]["content"]
-    score = compute_similarity(model, expected_answer, second_q_answer)
+    score = compute_similarity(model, answer_2, second_q_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
-            f"Accuracy error between {expected_answer} and {second_q_answer}:\n SCORE: {score}"
+            f"Accuracy error between {answer_2} and {second_q_answer}:\n SCORE: {score}"
        )
+    logger.info("Similarity passed.")
 
 
 @pytest.mark.parametrize(
@@ -148,18 +151,18 @@ def clean_message(message: str):
     expected_answer_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
     expected_answer_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
 
-    logger.debug("Starting Multi-Turn Question Test...")
+    logger.info("Testing multi-turn Q&A run w/ stream...")
     state = multi_turn_question.run(
         question_1=question_1,
         question_2=question_2,
         stream=True,
     )
-
-    logger.debug("Obtaining messages...")
     messages = ""
     for chunk in state.text_iter():
         messages += chunk
-    logger.debug("Messages Obtained...")
+    logger.info("Received messages from multi-turn call.")
+
+    logger.info("Computing similarity between expectation and result...")
     expected_result = f"user: {question_1}\nassistant: {expected_answer_1}\nuser: {question_2}\nassistant: {expected_answer_2}"
     cleaned_messages = clean_message(messages)
     score = compute_similarity(model, cleaned_messages, expected_result)
@@ -167,8 +170,7 @@ def clean_message(message: str):
         raise AccuracyValidationException(
             f"Accuracy error between {expected_result} and {messages}:\n SCORE: {score}"
         )
-
-    logger.debug(f"Stream Messages: {messages}")
+    logger.info("Similarity passed.")
 
 
 @pytest.mark.parametrize(
@@ -196,6 +198,7 @@ def test_batch_multi_turn_qa(
     expected_answer_2_1 = "The largest city in the USA is New York City, with a population of over 8.4 million people, according to the United States Census Bureau (2020 estimates).assistant\n\nHowever, I should note that the largest city in the"
     expected_answer_2_2 = "That's correct, the iconic Empire State Building is located in Midtown Manhattan, New York City. It's one of the most recognizable landmarks in the world and a symbol of the city's grandeur and history.assistant\n\nAnd, by"
 
+    logger.info("Testing batch multi-turn Q&A run...")
     states = multi_turn_question.run_batch(
         [
             {
@@ -212,62 +215,66 @@
     first_qa = states[0]
     second_qa = states[1]
 
-    first_messages = first_qa.messages()
-    second_messages = second_qa.messages()
-    assert first_messages[0] == {
+    first_qa_messages = first_qa.messages()
+    second_qa_messages = second_qa.messages()
+
+    logger.info("Checking first Q&A conversation in the batch...")
+    assert first_qa_messages[0] == {
         "role": "user",
         "content": question_1_1,
     }
-    assert first_messages[1]["role"] == "assistant"
-    first_answer = first_messages[1]["content"]
+    assert first_qa_messages[1]["role"] == "assistant"
+    first_answer = first_qa_messages[1]["content"]
     expected_answer = expected_answer_1_1
-    score = compute_similarity(model, expected_answer, first_answer.lower())
+    score = compute_similarity(model, expected_answer, first_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
             f"Accuracy error between {expected_answer} and {first_answer}:\n SCORE: {score}"
         )
 
-    assert first_messages[2] == {
+    assert first_qa_messages[2] == {
         "role": "user",
         "content": question_1_2,
     }
-    first_messages[3]["role"] = "assistant"
-    second_answer = first_messages[3]["content"]
+    assert first_qa_messages[3]["role"] == "assistant"
+    second_answer = first_qa_messages[3]["content"]
     expected_answer = expected_answer_1_2
     score = compute_similarity(model, expected_answer, second_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
             f"Accuracy error between {expected_answer} and {second_answer}:\n SCORE: {score}"
         )
+    logger.info("First conversation passed.")
 
-    second_messages = second_qa.messages()
-    assert second_messages[0] == {
+    logger.info("Checking second Q&A conversation in the batch...")
+    assert second_qa_messages[0] == {
         "role": "user",
         "content": question_2_1,
     }
-    assert second_messages[1]["role"] == "assistant"
-    first_answer = second_messages[1]["content"]
+    assert second_qa_messages[1]["role"] == "assistant"
+    first_answer = second_qa_messages[1]["content"]
     expected_answer = expected_answer_2_1
-    score = compute_similarity(model, expected_answer, first_answer.lower())
+    score = compute_similarity(model, expected_answer, first_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
             f"Accuracy error between {expected_answer} and {first_answer}:\n SCORE: {score}"
         )
 
-    assert second_messages[2] == {
+    assert second_qa_messages[2] == {
         "role": "user",
         "content": question_2_2,
     }
-    second_messages[3]["role"] = "assistant"
-    second_answer = second_messages[3]["content"]
+    assert second_qa_messages[3]["role"] == "assistant"
+    second_answer = second_qa_messages[3]["content"]
     expected_answer = expected_answer_2_2
     score = compute_similarity(model, expected_answer, second_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
             f"Accuracy error between {expected_answer} and {second_answer}:\n SCORE: {score}"
         )
+    logger.info("Second conversation passed.")
 
 
 @pytest.mark.parametrize(
@@ -283,8 +290,12 @@
 def test_fork(load_comparison_model, start_server, register_shortfin_backend):
     model = load_comparison_model
 
+    logger.info("Testing fork...")
     state = tip_suggestion.run()
     result = state.text()
+    logger.info("Fork response received.")
+
+    logger.info("Computing similarity...")
     expected_answer = """Here are two tips for staying healthy: 1. Balanced Diet. 2. Regular Exercise.
 Tip 1:A balanced diet is essential for maintaining good health. It involves consuming a variety of foods from different food groups, including fruits, vegetables, whole grains, lean proteins, and healthy fats. A balanced diet provides the body with the necessary nutrients, vitamins, and
 Tip 2:Regular exercise is essential for maintaining a healthy body. It helps to improve cardiovascular health, increase strength and flexibility, and boost the immune system. Regular physical activity can also reduce the risk of chronic diseases such as heart disease, diabetes, and certain types of cancer
@@ -295,3 +306,4 @@ def test_fork(load_comparison_model, start_server, register_shortfin_backend):
         raise AccuracyValidationException(
             f"Accuracy error between {expected_answer} and {result}:\n SCORE: {score}"
         )
+    logger.info("Similarity passed.")
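For context on what these tests drive: multi_turn_question is an sglang frontend-language program defined earlier in sglang_frontend_test.py, outside this diff. A minimal sketch of that shape of program, written against sglang's standard frontend API (the max_tokens value here is an assumption, not taken from this patch):

import sglang as sgl


@sgl.function
def multi_turn_question(s, question_1, question_2):
    # Each turn appends a chat message to the program state `s`; the
    # resulting program object provides the .run(), .run_batch(),
    # state.messages(), and state.text_iter() entry points used above.
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))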
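Similarly, test_fork exercises sglang's fork primitive, which splits one program state into parallel branches and joins their generations. A sketch of a tip_suggestion-style program in the same API; the branch count, prompt wording, and max_tokens are illustrative assumptions:

import sglang as sgl


@sgl.function
def tip_suggestion(s):
    s += "Here are two tips for staying healthy: 1. Balanced Diet. 2. Regular Exercise.\n"
    # Fork the state into two branches that expand the tips in parallel.
    forks = s.fork(2)
    for i, f in enumerate(forks):
        f += f"Now, expand tip {i + 1} into a paragraph:\n"
        f += sgl.gen("detailed_tip", max_tokens=256)
    # Join the branch outputs back into the main program state.
    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"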
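Every accuracy check above goes through compute_similarity and ACCEPTED_THRESHOLD, which this diff references but does not define. Assuming load_comparison_model yields a sentence-transformers embedding model, the helper is plausibly a cosine-similarity check along these lines (the threshold value is illustrative, not taken from this patch):

from sentence_transformers import SentenceTransformer, util

# Illustrative value; the real threshold is defined in the test module.
ACCEPTED_THRESHOLD = 0.8


def compute_similarity(model: SentenceTransformer, sentence_1: str, sentence_2: str) -> float:
    # Embed both sentences and score them with cosine similarity in [-1, 1].
    embeddings = model.encode([sentence_1, sentence_2])
    return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()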