Fix the smoke test (except the benchmark runs) #547

Merged: 13 commits, Oct 1, 2024
35 changes: 16 additions & 19 deletions .github/workflows/scheduled-smoke-test.yml
@@ -65,6 +65,9 @@ jobs:

           [openai]
           api_key = "${{ secrets.OPENAI_API_KEY }}"

+          [hugging_face]
+          token = "${{ secrets.HUGGING_FACE_TOKEN }}"
+
           [demo]
           api_key="12345"
@@ -76,35 +79,29 @@ jobs:
         run: |
           source .venv/bin/activate
           pytest --expensive-tests

+      # TODO Disabled pending Modelbench#509
+      # - name: Test standard run
+      #   run: |
+      #     source .venv/bin/activate
+      #     modelbench benchmark --debug -m 1
+      #
+      # - name: Test v1 run
+      #   run: |
+      #     source .venv/bin/activate
+      #     modelbench benchmark -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1
+
       - name: Ensure the artifact published on Pypi still works as expected
         run: |
           rm -rf .venv
           mkdir -p ../installation/config
-          cat ./tests/data/install_pyproject.toml > ../installation/pyproject.toml
+          cat ./tests/modelgauge_tests/data/install_pyproject.toml > ../installation/pyproject.toml
           cd ../installation
           touch ./config/secrets.toml
           poetry lock
           poetry install --no-root
           poetry run modelgauge list-tests
-
-      - name: Test standard run
-        run: |
-          source .venv/bin/activate
-          modelbench benchmark --debug -m 1
-
-      - uses: JasonEtco/create-an-issue@v2
-        if: failure()
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          RUN_ID: ${{ github.run_id }}
-        with:
-          filename: .github/failed-scheduled-issue.md
-
-      - name: Test v1 run
-        run: |
-          source .venv/bin/activate
-          modelbench benchmark --debug -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1
+
       - uses: JasonEtco/create-an-issue@v2
         if: failure()
         env:
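A note on the secrets plumbing above: the workflow writes a `config/secrets.toml` with `[openai]`, `[hugging_face]`, and `[demo]` scopes, which the tests later read via `load_secrets_from_config()`. A minimal sketch of reading that layout — the loader below is illustrative, not modelgauge's actual implementation:

```python
# Illustrative only: reads a secrets.toml shaped like the one the workflow
# writes above. modelgauge's real load_secrets_from_config() may differ.
import tomllib  # stdlib in Python 3.11+


def load_secrets(path: str = "config/secrets.toml") -> dict:
    """Parse the TOML file into {scope: {key: value}}."""
    with open(path, "rb") as f:
        return tomllib.load(f)


def get_secret(secrets: dict, scope: str, key: str) -> str:
    """Look up one secret, failing loudly so a missing CI secret is obvious."""
    try:
        return secrets[scope][key]
    except KeyError:
        raise KeyError(f"missing secret [{scope}] {key} in config/secrets.toml") from None


# e.g. the scope added in this PR's workflow:
# token = get_secret(load_secrets(), "hugging_face", "token")
```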
@@ -43,7 +43,7 @@ def __init__(self, uid: str, inference_endpoint: str, token: HuggingFaceInferenc
     def _create_client(self):
         endpoint = get_inference_endpoint(self.inference_endpoint, token=self.token.value)

-        timeout = 60 * 6
+        timeout = 60 * 10
         if endpoint.status in [
             InferenceEndpointStatus.PENDING,
             InferenceEndpointStatus.INITIALIZING,
@@ -61,7 +61,7 @@ def _create_client(self):
             endpoint.wait(timeout)
         elif endpoint.status != InferenceEndpointStatus.RUNNING:
             raise ConnectionError(
-                "Endpoint is not running: Please contact admin to ensure endpoint is starting or running"
+                f"Endpoint is not running: Please contact admin to ensure endpoint is starting or running (status: {endpoint.status})"
             )

         self.client = InferenceClient(base_url=endpoint.url, token=self.token.value)
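The timeout bump above (six to ten minutes) exists because a dedicated Hugging Face inference endpoint can take that long to spin up before the client can be built. A standalone sketch of the spin-up pattern using `huggingface_hub`'s public API; the endpoint name and token are placeholders, and the scaled-to-zero branch is an assumption about the part of the method the diff truncates:

```python
# Sketch of the endpoint spin-up pattern; ENDPOINT_NAME and HF_TOKEN are
# placeholders, and the scaled-to-zero branch is assumed, not shown in the diff.
from huggingface_hub import InferenceClient, InferenceEndpointStatus, get_inference_endpoint

ENDPOINT_NAME = "my-dedicated-endpoint"  # placeholder
HF_TOKEN = "hf_..."  # placeholder


def connect(timeout: int = 60 * 10) -> InferenceClient:
    endpoint = get_inference_endpoint(ENDPOINT_NAME, token=HF_TOKEN)
    if endpoint.status in [
        InferenceEndpointStatus.PENDING,
        InferenceEndpointStatus.INITIALIZING,
        InferenceEndpointStatus.UPDATING,
    ]:
        # Already coming up; block until it reports RUNNING (raises on timeout).
        endpoint.wait(timeout)
    elif endpoint.status == InferenceEndpointStatus.SCALED_TO_ZERO:
        # A scaled-to-zero endpoint must be resumed before it will start.
        endpoint.resume()
        endpoint.wait(timeout)
    elif endpoint.status != InferenceEndpointStatus.RUNNING:
        raise ConnectionError(f"Endpoint is not running (status: {endpoint.status})")
    return InferenceClient(base_url=endpoint.url, token=HF_TOKEN)
```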
8 changes: 6 additions & 2 deletions plugins/validation_tests/test_object_creation.py
@@ -66,15 +66,19 @@ def test_all_suts_construct_and_record_init(sut_name):
     assert isinstance(sut.initialization_record, InitializationRecord)

+
+SUTS_THAT_WE_DONT_CARE_ABOUT_FAILING = {"StripedHyena-Nous-7B"}
+
+
 # This test can take a while, and we don't want a test run to fail
 # just because an external service is being slow. So we set a somewhat
 # high timeout value that gives the test a chance to complete most of the time,
 # but still fails if the external service really is flaky or slow, so we can
 # get a sense of a real user's experience.
 @expensive_tests
-@pytest.mark.timeout(45)
-@pytest.mark.parametrize("sut_name", [key for key, _ in SUTS.items()])
+@pytest.mark.timeout(650)  # up to 10 minutes for Hugging Face spinup, plus some time for the test itself
+@pytest.mark.parametrize("sut_name", set(SUTS.keys()) - SUTS_THAT_WE_DONT_CARE_ABOUT_FAILING)
 def test_all_suts_can_evaluate(sut_name):
     sut = SUTS.make_instance(sut_name, secrets=load_secrets_from_config())
     assert isinstance(sut, PromptResponseSUT), "Update this test to handle other types."
     if AcceptsTextPrompt in sut.capabilities:
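The parametrization pattern above — every registered SUT except a known-bad set — is worth noting. A minimal, self-contained sketch with a made-up registry:

```python
# Self-contained illustration of the exclusion pattern above; the registry
# contents here are made up.
import pytest

FAKE_SUTS = {"model-a": object(), "model-b": object(), "flaky-model": object()}
KNOWN_FLAKY = {"flaky-model"}


@pytest.mark.timeout(30)  # needs the pytest-timeout plugin, as the real test does
@pytest.mark.parametrize("sut_name", sorted(set(FAKE_SUTS.keys()) - KNOWN_FLAKY))
def test_sut_exists(sut_name):
    assert FAKE_SUTS[sut_name] is not None
```

Sorting keeps the generated test IDs in a stable order; iterating a raw set, as the diff does, also works but can reorder tests between interpreter runs because string hashing is randomized.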
1 change: 0 additions & 1 deletion src/modelbench/suts.py
@@ -74,7 +74,6 @@ def _register_required_suts():
             "mistral-7b",
             "mixtral-8x-7b",
             "qwen-72b",
-            "stripedhyena-nous-7b",
         ]
     ]
5 changes: 5 additions & 0 deletions src/modelgauge/instance_factory.py
@@ -91,3 +91,8 @@ def items(self) -> List[Tuple[str, FactoryEntry[_T]]]:
         """List all items in the registry."""
         with self.lock:
             return list(self._lookup.items())
+
+    def keys(self) -> List[str]:
+        """List all keys in the registry."""
+        with self.lock:
+            return list(self._lookup.keys())
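The new `keys()` follows the same shape as the existing `items()`: take the lock, copy, return. A simplified sketch of that snapshot pattern (the real `InstanceFactory` is generic and stores `FactoryEntry` objects):

```python
# Simplified sketch of the lock-and-copy pattern behind items()/keys().
from threading import Lock
from typing import Dict, List


class Registry:
    def __init__(self) -> None:
        self.lock = Lock()
        self._lookup: Dict[str, object] = {}

    def register(self, key: str, value: object) -> None:
        with self.lock:
            self._lookup[key] = value

    def keys(self) -> List[str]:
        """Snapshot of the keys; safe to use after the lock is released."""
        with self.lock:
            return list(self._lookup.keys())
```

Returning `list(...)` rather than a live dict view is the point: a view would be read after the lock is released and would reflect concurrent mutations.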
2 changes: 1 addition & 1 deletion src/modelgauge/tests/safe_v1.py
@@ -80,7 +80,7 @@ class BaseSafeTestVersion1(PromptResponseTest, ABC):
         "iwp",
         "hte",
         "ssh",
-        "sxc",
+        # "sxc", TODO removed for lack of content
     ]
     annotators: AnnotatorSet
Binary file modified tests/modelgauge_tests/data/sample_cache.sqlite
8 changes: 8 additions & 0 deletions tests/modelgauge_tests/test_instance_factory.py
@@ -72,6 +72,14 @@ def test_lists_all_items():
     ]


+def test_keys():
+    factory = InstanceFactory[MockClass]()
+    factory.register(MockClass, "k1", "v1")
+    factory.register(MockClass, "k2", "v2")
+    factory.register(MockClass, "k3", "v3")
+    assert factory.keys() == ["k1", "k2", "k3"]
+
+
 def test_factory_entry_str():
     entry = FactoryEntry(MockClass, uid="k1", args=("v1",), kwargs={"arg2": "v2"})
     assert str(entry) == "MockClass(uid=k1, args=('v1',), kwargs={'arg2': 'v2'})"