Skip to content

Check the status of providers if they provide a status check endpoint #87

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Executor handler module."""

import logging
import os
from json import JSONDecodeError, dumps
Expand All @@ -23,6 +24,7 @@
from etos_lib import ETOS
from etos_lib.opentelemetry.semconv import Attributes as SemConvAttributes
from opentelemetry import trace
from requests import Response
from requests.auth import HTTPBasicAuth, HTTPDigestAuth
from requests.exceptions import ConnectionError as RequestsConnectionError
from requests.exceptions import HTTPError
Expand Down Expand Up @@ -70,7 +72,10 @@ def __decrypt(self, password: Union[str, dict]) -> str:
return Fernet(key).decrypt(password_value).decode()

def __auth(
self, username: str, password: str, type: str = "basic" # pylint:disable=redefined-builtin
self,
username: str,
password: str,
type: str = "basic", # pylint:disable=redefined-builtin
) -> Union[HTTPBasicAuth, HTTPDigestAuth]:
"""Create an authentication for HTTP request.

Expand All @@ -84,25 +89,30 @@ def __auth(
return HTTPBasicAuth(username, password)
return HTTPDigestAuth(username, password)

def do_request(self, request: dict) -> Response:
"""Make an HTTP request based on a request dictionary."""
if request.get("auth"):
request["auth"] = self.__auth(**request["auth"])
method = getattr(self.etos.http, request.pop("method").lower())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will it be robust enough if one failed check leads to termination? Even if there are retries.

response = method(**request)
response.raise_for_status()
return response

def run_tests(self, test_suite: dict) -> None:
"""Run tests in jenkins.

:param test_suite: Tests to execute.
"""
executor = test_suite.get("executor")
request = executor.get("request")
if request.get("auth"):
request["auth"] = self.__auth(**request["auth"])
method = getattr(self.etos.http, request.pop("method").lower())
span_name = "start_execution_space"
with self.tracer.start_as_current_span(span_name, kind=trace.SpanKind.CLIENT) as span:
span.set_attribute(
SemConvAttributes.EXECUTOR_ID, executor["id"] if "id" in executor else ""
)
span.set_attribute("http.request.body", dumps(request))
try:
response = method(**request)
response.raise_for_status()
response = self.do_request(request)
except HTTPError as http_error:
try:
exc = TestStartException(http_error.response.json())
Expand Down
53 changes: 53 additions & 0 deletions projects/etos_suite_runner/src/etos_suite_runner/lib/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test suite handler."""

import os
import json
import logging
Expand All @@ -32,6 +33,7 @@
from jsontas.jsontas import JsonTas
import opentelemetry
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
from requests.exceptions import ConnectionError as RequestsConnectionError, HTTPError

from .esr_parameters import ESRParameters
from .exceptions import EnvironmentProviderException
Expand Down Expand Up @@ -121,6 +123,51 @@ def outcome(self) -> dict:
return self.test_suite_finished.get("data", {}).get("testSuiteOutcome", {})
return {}

def _environment_active(self) -> bool:
"""Check the status of the test environment."""
status_requests = (
(
self.environment["iut"].get("provider_id", "iut_provider"),
self.environment["iut"].get("status_request"),
),
(
self.environment["executor"].get("provider_id", "execution_space_provider"),
self.environment["executor"].get("status_request"),
),
(
self.environment["log_area"].get("provider_id", "log_area_provider"),
self.environment["log_area"].get("status_request"),
),
)
executor = Executor(self.etos)
for name, request in status_requests:
if request is None:
continue
try:
response = executor.do_request(request)
response.json()
except RequestsConnectionError:
self.logger.exception(
"Could not communicate with the status check endpoint for %r (%r)",
name,
request.get("url"),
)
return False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should an error like this return None, to indicate that health check was inconclusive? So that a few more retries could be done?

In other words, False should be returned only when we know for sure that the environment is inactive.

except HTTPError as http_error:
self.logger.error(
"Got error code %r when communicating with %r (%r)",
http_error.response.status_code,
name,
request.get("url"),
)
return False
except json.JSONDecodeError:
self.logger.exception(
"Failed to parse the response from for %r (%r)", name, request.get("url")
)
return False
return True

def _start(self, identifier: str) -> None:
"""Start ETR for this sub suite.

Expand Down Expand Up @@ -156,6 +203,12 @@ def _start(self, identifier: str) -> None:
if self.finished:
self.logger.info("ETOS test runner has finished", extra={"user_log": True})
break
if not self._environment_active():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there risk of a race condition during termination, when self.finished is still False, but self._environment_active() already returns False?

self.logger.error(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be written to the OpenTelemetry span?

"ETOS test environment is no longer active! Canceling.",
extra={"user_log": True},
)
break
finally:
self.release(identifier)

Expand Down
Loading