From dd6fb954cd12b5be6bf89a27ea5ffb15b043ba5a Mon Sep 17 00:00:00 2001 From: "Karl W. Schulz" Date: Fri, 17 Jan 2025 10:50:24 -0500 Subject: [PATCH] update victoriametrics availability check during exporter startup process; adds a polling loop to retry if server is not available on first try based on experiences using Lustre as a datastore at ORNL Signed-off-by: Karl W. Schulz --- omnistat/standalone.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/omnistat/standalone.py b/omnistat/standalone.py index dfb574d5..62a698bb 100755 --- a/omnistat/standalone.py +++ b/omnistat/standalone.py @@ -96,14 +96,26 @@ def __init__(self, args, config): logging.error("[ERROR]: Please set data_frequency_mins >= 1 minute (%s)" % self.__pushFrequencyMins) sys.exit(1) - # verify victoriaURL is accessible - failed = False - try: - response = requests.get(self.__victoriaURL) - if response.status_code != 204: + # verify victoriaURL is operational and ready to receive queries (poll for a bit if necessary) + failed = True + delay_start = 0.05 + testURL = f"http://{args.endpoint}:{args.port}/ready" + for iter in range(1, 25): + try: + response = requests.get(testURL) + logging.debug("VM ready response = %s" % response) + if response.status_code != 200: + failed = True + else: + failed = False + break + except requests.exceptions.RequestException as e: + logging.debug(e) failed = True - except requests.exceptions.RequestException as e: - failed = True + + delay = delay_start * iter + logging.info("Retrying VM endpoint (sleeping for %.2f sec)" % (delay)) + time.sleep(delay) if failed: logging.error("")