Skip to content

Commit

Permalink
Gracefully handle HTTP errors
Browse files Browse the repository at this point in the history
closes #111

We were setting `raise_for_status` without handling the exceptions raised by aiohttp, which clogged up the logs with stack traces and environment variables.

I did not add automatic retries for HTTP requests because issues in the cluster are not necessarily going to resolve themselves within the span of a few retried requests.

I'll be dealing with the gitlab 500 errors in a different PR.
  • Loading branch information
cmelone committed Oct 11, 2024
1 parent 25cadab commit 79695e6
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 13 deletions.
2 changes: 1 addition & 1 deletion gantry/clients/gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ async def _request(self, url: str, response_type: str) -> dict | str:
returns: the response from Gitlab in the specified format
"""

async with aiohttp.ClientSession(raise_for_status=True) as session:
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=self.headers) as resp:
if response_type == "json":
return await resp.json()
Expand Down
10 changes: 5 additions & 5 deletions gantry/clients/prometheus/prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,17 @@ async def query_range(self, query: str | dict, start: int, end: int) -> list:

async def _query(self, url: str) -> list:
    """Query Prometheus with a query string.

    args:
        url: fully-formed Prometheus HTTP API URL to GET

    returns: the decoded JSON response from Prometheus

    raises:
        aiohttp.ClientError: when the response body is not JSON
            (e.g. an HTML login page because the auth cookie expired)
    """
    async with aiohttp.ClientSession() as session:
        # submit cookie with request
        async with session.get(url, cookies=self.cookies) as resp:
            try:
                return await resp.json()
            except aiohttp.ContentTypeError:
                # this will get caught in collection.py and fetch_job won't continue
                raise aiohttp.ClientError(
                    """Prometheus query failed with unexpected
                    response, cookie may have expired."""
                )

def prettify_res(self, response: dict) -> list:
"""Process Prometheus response into a list of dicts with {label: value}"""
Expand Down
20 changes: 13 additions & 7 deletions gantry/routes/collection.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import re

import aiohttp
import aiosqlite

from gantry.clients import db
Expand Down Expand Up @@ -56,20 +57,25 @@ async def fetch_job(
):
return

# check if the job is a ghost
job_log = await gitlab.job_log(job.gl_id)
is_ghost = "No need to rebuild" in job_log
if is_ghost:
logger.warning(f"job {job.gl_id} is a ghost, skipping")
return

try:
# all code that makes HTTP requests should be in this try block

# check if the job is a ghost
job_log = await gitlab.job_log(job.gl_id)
is_ghost = "No need to rebuild" in job_log
if is_ghost:
logger.warning(f"job {job.gl_id} is a ghost, skipping")
return

annotations = await prometheus.job.get_annotations(job.gl_id, job.midpoint)
resources, node_hostname = await prometheus.job.get_resources(
annotations["pod"], job.midpoint
)
usage = await prometheus.job.get_usage(annotations["pod"], job.start, job.end)
node_id = await fetch_node(db_conn, prometheus, node_hostname, job.midpoint)
except aiohttp.ClientError as e:
logger.error(f"Request failed: {e}")
return
except IncompleteData as e:
# missing data, skip this job
logger.error(f"{e} job={job.gl_id}")
Expand Down

0 comments on commit 79695e6

Please sign in to comment.