Skip to content

Commit

Permalink
Merge branch 'main' into handle-ogc-links
Browse files Browse the repository at this point in the history
  • Loading branch information
pvgenuchten authored Oct 9, 2024
2 parents 74b4760 + 29d8183 commit 1ac3f20
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 165 deletions.
2 changes: 1 addition & 1 deletion linkcheck/linkchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def insert_or_update_link(conn, url_result):
""", (urlname, 0 if url_result['valid'] else 1, url_result['valid'], url_result['valid'], MAX_FAILURES))

link_id, deprecated = cur.fetchone()

if not deprecated:
# Insert validation history
cur.execute("""
Expand Down
3 changes: 2 additions & 1 deletion linkcheck/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ psycopg2-binary
asyncpg
linkchecker
databases
python-dotenv
python-dotenv
lxml
255 changes: 92 additions & 163 deletions src/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from databases import Database
from typing import List, Optional
from pydantic import BaseModel
from urllib.parse import unquote
from datetime import datetime
import asyncpg
import logging
Expand All @@ -13,7 +12,6 @@
load_dotenv()

# Database connection setup
# Load environment variables securely (replace with your actual variable names)
DATABASE_URL = "postgresql://" + os.environ.get("POSTGRES_USER") + ":" +\
os.environ.get("POSTGRES_PASSWORD") + "@" + os.environ.get("POSTGRES_HOST") + ":" +\
os.environ.get("POSTGRES_PORT") + "/" + os.environ.get("POSTGRES_DB")
Expand All @@ -29,52 +27,30 @@
)
logger = logging.getLogger(__name__)


# Define response model
class StatusResponse(BaseModel):
id: int
urlname: Optional[str]
parent_urls: Optional[List[str]]
status: Optional[str]
result: Optional[str]
info: Optional[str]
warning: Optional[str]
# Define response models
class LinkResponse(BaseModel):
id_link: int
urlname: str
deprecated: Optional[bool] = None

# Model to get the availability history of a specific url
class URLAvailabilityResponse(BaseModel):
urlname: Optional[str] = None
status: Optional[str] = None
result: Optional[str] = None
info: Optional[str] = None
warning: Optional[str] = None
validation_valid: Optional[str] = None
last_checked: datetime

consecutive_failures: Optional[int] = None

class StatusResponse(LinkResponse):
status_code: Optional[int] = None
is_redirect: Optional[bool] = None
error_message: Optional[str] = None
timestamp: datetime

class TimeoutResponse(LinkResponse):
status_code: Optional[int] = None # Make status_code optional for timeout cases
final_url: Optional[str] = None
is_redirect: Optional[bool] = None
error_message: Optional[str] = None
timestamp: datetime

# Define status lists
REDIRECTION_STATUSES = [
"301 Moved Permanently",
"302 Found (Moved Temporarily)",
"304 Not Modified",
"307 Temporary Redirect",
"308 Permanent Redirect"
]

CLIENT_ERROR_STATUSES = [
"400 Bad Request",
"401 Unauthorized",
"403 Forbidden",
"404 Not Found",
"405 Method Not Allowed",
"409 Conflict"
]

SERVER_ERROR_STATUSES = [
"500 Internal Server Error",
"501 Not Implemented",
"503 Service Unavailable",
"504 Gateway Timeout"
]
REDIRECTION_STATUSES = [301, 302, 304, 307, 308]
CLIENT_ERROR_STATUSES = [400, 401, 403, 404, 405, 409]
SERVER_ERROR_STATUSES = [500, 501, 503, 504]

# Helper function to execute SQL query and fetch results
async def fetch_data(query: str, values: dict = {}):
Expand All @@ -86,27 +62,21 @@ async def fetch_data(query: str, values: dict = {}):
except Exception as e:
logging.error(f"Database query failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Database query failed")

# Endpoint to retrieve data with redirection statuses
@app.get('/Redirection_URLs/3xx', response_model=List[StatusResponse])
async def get_redirection_statuses():
query = """
SELECT
l.id_link AS id,
l.urlname,
l.status,
l.warning,
l.result,
l.info,
array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls
FROM
links l
LEFT JOIN
parent p ON l.id_link = p.fk_link
WHERE
l.status ILIKE ANY (:statuses)
GROUP BY
l.id_link, l.urlname, l.status, l.warning, result, info
SELECT l.id_link, l.urlname, l.deprecated, l.consecutive_failures,
vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
FROM links l
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE vh.status_code = ANY(:statuses)
AND vh.timestamp = (
SELECT MAX(timestamp)
FROM validation_history
WHERE fk_link = l.id_link
)
"""
data = await fetch_data(query=query, values={'statuses': REDIRECTION_STATUSES})
return data
Expand All @@ -115,22 +85,16 @@ async def get_redirection_statuses():
@app.get('/Client_Error_URLs/4xx', response_model=List[StatusResponse])
async def get_client_error_statuses():
query = """
SELECT
l.id_link AS id,
l.urlname,
l.status,
l.warning,
l.result,
l.info,
array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls
FROM
links l
LEFT JOIN
parent p ON l.id_link = p.fk_link
WHERE
l.status ILIKE ANY (:statuses)
GROUP BY
l.id_link, l.urlname, l.status, l.warning, result, info
SELECT l.id_link, l.urlname, l.deprecated, l.consecutive_failures,
vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
FROM links l
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE vh.status_code = ANY(:statuses)
AND vh.timestamp = (
SELECT MAX(timestamp)
FROM validation_history
WHERE fk_link = l.id_link
)
"""
data = await fetch_data(query=query, values={'statuses': CLIENT_ERROR_STATUSES})
return data
Expand All @@ -139,122 +103,87 @@ async def get_client_error_statuses():
@app.get('/Server_Errors_URLs/5xx', response_model=List[StatusResponse])
async def get_server_error_statuses():
query = """
SELECT
l.id_link AS id,
l.urlname,
l.status,
l.warning,
l.result,
l.info,
array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls
FROM
links l
LEFT JOIN
parent p ON l.id_link = p.fk_link
WHERE
l.status ILIKE ANY (:statuses)
GROUP BY
l.id_link, l.urlname, l.status, l.warning, result, info
SELECT l.id_link, l.urlname, l.deprecated, l.consecutive_failures,
vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
FROM links l
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE vh.status_code = ANY(:statuses)
AND vh.timestamp = (
SELECT MAX(timestamp)
FROM validation_history
WHERE fk_link = l.id_link
)
"""
data = await fetch_data(query=query, values={'statuses': SERVER_ERROR_STATUSES})
return data

# Endpoint to retrieve data with client error statuses
# Endpoint to retrieve data for a specific URL
@app.get('/status/{item:path}', response_model=List[StatusResponse])
async def get_status_for_url(item):
query = """
SELECT
l.id_link AS id,
l.urlname,
l.status,
l.warning,
l.result,
l.info,
array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls
FROM
links l
LEFT JOIN
parent p ON l.id_link = p.fk_link
WHERE
l.urlname = :item
GROUP BY
l.id_link, l.urlname, l.status, l.warning, result, info
SELECT l.id_link, l.urlname, l.deprecated, l.consecutive_failures,
vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
FROM links l
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE l.urlname = :item
AND vh.timestamp = (
SELECT MAX(timestamp)
FROM validation_history
WHERE fk_link = l.id_link
)
"""
data = await fetch_data(query=query, values={'item': item})
return data

# Endpoint to retrieve URLs that that timed out. Timeout is set to 5 seconds currently
@app.get('/Timeout_URLs', response_model=List[StatusResponse])
# Update the timeout endpoint to match other query structures
@app.get('/Timeout_URLs', response_model=List[TimeoutResponse])
async def get_timeout_urls():
query = """
SELECT
l.id_link AS id,
l.urlname,
l.status,
l.warning,
l.result,
l.info,
array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls
FROM
links l
LEFT JOIN
parent p ON l.id_link = p.fk_link
WHERE
l.status LIKE '%ReadTimeout%' OR l.status LIKE '%ConnectTimeout%'
GROUP BY
l.id_link, l.urlname, l.status, l.warning, result, info
SELECT l.id_link, l.urlname, l.deprecated, l.consecutive_failures,
vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
FROM links l
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE (vh.error_message LIKE '%ReadTimeout%' OR vh.error_message LIKE '%ConnectTimeout%')
AND vh.timestamp = (
SELECT MAX(timestamp)
FROM validation_history
WHERE fk_link = l.id_link
)
"""
data = await fetch_data(query=query)
return data

@app.get('/Deprecated URLs', response_model=List[StatusResponse])
@app.get('/Deprecated_URLs', response_model=List[LinkResponse])
async def get_deprecated_urls():
query = """
SELECT
l.id_link AS id,
l.urlname,
l.status,
l.warning,
l.result,
l.info,
l.deprecated,
array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls
FROM
links l
LEFT JOIN
parent p ON l.id_link = p.fk_link
WHERE l.deprecated IS TRUE
GROUP BY
l.id_link, l.urlname, l.status, l.warning, result, info
SELECT id_link, urlname, deprecated, consecutive_failures
FROM links
WHERE deprecated IS TRUE
"""
data = await fetch_data(query=query)
return data

@app.get("/URL_status_history", response_model=List[URLAvailabilityResponse])
@app.get("/URL_status_history", response_model=List[StatusResponse])
async def get_url_status_history(
url: str = Query(..., description="URL to get availability history"),
limit: int = Query(100, ge=1, le=1000, description="Maximum number of results (default: 100, min: 1, max: 1000)")
) -> List[URLAvailabilityResponse]:
) -> List[StatusResponse]:
query = """
SELECT
l.id_link,
l.urlname,
l.status,
l.result,
l.info,
l.warning,
vh.validation_result AS validation_valid,
vh.timestamp AS last_checked,
array_agg(DISTINCT p.parentname) AS parent_urls
l.deprecated,
l.consecutive_failures,
vh.status_code,
vh.is_redirect,
vh.error_message,
vh.timestamp
FROM
links l
LEFT JOIN
parent p ON l.id_link = p.fk_link
LEFT JOIN
JOIN
validation_history vh ON l.id_link = vh.fk_link
WHERE
l.urlname = :url
GROUP BY
l.urlname, l.status, l.result, l.info, l.warning, vh.validation_result, vh.timestamp
ORDER BY
vh.timestamp DESC
LIMIT :limit
Expand All @@ -264,7 +193,7 @@ async def get_url_status_history(
results = await fetch_data(query=query, values={'url': url, 'limit': limit})
logger.info(f"Query returned {len(results)} results for URL: {url}")

response_data = [URLAvailabilityResponse(**dict(row)) for row in results]
response_data = [StatusResponse(**dict(row)) for row in results]

return response_data
except Exception as e:
Expand All @@ -284,4 +213,4 @@ async def shutdown():
try:
await database.disconnect()
except Exception as e:
raise HTTPException(status_code=500, detail="Database disconnection failed") from e
raise HTTPException(status_code=500, detail="Database disconnection failed") from e

0 comments on commit 1ac3f20

Please sign in to comment.