diff --git a/Dockerfile b/Dockerfile
index b03cca5..e2007b6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,4 +37,6 @@ WORKDIR /home/link-liveliness-assessment/src
 
 EXPOSE 8000
 
-USER linky
\ No newline at end of file
+USER linky
+
+ENTRYPOINT [ "python3", "-m", "uvicorn", "api:app", "--reload", "--host", "0.0.0.0", "--port", "8000" ]
\ No newline at end of file
diff --git a/README.md b/README.md
index 07accca..faf3951 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,28 @@ python -m uvicorn api:app --reload --host 0.0.0.0 --port 8000
 ```
 To view the service of the FastAPI on [http://127.0.0.1:8000/docs]
+
+## Get URL status history
+This endpoint returns the availability history of a specific URL.
+Say we have the status of a specific URL recorded over time:
+
+| id  | url                   | validation_result | timestamp               |
+|-----|-----------------------|-------------------|-------------------------|
+| 1   | https://example.com   | 200 OK            | 2023-01-01 10:00:00+00  |
+| 2   | https://wikipedia.com | 404 Not Found     | 2023-01-01 10:00:05+00  |
+| 3   | https://example.com   | 200 OK            | 2023-01-02 11:00:00+00  |
+| 4   | https://wikipedia.com | 500 Server Error  | 2023-01-02 11:00:05+00  |
+| 5   | https://wikipedia.com | 200 OK            | 2023-01-02 11:00:10+00  |
+
+Running the `/Single_url_status_history` endpoint for https://wikipedia.com
+with limit = 2 fetches the following result:
+
+| id  | url                   | validation_result | timestamp               |
+|-----|-----------------------|-------------------|-------------------------|
+| 1   | https://wikipedia.com | 200 OK            | 2023-01-02 11:00:10+00  |
+| 2   | https://wikipedia.com | 500 Server Error  | 2023-01-02 11:00:05+00  |
+
+This is the URL's history in descending order by datetime.
 
 ## Deploy `linky` at a path
 
 You can set `ROOTPATH` env var to run the api at a path (default is at root)
@@ -70,6 +92,7 @@ export ROOTPATH=/linky
 ```
 
 ## Docker
+
 A Docker instance must be running for the linkchecker command to work.
 
 ## CI/CD
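Once the API is running, the new endpoint can be exercised directly; a minimal sketch using only the Python standard library, assuming the default host and port from the README:

```python
# Fetch the two most recent status records for a URL from the new endpoint.
import json
from urllib.parse import urlencode
from urllib.request import urlopen

params = urlencode({"url": "https://wikipedia.com", "limit": 2})
with urlopen(f"http://127.0.0.1:8000/Single_url_status_history?{params}") as resp:
    for entry in json.load(resp):
        print(entry["url"], entry["validation_valid"], entry["last_checked"])
```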
diff --git a/src/__pycache__/api.cpython-311.pyc b/src/__pycache__/api.cpython-311.pyc
index 0ab7756..6f6b951 100644
Binary files a/src/__pycache__/api.cpython-311.pyc and b/src/__pycache__/api.cpython-311.pyc differ
diff --git a/src/api.py b/src/api.py
index 3623f10..07d610b 100644
--- a/src/api.py
+++ b/src/api.py
@@ -1,10 +1,12 @@
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, HTTPException, Query
 from dotenv import load_dotenv
 from databases import Database
-from typing import List
+from typing import List, Optional
 from pydantic import BaseModel
 from urllib.parse import unquote
+from datetime import datetime
 import asyncpg
+import logging
 import os
 
 # Load environment variables from .env file
@@ -17,19 +19,37 @@ os.environ.get("POSTGRES_PORT") + "/" + os.environ.get("POSTGRES_DB")
 
 database = Database(DATABASE_URL)
 
+rootpath = os.environ.get("ROOTPATH") or "/"
+
 # FastAPI app instance
-rootpath=os.environ.get("ROOTPATH") or "/"
-app = FastAPI(root_path=rootpath)
+app = FastAPI(
+    title="Linkchecker-Liveness",
+    summary="Evaluate the status of URLs from OGC data catalogues",
+    root_path=rootpath
+)
+
+logger = logging.getLogger(__name__)
+
 # Define response model
 class StatusResponse(BaseModel):
-    id: int # Example column, adjust based on your actual table schema
-    urlname: str
-    parentname: str
-    valid: str
-    warning: str
-
+    id: int
+    urlname: Optional[str]
+    parentname: Optional[str]
+    valid: Optional[str]
+    warning: Optional[str]
+
+# Model to get the availability history of a specific url
+class URLAvailabilityResponse(BaseModel):
+    url: Optional[str]
+    parent_url: Optional[str]
+    validation_valid: Optional[str]
+    result: Optional[str]
+    warning: Optional[str]
+    last_checked: Optional[datetime]
+
+class DeprecatedUrlsResponse(BaseModel):
+    url: Optional[str]
+
 # Define status lists
 REDIRECTION_STATUSES = [
     "301 Moved Permanently",
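For reference, a small sketch of how a result row maps onto `URLAvailabilityResponse`: the dict keys mirror the SQL column aliases used by the endpoints below, and the sample values are illustrative only:

```python
# Illustrative only: keys follow the aliases url, parent_url, validation_valid,
# result, warning and last_checked produced by the queries below.
from datetime import datetime

sample_row = {
    "url": "https://wikipedia.com",
    "parent_url": "https://example.com/catalogue",
    "validation_valid": "200 OK",
    "result": "200 OK",
    "warning": None,
    "last_checked": datetime(2023, 1, 2, 11, 0, 10),
}
print(URLAvailabilityResponse(**sample_row))
```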
@@ -110,6 +130,97 @@ async def get_timeout_urls():
     data = await fetch_data(query=query)
     return data
 
+@app.get("/Single_url_status_history", response_model=List[URLAvailabilityResponse])
+async def get_current_url_status_history(
+    url: str = Query(..., description="URL to get the availability history for"),
+    limit: int = Query(100, ge=1, le=1000, description="Maximum number of results (default: 100, min: 1, max: 1000)")) -> List[URLAvailabilityResponse]:
+    query = """
+    SELECT
+        lo.urlname AS url,
+        lo.parentname AS parent_url,
+        lo.result AS result,
+        lo.warning AS warning,
+        vh.validation_result AS validation_valid,
+        vh.timestamp AS last_checked
+    FROM
+        linkchecker_output lo
+    JOIN
+        validation_history vh ON lo.urlname = vh.url
+    WHERE
+        lo.urlname = :url
+    ORDER BY
+        vh.timestamp DESC
+    LIMIT :limit
+    """
+
+    try:
+        results = await fetch_data(query=query, values={'url': url, 'limit': limit})
+        logger.info(f"Query returned {len(results)} results.")
+
+        response_data = [URLAvailabilityResponse(**dict(row)) for row in results]
+
+        return response_data
+    except Exception as e:
+        logger.error(f"Error occurred: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/All_url_status_history", response_model=List[URLAvailabilityResponse])
+async def get_all_url_status_history(
+    limit: int = Query(100, ge=1, le=1000, description="Maximum number of results (default: 100, min: 1, max: 1000)")) -> List[URLAvailabilityResponse]:
+
+    query = """
+    SELECT
+        lo.urlname AS url,
+        lo.parentname AS parent_url,
+        lo.result AS result,
+        lo.warning AS warning,
+        vh.validation_result AS validation_valid,
+        vh.timestamp AS last_checked
+    FROM
+        linkchecker_output lo
+    JOIN (
+        SELECT
+            url,
+            validation_result,
+            timestamp,
+            ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn
+        FROM
+            validation_history
+    ) vh ON lo.urlname = vh.url AND vh.rn = 1
+    ORDER BY
+        vh.timestamp DESC
+    LIMIT :limit
+    """
+
+    values = {"limit": limit}
+
+    try:
+        results = await fetch_data(query=query, values=values)
+        logger.info(f"Query returned {len(results)} results.")
+
+        response_data = [URLAvailabilityResponse(**dict(row)) for row in results]
+
+        return response_data
+    except Exception as e:
+        logger.error(f"Error occurred: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get('/Deprecated_URLs', response_model=List[DeprecatedUrlsResponse])
+async def get_deprecated_urls():
+    query = """
+    SELECT
+        us.url AS url
+    FROM
+        url_status us
+    WHERE us.deprecated = TRUE
+    """
+    try:
+        data = await fetch_data(query=query)
+        return data
+    except Exception as e:
+        logger.error(f"Error occurred: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
 # Start the application
 @app.on_event('startup')
 async def startup():
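The two collection-level endpoints can be smoke-tested the same way; again a sketch only, assuming the API is reachable on the default local port:

```python
# Print the most recent status per URL, then the list of deprecated URLs.
import json
from urllib.request import urlopen

with urlopen("http://127.0.0.1:8000/All_url_status_history?limit=5") as resp:
    for entry in json.load(resp):
        print(entry["url"], entry["validation_valid"], entry["last_checked"])

with urlopen("http://127.0.0.1:8000/Deprecated_URLs") as resp:
    print([entry["url"] for entry in json.load(resp)])
```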
diff --git a/src/linkchecker.py b/src/linkchecker.py
index c6cc573..8e1a7fc 100644
--- a/src/linkchecker.py
+++ b/src/linkchecker.py
@@ -9,11 +9,14 @@
 import re
 import os
 
+# When a URL reaches MAX_FAILURES consecutive failures it's marked
+# as deprecated and excluded from future checks
+MAX_FAILURES = 10
+
 # Load environment variables from .env file
 load_dotenv()
 
 # base catalog
-
 base = os.environ.get("OGCAPI_URL") or "https://demo.pycsw.org/gisdata"
 collection = os.environ.get("OGCAPI_COLLECTION") or "metadata:main"
 
@@ -21,7 +24,6 @@ catalogue_json_url= f"{base}/collections/{collection}/items?f=json"
 
 def setup_database():
-    # Connect to the database
     conn = psycopg2.connect(
         host=os.environ.get("POSTGRES_HOST"),
         port=os.environ.get("POSTGRES_PORT"),
@@ -31,35 +33,45 @@ def setup_database():
     )
     cur = conn.cursor()
 
-    # Check if the table exists
-    cur.execute("SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_name = 'linkchecker_output')")
-    table_exists = cur.fetchone()[0]
-
-    if table_exists:
-        # If the table exists, truncate it and reset the primary key sequence
-        cur.execute("TRUNCATE TABLE linkchecker_output RESTART IDENTITY")
-    else:
-        # If the table does not exist, create it
-        create_table_query = """
-        CREATE TABLE linkchecker_output (
-            id SERIAL PRIMARY KEY,
-            urlname TEXT,
-            parentname TEXT,
-            baseref TEXT,
-            valid TEXT,
-            result TEXT,
-            warning TEXT,
-            info TEXT,
-            url TEXT,
-            name TEXT
-        )
-        """
-        cur.execute(create_table_query)
-
-    # Commit the changes
+    # Create or truncate linkchecker_output table
+    cur.execute("DROP TABLE IF EXISTS linkchecker_output")
+    create_table_query = """
+    CREATE TABLE linkchecker_output (
+        id SERIAL PRIMARY KEY,
+        urlname TEXT,
+        parentname TEXT,
+        baseref TEXT,
+        valid TEXT,
+        result TEXT,
+        warning TEXT,
+        info TEXT,
+        url TEXT,
+        name TEXT
+    )
+    """
+    cur.execute(create_table_query)
+
+    # Create validation_history table if it doesn't exist
+    cur.execute("""
+        CREATE TABLE IF NOT EXISTS validation_history (
+            id SERIAL PRIMARY KEY,
+            url TEXT NOT NULL,
+            validation_result TEXT NOT NULL,
+            timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+    """)
+
+    # Create url_status table if it doesn't exist
+    cur.execute("""
+        CREATE TABLE IF NOT EXISTS url_status (
+            url TEXT PRIMARY KEY,
+            consecutive_failures INTEGER DEFAULT 0,
+            deprecated BOOLEAN DEFAULT FALSE,
+            last_checked TIMESTAMP
+        )
+    """)
+
     conn.commit()
-
-    # Return the connection and cursor before closing them
     return conn, cur
 
 def get_pagination_info(url):
@@ -123,6 +135,74 @@
     # Wait for the process to finish
     process.wait()
 
+def insert_validation_history(conn, url, validation_result, is_valid):
+    with conn.cursor() as cur:
+        # Insert new record in validation_history
+        cur.execute("""
+            INSERT INTO validation_history (url, validation_result)
+            VALUES (%s, %s)
+        """, (url, validation_result))
+
+        # Get current status
+        cur.execute("SELECT consecutive_failures, deprecated FROM url_status WHERE url = %s", (url,))
+        result = cur.fetchone()
+
+        if result:
+            consecutive_failures, deprecated = result
+            if not is_valid:
+                consecutive_failures += 1
+            else:
+                consecutive_failures = 0
+
+            deprecated = deprecated or (consecutive_failures >= MAX_FAILURES)
+
+            # Update url_status
+            cur.execute("""
+                UPDATE url_status
+                SET consecutive_failures = %s,
+                    deprecated = %s,
+                    last_checked = CURRENT_TIMESTAMP
+                WHERE url = %s
+            """, (consecutive_failures, deprecated, url))
+        else:
+            # Insert new url_status if not exists
+            cur.execute("""
+                INSERT INTO url_status (url, consecutive_failures, deprecated, last_checked)
+                VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
+            """, (url, 0 if is_valid else 1, False))
+
+    conn.commit()
+
+def is_valid_status(valid_string):
+    # Return True when the status string starts with a 2xx/3xx HTTP code
+    parts = valid_string.split()
+    if parts and parts[0].isdigit():
+        if 200 <= int(parts[0]) < 400:  # Valid HTTP status codes range
+            return True
+    return False
+
+def get_active_urls(conn):
+    with conn.cursor() as cur:
+        cur.execute("SELECT COUNT(*) FROM validation_history")
+        count = cur.fetchone()[0]
+
+        if count == 0:
+            return None  # The table is empty
+        else:
+            cur.execute("SELECT url FROM url_status WHERE NOT deprecated")
+            return [row[0] for row in cur.fetchall()]
+
+def get_all_urls(conn):
+    with conn.cursor() as cur:
+        cur.execute("SELECT COUNT(*) FROM validation_history")
+        count = cur.fetchone()[0]
+
+        if count == 0:
+            return None  # The table is empty
+        else:
+            cur.execute("SELECT DISTINCT url FROM validation_history")
+            return [row[0] for row in cur.fetchall()]
+
 def main():
     start_time = time.time() # Start timing
     # Set up the database and create the table
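A few informal checks of what `is_valid_status` treats as valid, based on the definition above (2xx and 3xx status strings pass, everything else fails):

```python
# Informal sanity checks for is_valid_status; not part of the patch itself.
assert is_valid_status("200 OK") is True
assert is_valid_status("301 Moved Permanently") is True   # redirects count as valid
assert is_valid_status("404 Not Found") is False
assert is_valid_status("500 Internal Server Error") is False
assert is_valid_status("ConnectionError") is False        # non-numeric status string
```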
@@ -151,21 +231,31 @@
         'collections/' + collection + '/items?offset',
         '?f=json'
     ]
+
+    # Get the list of active (non-deprecated) URLs
+    all_known_urls = get_all_urls(conn)
 
-    # Filter out links with the specified formats
-    filtered_links = {link for link in all_links if not any(format_to_remove in (link or "") for format_to_remove in formats_to_remove)}
-
+    if all_known_urls is None:
+        # First run on empty table, check all links
+        links_to_check = all_links
+    else:
+        # Check all known links plus any new links
+        links_to_check = set(all_known_urls) | all_links
+
     # Specify the fields to include in the CSV file
     fields_to_include = ['urlname', 'parentname', 'baseref', 'valid', 'result', 'warning', 'info', 'url', 'name']
 
     print("Checking Links...")
 
     # Run LinkChecker and process the output
-    for line in run_linkchecker(filtered_links):
+    for line in run_linkchecker(links_to_check):
         if re.match(r'^http', line):
             # Remove trailing semicolon and split by semicolon
             values = line.rstrip(';').split(';')
             filtered_values = [values[field] if field < len(values) else "" for field in range(len(fields_to_include))]
+            is_valid = False
+            if is_valid_status(filtered_values[3]):
+                is_valid = True
             # Insert the data into the PostgreSQL table for each link
             insert_query = """
             INSERT INTO linkchecker_output
@@ -174,6 +264,8 @@
             """
             cur.execute(insert_query, filtered_values)
             conn.commit()
+
+            insert_validation_history(conn, filtered_values[0], filtered_values[3], is_valid)
 
     print("LinkChecker output written to PostgreSQL database")
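To see the deprecation rule end to end, a rough sketch, assuming a reachable Postgres configured through the usual environment variables and a URL with no prior entry in `url_status` (the URL itself is a placeholder):

```python
# Rough illustration of the MAX_FAILURES deprecation rule using the helpers
# added in linkchecker.py. Note that setup_database() also recreates the
# linkchecker_output table.
conn, cur = setup_database()
for _ in range(MAX_FAILURES):
    # Each failed validation bumps consecutive_failures; after the tenth
    # consecutive failure the URL is flagged as deprecated.
    insert_validation_history(conn, "https://example.com/broken", "404 Not Found", False)

cur.execute("SELECT consecutive_failures, deprecated FROM url_status WHERE url = %s",
            ("https://example.com/broken",))
print(cur.fetchone())  # expected: (10, True)
conn.close()
```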