Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get url availability over time #18

Merged
merged 3 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,28 @@ Run the command below
* python -m uvicorn api:app --reload --host 0.0.0.0 --port 8000
To view the service of the FastAPI on [http://127.0.0.1:8000/docs]

# Get current URL Status History
This endpoint returns the history of a specific URL.
Let's say we have the status of a specific URL over time:

| id | url | validation_result | timestamp |
|-----|------------------------|-------------------|-------------------------|
| 1 | https://example.com | 200 OK | 2023-01-01 10:00:00+00 |
| 2 | https://wikipedia.com | 404 Not Found | 2023-01-01 10:00:05+00 |
| 3 | https://example.com | 200 OK | 2023-01-02 11:00:00+00 |
| 4 | https://wikipedia.com | 500 Server Error | 2023-01-02 11:00:05+00 |
| 5 | https://wikipedia.com | 200 OK | 2023-01-02 11:00:10+00 |

Running the `/Single_url_status_history` endpoint for
https://wikipedia.com with limit = 2 fetches the following result:

| id | url | validation_result | timestamp |
|-----|------------------------|-------------------|-------------------------|
| 1 | https://wikipedia.com | 500 Server Error | 2023-01-02 11:00:05+00 |
| 2 | https://wikipedia.com | 404 Not Found | 2023-01-01 10:00:05+00 |

This is the URL's history in descending order by datetime.

# Docker
A Docker instance must be running for the linkchecker command to work.

Expand Down
Binary file modified src/__pycache__/api.cpython-311.pyc
Binary file not shown.
107 changes: 99 additions & 8 deletions src/api.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from fastapi import FastAPI, HTTPException
from fastapi import FastAPI, HTTPException, Query
from dotenv import load_dotenv
from databases import Database
from typing import List
from typing import List, Optional
from pydantic import BaseModel
from urllib.parse import unquote
from datetime import datetime
import asyncpg
import logging
import os

# Load environment variables from .env file
Expand All @@ -20,15 +22,25 @@

# FastAPI app instance
app = FastAPI()
logger = logging.getLogger(__name__)

# Define response model
class StatusResponse(BaseModel):
    """One row from the linkchecker_output table.

    Every column except the primary key may be NULL in the database,
    so all other fields are Optional.
    """
    id: int  # primary key of the linkchecker_output row
    urlname: Optional[str]  # the checked URL
    parentname: Optional[str]  # page on which the URL was found
    valid: Optional[str]  # validity string reported by linkchecker
    warning: Optional[str]  # warning text, if any

# Response model for the availability history of a URL.
# Field names must match the column aliases produced by the history
# queries (url, parent_url, result, warning, validation_valid,
# last_checked) — a mismatched name is silently left as None, which is
# exactly what the earlier `perent_url` / `lastChecked` typos caused.
class URLAvailabilityResponse(BaseModel):
    url: Optional[str]  # the checked URL
    parent_url: Optional[str]  # page on which the URL was found
    validation_valid: Optional[str]  # validation_result from validation_history
    result: Optional[str]  # result column from linkchecker_output
    warning: Optional[str]  # warning text, if any
    last_checked: Optional[datetime]  # timestamp of the validation run

# Define status lists
REDIRECTION_STATUSES = [
"301 Moved Permanently",
Expand Down Expand Up @@ -98,6 +110,85 @@ async def get_status_for_url(item):
data = await fetch_data(query=query, values={'item': decoded_item })
return data

@app.get("/Single_url_status_history", response_model=List[URLAvailabilityResponse])
async def get_current_url_status_history(
    url: str = Query(..., description="URL to get availability history for"),
    limit: int = Query(100, ge=1, le=1000, description="Maximum number of results (default: 100, min: 1, max: 1000)")) -> List[URLAvailabilityResponse]:
    """Return the validation history of a single URL, newest first.

    Joins every validation_history row for *url* with its current
    linkchecker_output record, ordered by validation timestamp
    descending and capped at `limit` rows — e.g. limit=2 yields the
    two most recent validations, per the README example.

    Raises:
        HTTPException: 500 with the underlying error message on any
            database or serialization failure.
    """
    # NOTE: no ROW_NUMBER()/rn = 1 filter here — that would collapse the
    # result to only the latest validation, but this endpoint returns
    # the URL's *history*. ORDER BY makes LIMIT deterministic (newest
    # rows first) instead of returning arbitrary rows.
    query = """
    SELECT
        lo.urlname AS url,
        lo.parentname AS parent_url,
        lo.result AS result,
        lo.warning AS warning,
        vh.validation_result AS validation_valid,
        vh.timestamp AS last_checked
    FROM
        linkchecker_output lo
    JOIN
        validation_history vh ON lo.urlname = vh.url
    WHERE
        lo.urlname = :url
    ORDER BY
        vh.timestamp DESC
    LIMIT :limit
    """

    try:
        results = await fetch_data(query=query, values={'url': url, 'limit': limit})
        logger.info(f"Query returned {len(results)} results.")
        return [URLAvailabilityResponse(**dict(row)) for row in results]
    except Exception as e:
        # Surface any failure as a 500 so the client sees the cause.
        logger.error(f"Error occurred: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/All_url_status_history", response_model=List[URLAvailabilityResponse])
async def get_all_url_status_history(
    limit: int = Query(100, ge=1, le=1000, description="Maximum number of results (default: 100, min: 1, max: 1000)")) -> List[URLAvailabilityResponse]:
    """Return the latest validation of every known URL, newest first.

    The ROW_NUMBER() window keeps only the most recent
    validation_history row per URL (rn = 1), joined with its
    linkchecker_output record, ordered by that timestamp descending
    and capped at `limit` rows.

    Raises:
        HTTPException: 500 with the underlying error message on any
            database or serialization failure.
    """
    query = """
    SELECT
        lo.urlname AS url,
        lo.parentname AS parent_url,
        lo.result AS result,
        lo.warning AS warning,
        vh.validation_result AS validation_valid,
        vh.timestamp AS last_checked
    FROM
        linkchecker_output lo
    JOIN (
        SELECT
            url,
            validation_result,
            timestamp,
            ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) AS rn
        FROM
            validation_history
    ) vh ON lo.urlname = vh.url AND vh.rn = 1
    ORDER BY
        vh.timestamp DESC
    LIMIT :limit
    """

    try:
        results = await fetch_data(query=query, values={"limit": limit})
        # Use the module-level logger (not the root logging module) for
        # consistency with the other endpoints in this file.
        logger.info(f"Query returned {len(results)} results.")
        return [URLAvailabilityResponse(**dict(row)) for row in results]
    except Exception as e:
        logger.error(f"Error occurred: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

# Start the application
@app.on_event('startup')
async def startup():
Expand Down
28 changes: 27 additions & 1 deletion src/linkchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,23 @@ def setup_database():
)
"""
cur.execute(create_table_query)


# Check if the validation_history table exists
cur.execute("SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_name = 'validation_history')")
validation_history_table_exists = cur.fetchone()[0]

if not validation_history_table_exists:
# Create the validation_history table if it doesn't exist
create_validation_history_table = """
CREATE TABLE validation_history (
id SERIAL PRIMARY KEY,
url TEXT NOT NULL,
validation_result TEXT NOT NULL,
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
cur.execute(create_validation_history_table)

# Commit the changes
conn.commit()

Expand Down Expand Up @@ -121,6 +137,14 @@ def run_linkchecker(urls):
# Wait for the process to finish
process.wait()

def insert_validation_history(conn, url, validation_result):
    """Record one validation outcome for *url* in validation_history.

    The table's timestamp column defaults to CURRENT_TIMESTAMP, so only
    the URL and its validation result are supplied; the change is
    committed immediately.
    """
    insert_sql = (
        "INSERT INTO validation_history (url, validation_result) "
        "VALUES (%s, %s)"
    )
    cursor = conn.cursor()
    try:
        cursor.execute(insert_sql, (url, validation_result))
    finally:
        cursor.close()
    conn.commit()

def main():
start_time = time.time() # Start timing
# Set up the database and create the table
Expand Down Expand Up @@ -172,6 +196,8 @@ def main():
"""
cur.execute(insert_query, filtered_values)
conn.commit()

insert_validation_history(conn, filtered_values[0], filtered_values[3])

print("LinkChecker output written to PostgreSQL database")

Expand Down