Skip to content

Commit

Permalink
Update API to return the record for each URL
Browse files Browse the repository at this point in the history
  • Loading branch information
vgole001 committed Oct 24, 2024
1 parent 2b17b1b commit ba49c61
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 19 deletions.
13 changes: 4 additions & 9 deletions linkcheck/linkchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@

# format catalogue path with f-string
catalogue_json_url= f"{base}/collections/{collection}/items?f=json"

catalogue_domain = f"{base}/collections/{collection}/items/"
catalogue_domain= f"{base}/collections/{collection}/items/"

class URLChecker:
def __init__(self, timeout=TIMEOUT):
Expand Down Expand Up @@ -248,7 +247,6 @@ def extract_relevant_links_from_json(json_url):
data = response.json()
links_map = {} # Dictionary to store URL:record_id pairs

# Handle features array
features = data.get('features', [])
if features:
for feature in features:
Expand All @@ -257,7 +255,7 @@ def extract_relevant_links_from_json(json_url):
if record_id:
# Create a temporary set to store links for this feature
feature_links = set()
# Process links array if it exists
# Process links array
for link in feature.get('links', []):
process_item(link, feature_links)
# Add all links from this feature to the map with their record ID
Expand All @@ -271,7 +269,6 @@ def extract_relevant_links_from_json(json_url):

def process_item(item, relevant_links):
if isinstance(item, dict) and 'href' in item and item['href'] not in [None, '', 'null']:
# Make sure href is actually a URL
if item['href'].startswith('http'):
if 'rel' in item and item['rel'] not in [None,''] and item['rel'].lower() in ['collection', 'self', 'root', 'prev', 'next', 'canonical']:
None
Expand All @@ -284,13 +281,11 @@ def main():
conn, cur = setup_database()
url_checker = URLChecker()

base_url = 'https://catalogue.ejpsoil.eu/collections/metadata:main/items?offset='
catalogue_json_url = 'https://catalogue.ejpsoil.eu/collections/metadata:main/items?f=json'

base_url = base + 'collections/' + collection + '/items?offset='
total_pages, items_per_page = get_pagination_info(catalogue_json_url)

print('Extracting links from catalogue...')
url_record_map = {} # Master dictionary to store URL to record_id mapping
url_record_map = {} # Dictionary to store URL to record_id mapping

# Process catalogue pages
for page in range(total_pages):
Expand Down
29 changes: 19 additions & 10 deletions src/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,16 @@ class LinkResponse(BaseModel):
consecutive_failures: Optional[int] = None

class StatusResponse(LinkResponse):
status_code: Optional[int] = None
is_redirect: Optional[bool] = None
status_code: Optional[int] = None
record_id: Optional[str] = None
is_redirect: Optional[bool] = None
error_message: Optional[str] = None
timestamp: datetime

class TimeoutResponse(LinkResponse):
status_code: Optional[int] = None # Make status_code optional for timeout cases
final_url: Optional[str] = None
record_id: Optional[str] = None
is_redirect: Optional[bool] = None
error_message: Optional[str] = None
timestamp: datetime
Expand All @@ -68,8 +70,9 @@ async def fetch_data(query: str, values: dict = {}):
async def get_redirection_statuses():
query = """
SELECT l.id_link, l.urlname, l.deprecated, l.consecutive_failures,
vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
r.record_id, vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
FROM links l
JOIN records r ON l.fk_record = r.id
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE vh.status_code = ANY(:statuses)
AND vh.timestamp = (
Expand All @@ -86,8 +89,9 @@ async def get_redirection_statuses():
async def get_client_error_statuses():
query = """
SELECT l.id_link, l.urlname, l.deprecated, l.consecutive_failures,
vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
r.record_id, vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
FROM links l
JOIN records r ON l.fk_record = r.id
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE vh.status_code = ANY(:statuses)
AND vh.timestamp = (
Expand All @@ -104,8 +108,9 @@ async def get_client_error_statuses():
async def get_server_error_statuses():
query = """
SELECT l.id_link, l.urlname, l.deprecated, l.consecutive_failures,
vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
r.record_id, vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
FROM links l
JOIN records r ON l.fk_record = r.id
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE vh.status_code = ANY(:statuses)
AND vh.timestamp = (
Expand All @@ -122,8 +127,9 @@ async def get_server_error_statuses():
async def get_status_for_url(item):
query = """
SELECT l.id_link, l.urlname, l.deprecated, l.consecutive_failures,
vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
r.record_id, vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
FROM links l
JOIN records r ON l.fk_record = r.id
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE l.urlname = :item
AND vh.timestamp = (
Expand All @@ -140,8 +146,9 @@ async def get_status_for_url(item):
async def get_timeout_urls():
query = """
SELECT l.id_link, l.urlname, l.deprecated, l.consecutive_failures,
vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
r.record_id, vh.status_code, vh.is_redirect, vh.error_message, vh.timestamp
FROM links l
JOIN records r ON l.fk_record = r.id
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE (vh.error_message LIKE '%ReadTimeout%' OR vh.error_message LIKE '%ConnectTimeout%')
AND vh.timestamp = (
Expand All @@ -156,8 +163,9 @@ async def get_timeout_urls():
@app.get('/Deprecated_URLs', response_model=List[LinkResponse])
async def get_deprecated_urls():
query = """
SELECT id_link, urlname, deprecated, consecutive_failures
SELECT id_link, urlname, r.record_id, deprecated, consecutive_failures
FROM links
JOIN records r ON l.fk_record = r.id
WHERE deprecated IS TRUE
"""
data = await fetch_data(query=query)
Expand All @@ -173,15 +181,16 @@ async def get_url_status_history(
l.id_link,
l.urlname,
l.deprecated,
r.record_id,
l.consecutive_failures,
vh.status_code,
vh.is_redirect,
vh.error_message,
vh.timestamp
FROM
links l
JOIN
validation_history vh ON l.id_link = vh.fk_link
JOIN records r ON l.fk_record = r.id
JOIN validation_history vh ON l.id_link = vh.fk_link
WHERE
l.urlname = :url
ORDER BY
Expand Down

0 comments on commit ba49c61

Please sign in to comment.