From 27568ac1c9571362cee4aecdc6d140c08ac5c50f Mon Sep 17 00:00:00 2001
From: Vasilis Gkoles
Date: Mon, 15 Jul 2024 16:27:44 +0300
Subject: [PATCH] Fix OGC services throwing bad request

---
 src/linkchecker.py | 146 ++++++++++++++++++++++++++++++---------------
 1 file changed, 97 insertions(+), 49 deletions(-)

diff --git a/src/linkchecker.py b/src/linkchecker.py
index de82448..77e0355 100644
--- a/src/linkchecker.py
+++ b/src/linkchecker.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
+from urllib.parse import urlparse, parse_qs, urlencode
 import subprocess
 import psycopg2
 import psycopg2.extras
@@ -119,24 +120,40 @@ def extract_links(url):
         print(f"Error extracting links from {url}: {e}")
         return []
 
-def run_linkchecker(urls):
-    for url in urls:
-        # Run LinkChecker Docker command with specified user and group IDs for each URL
-        process = subprocess.Popen([
-            "linkchecker",
-            "--verbose",
-            "--check-extern",
-            "--recursion-level=1",
-            "--timeout=5",
-            "--output=csv",
-            url + "?f=html"
-        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+def check_single_url(url):
+    process = subprocess.Popen([
+        "linkchecker",
+        "--verbose",
+        "--check-extern",
+        "--recursion-level=0",
+        "--timeout=5",
+        "--output=csv",
+        url + "?f=html"
+    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    # process.communicate() buffers all output; suitable for shorter-running processes
+    stdout, _ = process.communicate()
+
+    return stdout.decode('utf-8').strip().split('\n')
 
-        # Process the output line by line and yield each line
-        for line in process.stdout:
-            yield line.decode('utf-8').strip()  # Decode bytes to string and strip newline characters
-        # Wait for the process to finish
-        process.wait()
+def run_linkchecker(url):
+    # Run the LinkChecker command for a single URL
+    process = subprocess.Popen([
+        "linkchecker",
+        "--verbose",
+        "--check-extern",
+        "--recursion-level=1",
+        "--timeout=5",
+        "--output=csv",
+        url + "?f=html"
+    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    # Process the output line by line and yield each line
+    # Memory-efficient for large outputs
+    for line in process.stdout:
+        yield line.decode('utf-8').strip()  # Decode bytes to string and strip newline characters
+    # Wait for the process to finish
+    process.wait()
 
 
 def insert_or_update_link(conn, urlname, status, result, info, warning, is_valid):
@@ -223,6 +240,35 @@ def get_active_urls(conn):
     else:
         cur.execute("SELECT url FROM validation_history WHERE NOT deprecated")
         return [row[0] for row in cur.fetchall()]
+
+def determine_service_type(url):
+    ogc_patterns = ['/wms', '/wfs', '/csw', '/wcs', 'service=']
+
+    if any(pattern in url.lower() for pattern in ogc_patterns):
+        parsed_url = urlparse(url)
+        query_params = parse_qs(parsed_url.query)
+
+        query_params.pop('service', None)
+        query_params.pop('request', None)
+
+        query_params['request'] = ['GetCapabilities']
+
+        if 'service' not in query_params:
+            if '/wms' in parsed_url.path.lower():
+                query_params['service'] = ['WMS']
+            elif '/wfs' in parsed_url.path.lower():
+                query_params['service'] = ['WFS']
+            elif '/csw' in parsed_url.path.lower():
+                query_params['service'] = ['CSW']
+            elif '/wcs' in parsed_url.path.lower():
+                query_params['service'] = ['WCS']
+
+        new_query = urlencode(query_params, doseq=True)
+        new_url = parsed_url._replace(query=new_query).geturl()
+
+        return new_url
+
+    return url
 
 def main():
     start_time = time.time()  # Start timing
@@ -247,43 +293,45 @@ def main():
             extracted_links = extract_links(url)
             all_links.update(extracted_links)  # Add new links to the set of all links
 
-    # Define the formats to be removed
-    formats_to_remove = [
-        'collections/' + collection + '/items?offset',
-        '?f=json'
-    ]
-
     # Specify the fields to include in the CSV file
     fields_to_include = ['urlname', 'parentname', 'baseref', 'valid', 'result', 'warning', 'info']
 
     print("Checking Links...")
+
     # Run LinkChecker and process the output
-    for line in run_linkchecker(all_links):
-        if re.match(r'^http', line):
-            # Remove trailing semicolon and split by semicolon
-            values = line.rstrip(';').split(';')
-
-            # Filter and pad values based on fields_to_include
-            filtered_values = [str(values[i]) if i < len(values) else "" for i in range(len(fields_to_include))]
-
-            # Destructure filtered_values
-            urlname, parentname, baseref, valid, result, warning, info = filtered_values
-            # print(f"""
-            #     Urlname: {urlname}
-            #     parentname: {parentname}
-            #     baseref: {baseref}
-            #     valid: {valid}
-            #     result: {result}
-            #     warning: {warning}
-            #     info: {info}
-            #     Is valid: {is_valid_status(valid)}
-            # """)
-            is_valid = is_valid_status(valid)
-
-            link_id = insert_or_update_link(conn, urlname, valid, result, info, warning, is_valid)
-
-            # Insert parent information
-            insert_parent(conn, parentname, baseref, link_id)
+    urls_to_recheck = set()
+    print("Initial Link Checking...")
+    for url in all_links:
+        for line in run_linkchecker(url):
+            if re.match(r'^http', line):
+                values = line.rstrip(';').split(';')
+                urlname = values[0]
+
+                # Parse initial check results
+                filtered_values = [str(values[i]) if i < len(values) else "" for i in range(len(fields_to_include))]
+                urlname, parentname, baseref, valid, result, warning, info = filtered_values
+
+                # Determine if URL needs to be rechecked
+                processed_url = determine_service_type(urlname)
+                if processed_url != urlname:
+                    urls_to_recheck.add(processed_url)
+                else:
+                    # If URL doesn't need reprocessing, insert results directly
+                    is_valid = is_valid_status(valid)
+                    link_id = insert_or_update_link(conn, urlname, valid, result, info, warning, is_valid)
+                    insert_parent(conn, parentname, baseref, link_id)
+
+    print("Rechecking OGC processed URLs...")
+    for url in urls_to_recheck:
+        results = check_single_url(url)
+        for line in results:
+            if re.match(r'^http', line):
+                values = line.rstrip(';').split(';')
+                filtered_values = [str(values[i]) if i < len(values) else "" for i in range(len(fields_to_include))]
+                urlname, parentname, baseref, valid, result, warning, info = filtered_values
+                is_valid = is_valid_status(valid)
+                link_id = insert_or_update_link(conn, urlname, valid, result, info, warning, is_valid)
+                insert_parent(conn, parentname, baseref, link_id)
 
     # conn.commit()
     print("LinkChecker output written to PostgreSQL database")
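
Note: the core of this patch is determine_service_type, which rewrites any URL that looks like an OGC endpoint into a GetCapabilities probe (dropping the original service/request parameters, re-deriving the service name from the path, and preserving any other query parameters), which is how it avoids the bad-request responses named in the subject. Rewritten URLs are then rechecked with check_single_url at recursion level 0, so only the capability document itself is fetched. Below is a minimal usage sketch, not part of the patch: the endpoint URLs are hypothetical examples, and the import assumes src/linkchecker.py is reachable on sys.path.

    # Hypothetical usage sketch; assumes determine_service_type is
    # importable from src/linkchecker.py.
    from linkchecker import determine_service_type

    # An OGC-looking URL is rewritten into a GetCapabilities request;
    # the service name is re-derived from the /wms path segment after
    # the original service/request parameters are dropped.
    url = "https://example.com/geoserver/wms?service=WMS&request=GetMap"
    print(determine_service_type(url))
    # -> https://example.com/geoserver/wms?request=GetCapabilities&service=WMS

    # A URL matching none of the OGC patterns passes through unchanged.
    url = "https://example.com/collections/obs/items?f=html"
    print(determine_service_type(url))
    # -> https://example.com/collections/obs/items?f=html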