From dceafc9f3c696a0955c54b6187aa6e472cf691d5 Mon Sep 17 00:00:00 2001
From: Zac Romick <zac.romick@choreograph.com>
Date: Tue, 2 Jul 2024 09:03:07 -0500
Subject: [PATCH] Added A-Z sorting and fixed long lines

---
 src/index.tsx | 128 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 84 insertions(+), 44 deletions(-)
diff --git a/src/index.tsx b/src/index.tsx
index fae6ec6..20dd094 100644
--- a/src/index.tsx
+++ b/src/index.tsx
@@ -1,36 +1,52 @@
 import React, { useState, useEffect } from 'react';
 import ReactDOM from 'react-dom/client';
-import { InfoIcon, Copy, Check } from 'lucide-react';
+import { InfoIcon, Copy, Check, SortAsc } from 'lucide-react';
 
 const HTMLLinkScraper = () => {
   const [html, setHtml] = useState<string>('');
-  const [links, setLinks] = useState<(string | null)[]>([]);
-  const [message, setMessage] = useState<string>('');
-  const [isCopied, setIsCopied] = useState<boolean>(false);
+  const [links, setLinks] = useState<string[]>([]);
+  const [message, setMessage] = useState('');
+  const [isCopied, setIsCopied] = useState(false);
   const [baseUrl, setBaseUrl] = useState<string>('');
-  const useFullUrls = true; // Always use full URLs
+  const [isSorted, setIsSorted] = useState(false);
+  const useFullUrls = true;
+  const maxDisplayLength = 100;
 
-  const extractBaseUrl = (html: string): string => {
+  const extractMostCommonBaseUrl = (html: string): string => {
     const parser = new DOMParser();
     const doc = parser.parseFromString(html, 'text/html');
+    const anchors = Array.from(doc.getElementsByTagName('a'));
+    const hrefs = anchors.map(a => a.getAttribute('href')).filter(Boolean) as string[];
+    const fullUrls = hrefs.filter(href => href.startsWith('http'));
 
-    // Try to extract base URL from <base> tag
-    const baseTag = doc.querySelector('base');
-    if (baseTag && baseTag.href) {
-      return new URL(baseTag.href).origin;
+    if (fullUrls.length === 0) {
+      return '';
     }
 
-    // Try to extract from first <a> tag with an absolute URL
-    const firstLink = doc.querySelector('a[href^="http"]') as HTMLAnchorElement;
-    if (firstLink && firstLink.href) {
-      return new URL(firstLink.href).origin;
+    const baseUrls = fullUrls.map(url => {
+      try {
+        const parsedUrl = new URL(url);
+        return `${parsedUrl.protocol}//${parsedUrl.hostname}`;
+      } catch (error) {
+        return null;
+      }
+    }).filter(Boolean) as string[];
+
+    if (baseUrls.length === 0) {
+      return '';
     }
 
-    // If no base URL found, return empty string
-    return '';
+    const urlCounts = baseUrls.reduce((acc, url) => {
+      acc[url] = (acc[url] || 0) + 1;
+      return acc;
+    }, {} as Record<string, number>);
+
+    const mostCommonBaseUrl = Object.entries(urlCounts).reduce((a, b) => a[1] > b[1] ? a : b)[0];
+
+    return mostCommonBaseUrl || '';
   };
 
-  const extractLinks = (html: string, baseUrl: string, useFullUrls: boolean): (string | null)[] => {
+  const extractLinks = (html: string, baseUrl: string, useFullUrls: boolean) => {
     const parser = new DOMParser();
     const doc = parser.parseFromString(html, 'text/html');
     const links = Array.from(doc.getElementsByTagName('a'))
@@ -39,22 +55,25 @@ const HTMLLinkScraper = () => {
         if (href && href.startsWith('http')) {
           return href;
         } else if (href && useFullUrls && baseUrl) {
-          return new URL(href, baseUrl).href;
+          try {
+            return new URL(href, baseUrl).href;
+          } catch {
+            return href;
+          }
+        } else if (href && href.startsWith('/')) {
+          return useFullUrls && baseUrl ? `${baseUrl}${href}` : `{base_url}${href}`;
         } else if (href) {
-          return useFullUrls && baseUrl ? new URL(href, baseUrl).href : `{base_url}${href.startsWith('/') ? '' : '/'}${href}`;
+          return useFullUrls && baseUrl ? `${baseUrl}/${href}` : `{base_url}/${href}`;
         }
         return null;
       });
-    return links;
+    return [...new Set(links)].filter(Boolean) as string[];
   };
 
   useEffect(() => {
-    const extractedBaseUrl = extractBaseUrl(html);
-    if (extractedBaseUrl && !baseUrl) {
-      setBaseUrl(extractedBaseUrl);
-    }
-
-    const extractedLinks = extractLinks(html, baseUrl || extractedBaseUrl, useFullUrls);
+    const extractedBaseUrl = extractMostCommonBaseUrl(html);
+    setBaseUrl(extractedBaseUrl);
+    const extractedLinks = extractLinks(html, extractedBaseUrl, useFullUrls);
     setLinks(extractedLinks);
     if (extractedLinks.length === 0) {
       setMessage(html.trim() ? 'No links were found in the provided HTML.' : '');
@@ -62,7 +81,8 @@ const HTMLLinkScraper = () => {
       setMessage(`${extractedLinks.length} link${extractedLinks.length === 1 ? '' : 's'} extracted.`);
     }
     setIsCopied(false);
-  }, [html, baseUrl, useFullUrls]);
+    setIsSorted(false);
+  }, [html, useFullUrls]);
 
   const handleCopyLinks = async () => {
     if (links.length === 0) {
@@ -70,7 +90,7 @@ const HTMLLinkScraper = () => {
       return;
     }
 
-    const linkText = links.filter(Boolean).join('\n');
+    const linkText = links.join('\n');
 
     try {
       await navigator.clipboard.writeText(linkText);
@@ -78,11 +98,21 @@ const HTMLLinkScraper = () => {
       setIsCopied(true);
       setTimeout(() => setIsCopied(false), 3000);
     } catch (err) {
-      console.error('Failed to copy links: ', err);
       setMessage('Failed to copy links. Please try again or copy manually.');
     }
   };
 
+  const handleSortLinks = () => {
+    const sortedLinks = [...links].sort((a, b) => a.localeCompare(b));
+    setLinks(sortedLinks);
+    setIsSorted(true);
+  };
+
+  const truncateDisplayText = (text: string, maxLength: number) => {
+    if (text.length <= maxLength) return text;
+    return `${text.slice(0, maxLength - 3) }...`;
+  };
+
   const tip = `The purpose of this site is to easily scrape a webpage for links.
 
   Here's how to do it:
@@ -92,9 +122,8 @@ const HTMLLinkScraper = () => {
   3. Right-click on the {<body>} tag in the Elements panel.
   4. Choose "Copy" -> "Copy element".
   5. Paste the copied HTML into the text box below.
-  6. The base URL will be automatically extracted if possible. You can manually adjust it if needed.
 
-  Note: All URLs will be displayed as full URLs when possible.`;
+  Note: The base URL will be automatically extracted from the most common URL in the HTML.`;
 
   return (
     <div className="w-full max-w-2xl mx-auto bg-white shadow-md rounded px-8 pt-6 pb-8 mb-4">
@@ -107,8 +136,9 @@ const HTMLLinkScraper = () => {
           type="text"
           value={baseUrl}
           onChange={(e) => setBaseUrl(e.target.value)}
-          placeholder="Base URL (auto-detected or enter manually)"
+          placeholder="Base URL (automatically detected)"
           className="w-full px-3 py-2 text-gray-700 border rounded-lg focus:outline-none"
+          readOnly
         />
       </div>
       <textarea
@@ -120,29 +150,40 @@ const HTMLLinkScraper = () => {
       />
       <div className="flex justify-between items-center mt-4">
         <span className="text-sm text-gray-500">{message}</span>
-        <button
-          onClick={handleCopyLinks}
-          disabled={links.length === 0}
-          className="bg-blue-500 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline disabled:opacity-50"
-        >
-          {isCopied ? <Check className="inline-block mr-2" /> : <Copy className="inline-block mr-2" />}
-          {isCopied ? 'Copied!' : 'Copy Links'}
-        </button>
+        <div>
+          <button
+            onClick={handleSortLinks}
+            disabled={links.length === 0 || isSorted}
+            className="bg-green-500 hover:bg-green-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline disabled:opacity-50 mr-2"
+          >
+            <SortAsc className="inline-block mr-2" />
+            Sort Links A-Z
+          </button>
+          <button
+            onClick={handleCopyLinks}
+            disabled={links.length === 0}
+            className="bg-blue-500 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline disabled:opacity-50"
+          >
+            {isCopied ? <Check className="inline-block mr-2" /> : <Copy className="inline-block mr-2" />}
+            {isCopied ? 'Copied!' : 'Copy Links'}
+          </button>
+        </div>
       </div>
       {links.length > 0 && (
         <div className="mt-4">
           <h3 className="text-xl font-semibold mb-2">Extracted Links:</h3>
           <ul className="list-disc pl-5">
-            {links.filter(Boolean).map((link, index) => (
+            {links.map((link, index) => (
               <li key={index} className="mb-2">
                 <a
-                  href={link || '#'}
+                  href={link}
                   target="_blank"
                   rel="noopener noreferrer"
                   className="text-blue-500 hover:underline break-words"
                   style={{ marginLeft: '30px', display: 'inline-block', maxWidth: 'calc(100% - 30px)' }}
+                  title={link}
                 >
-                  {link}
+                  {truncateDisplayText(link, maxDisplayLength)}
                 </a>
               </li>
             ))}
@@ -153,7 +194,6 @@ const HTMLLinkScraper = () => {
   );
 };
 
-
 const App = () => (
   <div>
     <h1 className="text-3xl font-bold text-center mb-8">HTML Link Scraper</h1>