From dceafc9f3c696a0955c54b6187aa6e472cf691d5 Mon Sep 17 00:00:00 2001 From: Zac Romick Date: Tue, 2 Jul 2024 09:03:07 -0500 Subject: [PATCH] Added A-Z sorting and fixed long lines --- src/index.tsx | 128 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 84 insertions(+), 44 deletions(-) diff --git a/src/index.tsx b/src/index.tsx index fae6ec6..20dd094 100644 --- a/src/index.tsx +++ b/src/index.tsx @@ -1,36 +1,52 @@ import React, { useState, useEffect } from 'react'; import ReactDOM from 'react-dom/client'; -import { InfoIcon, Copy, Check } from 'lucide-react'; +import { InfoIcon, Copy, Check, SortAsc } from 'lucide-react'; const HTMLLinkScraper = () => { const [html, setHtml] = useState(''); - const [links, setLinks] = useState<(string | null)[]>([]); - const [message, setMessage] = useState(''); - const [isCopied, setIsCopied] = useState(false); + const [links, setLinks] = useState([]); + const [message, setMessage] = useState(''); + const [isCopied, setIsCopied] = useState(false); const [baseUrl, setBaseUrl] = useState(''); - const useFullUrls = true; // Always use full URLs + const [isSorted, setIsSorted] = useState(false); + const useFullUrls = true; + const maxDisplayLength = 100; - const extractBaseUrl = (html: string): string => { + const extractMostCommonBaseUrl = (html: string): string => { const parser = new DOMParser(); const doc = parser.parseFromString(html, 'text/html'); + const anchors = Array.from(doc.getElementsByTagName('a')); + const hrefs = anchors.map(a => a.getAttribute('href')).filter(Boolean) as string[]; + const fullUrls = hrefs.filter(href => href.startsWith('http')); - // Try to extract base URL from tag - const baseTag = doc.querySelector('base'); - if (baseTag && baseTag.href) { - return new URL(baseTag.href).origin; + if (fullUrls.length === 0) { + return ''; } - // Try to extract from first tag with an absolute URL - const firstLink = doc.querySelector('a[href^="http"]') as HTMLAnchorElement; - if (firstLink && firstLink.href) { - return new URL(firstLink.href).origin; + const baseUrls = fullUrls.map(url => { + try { + const parsedUrl = new URL(url); + return `${parsedUrl.protocol}//${parsedUrl.hostname}`; + } catch (error) { + return null; + } + }).filter(Boolean) as string[]; + + if (baseUrls.length === 0) { + return ''; } - // If no base URL found, return empty string - return ''; + const urlCounts = baseUrls.reduce((acc, url) => { + acc[url] = (acc[url] || 0) + 1; + return acc; + }, {} as Record); + + const mostCommonBaseUrl = Object.entries(urlCounts).reduce((a, b) => a[1] > b[1] ? a : b)[0]; + + return mostCommonBaseUrl || ''; }; - const extractLinks = (html: string, baseUrl: string, useFullUrls: boolean): (string | null)[] => { + const extractLinks = (html: string, baseUrl: string, useFullUrls: boolean) => { const parser = new DOMParser(); const doc = parser.parseFromString(html, 'text/html'); const links = Array.from(doc.getElementsByTagName('a')) @@ -39,22 +55,25 @@ const HTMLLinkScraper = () => { if (href && href.startsWith('http')) { return href; } else if (href && useFullUrls && baseUrl) { - return new URL(href, baseUrl).href; + try { + return new URL(href, baseUrl).href; + } catch { + return href; + } + } else if (href && href.startsWith('/')) { + return useFullUrls && baseUrl ? `${baseUrl}${href}` : `{base_url}${href}`; } else if (href) { - return useFullUrls && baseUrl ? new URL(href, baseUrl).href : `{base_url}${href.startsWith('/') ? '' : '/'}${href}`; + return useFullUrls && baseUrl ? `${baseUrl}/${href}` : `{base_url}/${href}`; } return null; }); - return links; + return [...new Set(links)].filter(Boolean) as string[]; }; useEffect(() => { - const extractedBaseUrl = extractBaseUrl(html); - if (extractedBaseUrl && !baseUrl) { - setBaseUrl(extractedBaseUrl); - } - - const extractedLinks = extractLinks(html, baseUrl || extractedBaseUrl, useFullUrls); + const extractedBaseUrl = extractMostCommonBaseUrl(html); + setBaseUrl(extractedBaseUrl); + const extractedLinks = extractLinks(html, extractedBaseUrl, useFullUrls); setLinks(extractedLinks); if (extractedLinks.length === 0) { setMessage(html.trim() ? 'No links were found in the provided HTML.' : ''); @@ -62,7 +81,8 @@ const HTMLLinkScraper = () => { setMessage(`${extractedLinks.length} link${extractedLinks.length === 1 ? '' : 's'} extracted.`); } setIsCopied(false); - }, [html, baseUrl, useFullUrls]); + setIsSorted(false); + }, [html, useFullUrls]); const handleCopyLinks = async () => { if (links.length === 0) { @@ -70,7 +90,7 @@ const HTMLLinkScraper = () => { return; } - const linkText = links.filter(Boolean).join('\n'); + const linkText = links.join('\n'); try { await navigator.clipboard.writeText(linkText); @@ -78,11 +98,21 @@ const HTMLLinkScraper = () => { setIsCopied(true); setTimeout(() => setIsCopied(false), 3000); } catch (err) { - console.error('Failed to copy links: ', err); setMessage('Failed to copy links. Please try again or copy manually.'); } }; + const handleSortLinks = () => { + const sortedLinks = [...links].sort((a, b) => a.localeCompare(b)); + setLinks(sortedLinks); + setIsSorted(true); + }; + + const truncateDisplayText = (text: string, maxLength: number) => { + if (text.length <= maxLength) return text; + return `${text.slice(0, maxLength - 3) }...`; + }; + const tip = `The purpose of this site is to easily scrape a webpage for links. Here's how to do it: @@ -92,9 +122,8 @@ const HTMLLinkScraper = () => { 3. Right-click on the {} tag in the Elements panel. 4. Choose "Copy" -> "Copy element". 5. Paste the copied HTML into the text box below. - 6. The base URL will be automatically extracted if possible. You can manually adjust it if needed. - Note: All URLs will be displayed as full URLs when possible.`; + Note: The base URL will be automatically extracted from the most common URL in the HTML.`; return (
@@ -107,8 +136,9 @@ const HTMLLinkScraper = () => { type="text" value={baseUrl} onChange={(e) => setBaseUrl(e.target.value)} - placeholder="Base URL (auto-detected or enter manually)" + placeholder="Base URL (automatically detected)" className="w-full px-3 py-2 text-gray-700 border rounded-lg focus:outline-none" + readOnly />