Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added A-Z sorting and fixed long lines #2

Merged
merged 1 commit into from
Jul 2, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 84 additions & 44 deletions src/index.tsx
Original file line number Diff line number Diff line change
@@ -1,36 +1,52 @@
import React, { useState, useEffect } from 'react';
import ReactDOM from 'react-dom/client';
import { InfoIcon, Copy, Check } from 'lucide-react';
import { InfoIcon, Copy, Check, SortAsc } from 'lucide-react';

const HTMLLinkScraper = () => {
const [html, setHtml] = useState<string>('');
const [links, setLinks] = useState<(string | null)[]>([]);
const [message, setMessage] = useState<string>('');
const [isCopied, setIsCopied] = useState<boolean>(false);
const [links, setLinks] = useState<string[]>([]);
const [message, setMessage] = useState('');
const [isCopied, setIsCopied] = useState(false);
const [baseUrl, setBaseUrl] = useState<string>('');
const useFullUrls = true; // Always use full URLs
const [isSorted, setIsSorted] = useState(false);
const useFullUrls = true;
const maxDisplayLength = 100;

const extractBaseUrl = (html: string): string => {
const extractMostCommonBaseUrl = (html: string): string => {
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/html');
const anchors = Array.from(doc.getElementsByTagName('a'));
const hrefs = anchors.map(a => a.getAttribute('href')).filter(Boolean) as string[];
const fullUrls = hrefs.filter(href => href.startsWith('http'));

// Try to extract base URL from <base> tag
const baseTag = doc.querySelector('base');
if (baseTag && baseTag.href) {
return new URL(baseTag.href).origin;
if (fullUrls.length === 0) {
return '';
}

// Try to extract from first <a> tag with an absolute URL
const firstLink = doc.querySelector('a[href^="http"]') as HTMLAnchorElement;
if (firstLink && firstLink.href) {
return new URL(firstLink.href).origin;
const baseUrls = fullUrls.map(url => {
try {
const parsedUrl = new URL(url);
return `${parsedUrl.protocol}//${parsedUrl.hostname}`;
} catch (error) {
return null;
}
}).filter(Boolean) as string[];

if (baseUrls.length === 0) {
return '';
}

// If no base URL found, return empty string
return '';
const urlCounts = baseUrls.reduce((acc, url) => {
acc[url] = (acc[url] || 0) + 1;
return acc;
}, {} as Record<string, number>);

const mostCommonBaseUrl = Object.entries(urlCounts).reduce((a, b) => a[1] > b[1] ? a : b)[0];

return mostCommonBaseUrl || '';
};

const extractLinks = (html: string, baseUrl: string, useFullUrls: boolean): (string | null)[] => {
const extractLinks = (html: string, baseUrl: string, useFullUrls: boolean) => {
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/html');
const links = Array.from(doc.getElementsByTagName('a'))
Expand All @@ -39,50 +55,64 @@ const HTMLLinkScraper = () => {
if (href && href.startsWith('http')) {
return href;
} else if (href && useFullUrls && baseUrl) {
return new URL(href, baseUrl).href;
try {
return new URL(href, baseUrl).href;
} catch {
return href;
}
} else if (href && href.startsWith('/')) {
return useFullUrls && baseUrl ? `${baseUrl}${href}` : `{base_url}${href}`;
} else if (href) {
return useFullUrls && baseUrl ? new URL(href, baseUrl).href : `{base_url}${href.startsWith('/') ? '' : '/'}${href}`;
return useFullUrls && baseUrl ? `${baseUrl}/${href}` : `{base_url}/${href}`;
}
return null;
});
return links;
return [...new Set(links)].filter(Boolean) as string[];
};

useEffect(() => {
const extractedBaseUrl = extractBaseUrl(html);
if (extractedBaseUrl && !baseUrl) {
setBaseUrl(extractedBaseUrl);
}

const extractedLinks = extractLinks(html, baseUrl || extractedBaseUrl, useFullUrls);
const extractedBaseUrl = extractMostCommonBaseUrl(html);
setBaseUrl(extractedBaseUrl);
const extractedLinks = extractLinks(html, extractedBaseUrl, useFullUrls);
setLinks(extractedLinks);
if (extractedLinks.length === 0) {
setMessage(html.trim() ? 'No links were found in the provided HTML.' : '');
} else {
setMessage(`${extractedLinks.length} link${extractedLinks.length === 1 ? '' : 's'} extracted.`);
}
setIsCopied(false);
}, [html, baseUrl, useFullUrls]);
setIsSorted(false);
}, [html, useFullUrls]);

const handleCopyLinks = async () => {
if (links.length === 0) {
setMessage('No links to copy.');
return;
}

const linkText = links.filter(Boolean).join('\n');
const linkText = links.join('\n');

try {
await navigator.clipboard.writeText(linkText);
setMessage('Links copied to clipboard!');
setIsCopied(true);
setTimeout(() => setIsCopied(false), 3000);
} catch (err) {
console.error('Failed to copy links: ', err);
setMessage('Failed to copy links. Please try again or copy manually.');
}
};

const handleSortLinks = () => {
const sortedLinks = [...links].sort((a, b) => a.localeCompare(b));
setLinks(sortedLinks);
setIsSorted(true);
};

const truncateDisplayText = (text: string, maxLength: number) => {
if (text.length <= maxLength) return text;
return `${text.slice(0, maxLength - 3) }...`;
};

const tip = `The purpose of this site is to easily scrape a webpage for links.

Here's how to do it:
Expand All @@ -92,9 +122,8 @@ const HTMLLinkScraper = () => {
3. Right-click on the {<body>} tag in the Elements panel.
4. Choose "Copy" -> "Copy element".
5. Paste the copied HTML into the text box below.
6. The base URL will be automatically extracted if possible. You can manually adjust it if needed.

Note: All URLs will be displayed as full URLs when possible.`;
Note: The base URL will be automatically extracted from the most common URL in the HTML.`;

return (
<div className="w-full max-w-2xl mx-auto bg-white shadow-md rounded px-8 pt-6 pb-8 mb-4">
Expand All @@ -107,8 +136,9 @@ const HTMLLinkScraper = () => {
type="text"
value={baseUrl}
onChange={(e) => setBaseUrl(e.target.value)}
placeholder="Base URL (auto-detected or enter manually)"
placeholder="Base URL (automatically detected)"
className="w-full px-3 py-2 text-gray-700 border rounded-lg focus:outline-none"
readOnly
/>
</div>
<textarea
Expand All @@ -120,29 +150,40 @@ const HTMLLinkScraper = () => {
/>
<div className="flex justify-between items-center mt-4">
<span className="text-sm text-gray-500">{message}</span>
<button
onClick={handleCopyLinks}
disabled={links.length === 0}
className="bg-blue-500 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline disabled:opacity-50"
>
{isCopied ? <Check className="inline-block mr-2" /> : <Copy className="inline-block mr-2" />}
{isCopied ? 'Copied!' : 'Copy Links'}
</button>
<div>
<button
onClick={handleSortLinks}
disabled={links.length === 0 || isSorted}
className="bg-green-500 hover:bg-green-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline disabled:opacity-50 mr-2"
>
<SortAsc className="inline-block mr-2" />
Sort Links A-Z
</button>
<button
onClick={handleCopyLinks}
disabled={links.length === 0}
className="bg-blue-500 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline disabled:opacity-50"
>
{isCopied ? <Check className="inline-block mr-2" /> : <Copy className="inline-block mr-2" />}
{isCopied ? 'Copied!' : 'Copy Links'}
</button>
</div>
</div>
{links.length > 0 && (
<div className="mt-4">
<h3 className="text-xl font-semibold mb-2">Extracted Links:</h3>
<ul className="list-disc pl-5">
{links.filter(Boolean).map((link, index) => (
{links.map((link, index) => (
<li key={index} className="mb-2">
<a
href={link || '#'}
href={link}
target="_blank"
rel="noopener noreferrer"
className="text-blue-500 hover:underline break-words"
style={{ marginLeft: '30px', display: 'inline-block', maxWidth: 'calc(100% - 30px)' }}
title={link}
>
{link}
{truncateDisplayText(link, maxDisplayLength)}
</a>
</li>
))}
Expand All @@ -153,7 +194,6 @@ const HTMLLinkScraper = () => {
);
};


const App = () => (
<div>
<h1 className="text-3xl font-bold text-center mb-8">HTML Link Scraper</h1>
Expand Down