-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharchive_links.py
100 lines (82 loc) · 3.58 KB
/
archive_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
import re
import requests
from datetime import datetime, timedelta
import yaml
# Path to your Hugo site content folder. It is resolved against the current
# working directory, so the script is expected to be run from the site root.
content_path = os.path.join(os.getcwd(), "content")

# Regular expression to find markdown links: [link_text](link_url)
# Group 1 is the http(s) URL. The URL ends at the first whitespace or ")" so
# that an optional link title -- [text](https://example.com "Title") -- is
# not captured as part of the URL.
md_link_pattern = re.compile(r'\[.*?\]\((https?://[^\s)]+)')

# Regular expression to parse frontmatter in Hugo markdown files.
# Only YAML ("---" delimited) frontmatter at the very start of the file is
# recognised. Group 1 is the text between the delimiter lines. The closing
# delimiter must sit at the start of its own line, so a "---" that appears
# inside a value (e.g. title: "a --- b") does not end the frontmatter early.
frontmatter_pattern = re.compile(
    r'\A---[ \t]*\r?\n(.*?)^---[ \t]*\r?$',
    re.DOTALL | re.MULTILINE,
)

# Wayback Machine URL prefix; links that already start with it are skipped.
wayback_machine_prefix = "https://web.archive.org/"
def parse_frontmatter(md_file):
    """
    Parse the frontmatter of a markdown file to extract the post date.

    Args:
        md_file: Path of the markdown file to read (decoded as UTF-8).

    Returns:
        A naive ``datetime`` for the post's ``date`` field, or ``None`` when
        the file has no frontmatter, the frontmatter is not a valid YAML
        mapping, or it carries no usable ``date`` value. Timezone-aware
        values are converted to UTC and returned naive so they can be
        compared with other naive datetimes.
    """
    # Function-scope import: the module's top-level import only brings in
    # ``datetime`` and ``timedelta``.
    from datetime import date, timezone

    with open(md_file, 'r', encoding='utf-8') as f:
        content = f.read()

    frontmatter_match = frontmatter_pattern.match(content)
    if not frontmatter_match:
        return None

    try:
        # Load frontmatter as YAML and extract the 'date' field
        frontmatter_data = yaml.safe_load(frontmatter_match.group(1))
    except yaml.YAMLError:
        return None

    # Empty frontmatter loads as None and a bare scalar as str/int; only a
    # mapping can hold a 'date' key.
    if not isinstance(frontmatter_data, dict):
        return None

    post_date = frontmatter_data.get('date')
    if not post_date:
        return None

    if isinstance(post_date, str):
        # Quoted dates reach us as text. Try the plain date format first,
        # then fall back to full ISO 8601 timestamps.
        text = post_date.strip()
        try:
            post_date = datetime.strptime(text, '%Y-%m-%d')
        except ValueError:
            if text.endswith(('Z', 'z')):
                # Older Python versions do not accept the "Z" suffix.
                text = text[:-1] + '+00:00'
            try:
                post_date = datetime.fromisoformat(text)
            except ValueError:
                return None

    # Unquoted YAML timestamps are already date/datetime objects.
    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(post_date, datetime):
        if post_date.tzinfo is not None:
            post_date = post_date.astimezone(timezone.utc).replace(tzinfo=None)
        return post_date
    if isinstance(post_date, date):
        return datetime(post_date.year, post_date.month, post_date.day)

    # Any other type (int, list, ...) is not a usable date.
    return None
def query_wayback_machine(url, timestamp=None, timeout=10):
    """
    Query the Wayback Machine API to check if a URL has been archived.

    Args:
        url: The page URL to look up.
        timestamp: Optional ``YYYYMMDDhhmmss`` string. When given, the API is
            asked for the snapshot closest to that moment; when omitted the
            API chooses the snapshot itself (documented as the most recent).
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The snapshot's ``YYYYMMDDhhmmss`` timestamp string, or ``None`` when
        no available snapshot is reported or the request fails.
    """
    # Pass the target as a query parameter so requests percent-encodes it;
    # a raw "&" or "#" in the URL would otherwise corrupt the query string.
    params = {'url': url}
    if timestamp:
        params['timestamp'] = timestamp

    try:
        response = requests.get(
            "https://archive.org/wayback/available",
            params=params,
            timeout=timeout,
        )
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # error is not a RequestException.
        print(f"Error querying Wayback Machine for {url}: {e}")
        return None

    if not isinstance(data, dict):
        return None

    # "archived_snapshots" is an empty mapping when nothing is archived.
    closest_snapshot = (data.get('archived_snapshots') or {}).get('closest') or {}
    if closest_snapshot.get('available'):
        return closest_snapshot.get('timestamp')
    return None


def find_external_links(md_file):
    """
    Parse a markdown file to find external links that don't point to the Wayback Machine.

    Args:
        md_file: Path of the markdown file to read (decoded as UTF-8).

    Returns:
        The URLs captured by ``md_link_pattern``, in document order and with
        duplicates kept, excluding those that start with
        ``wayback_machine_prefix``.
    """
    with open(md_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Find all URLs in markdown file, then filter out Wayback Machine URLs
    return [
        url
        for url in md_link_pattern.findall(content)
        if not url.startswith(wayback_machine_prefix)
    ]


def _snapshot_near_post_date(archive_timestamp, post_date):
    """Return True when the snapshot's calendar date is within one day of the post's."""
    if not archive_timestamp:
        return False
    try:
        # Convert the archive timestamp to datetime
        archive_date = datetime.strptime(archive_timestamp, '%Y%m%d%H%M%S')
    except ValueError:
        # A malformed timestamp counts as "no valid archive".
        return False
    # Compare calendar dates so the window is symmetric (day before, same
    # day, day after) whatever time of day the post date carries.
    return abs((archive_date.date() - post_date.date()).days) <= 1


def check_markdown_files(root_folder):
    """
    Traverse the Hugo content directory and check markdown files for external links.

    For every ``.md`` file that has both a post date and external links, each
    link is looked up in the Wayback Machine. A link is reported on stdout
    unless a snapshot exists whose date is within one day of the post date.

    Args:
        root_folder: Directory to walk recursively.
    """
    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if not file.endswith(".md"):
                continue

            md_file_path = os.path.join(subdir, file)
            external_links = find_external_links(md_file_path)
            post_date = parse_frontmatter(md_file_path)
            if not (external_links and post_date):
                continue

            # Ask for the snapshot closest to midday of the post date, the
            # centre of the accepted three-day window. Without a timestamp
            # the API returns the latest capture, which says nothing about
            # whether the page was archived around the time of posting.
            lookup_timestamp = post_date.strftime('%Y%m%d') + '120000'

            for url in external_links:
                archive_timestamp = query_wayback_machine(
                    url, timestamp=lookup_timestamp
                )
                if _snapshot_near_post_date(archive_timestamp, post_date):
                    continue
                # If no valid archive found, or it's not within 1 day of the post date
                print(f"File: {md_file_path}")
                print(f" External Link: {url}")
# Run the script only when executed directly, so importing this module
# (e.g. to reuse its helpers) does not start a crawl of the content folder.
if __name__ == "__main__":
    check_markdown_files(content_path)