Skip to content

Commit

Permalink
fix(debian-copyright-mirror): overhaul for more efficient run length (#…
Browse files Browse the repository at this point in the history
…3150)

The k8s CronJob monitoring has highlighted that the
debian-copyright-mirror job appears to be overrunning. The wget
mirroring approach is grossly inefficient.

This replaces that with a more efficient parallel curl execution, using
a YAML file conveniently available to construct the URLs to download.

It sacrifices the attempt at only updating changed files, as this isn't
easily possible with curl.

The runtime is now < 10 minutes in local testing!
  • Loading branch information
andrewpollock authored Feb 13, 2025
1 parent 30ac1c8 commit 839a981
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 13 deletions.
3 changes: 2 additions & 1 deletion vulnfeeds/cmd/debian-copyright-mirror/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@

FROM gcr.io/google.com/cloudsdktool/google-cloud-cli:485.0.0-alpine@sha256:d5da0344b23d03a6f2728657732c7a60300a91acaad9b8076c6fd30b1dfe1ff4

RUN apk add wget
RUN apk add py3-yaml

COPY ./debian-copyright-mirror.sh /
COPY ./debian-copyright-mirror.py /

ENTRYPOINT ["/debian-copyright-mirror.sh"]
163 changes: 163 additions & 0 deletions vulnfeeds/cmd/debian-copyright-mirror/debian-copyright-mirror.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""
Download all of the copyright files for packages in main in Debian unstable.
This:
Parses https://metadata.ftp-master.debian.org/changelogs/filelist.yaml.xz
to identify the files to retrieve
Generates a curl configuration to download the URLs
Executes curl to download the URLs in parallel
"""

import argparse
import lzma
import os
import subprocess
import urllib.error
import urllib.parse
import urllib.request
from typing import Dict, Iterable, List, Optional

import yaml


class Error(Exception):
  """Base exception raised for unrecoverable failures in this script."""


def download_url_to_directory(url: str,
                              directory: str,
                              filename: Optional[str] = None) -> Optional[str]:
  """Downloads a URL to a specified directory.

  Args:
    url: The URL to download.
    directory: The directory to save the file to (created, including
      parents, if it does not already exist).
    filename: The filename to use. If None, the filename is extracted
      from the URL's path. Defaults to None.

  Returns:
    The full path to the downloaded file, or None on error.
  """
  try:
    # exist_ok avoids the check-then-create race of the previous
    # os.path.exists() guard.
    os.makedirs(directory, exist_ok=True)

    if filename is None:
      # Requires an explicit `import urllib.parse`; previously this only
      # worked because urllib.request imports it transitively.
      filename = os.path.basename(urllib.parse.urlsplit(url).path)

    filepath = os.path.join(directory, filename)

    urllib.request.urlretrieve(url, filepath)
    return filepath

  except urllib.error.URLError as e:
    print(f'Error downloading {url}: {e}')
    return None
  except OSError as e:  # Catch directory creation/file writing errors.
    print(f'OS Error: {e}')
    return None
  except Exception as e:  # Keep the mirror job alive on unexpected errors.
    print(f'An unexpected error occurred: {e}')
    return None


def extract_unstable_copyright(filelist: str) -> Optional[Dict[str, str]]:
  """Extracts the 'unstable_copyright' entry for each package.

  Parses an xz-compressed YAML file mapping package names to per-suite
  lists of metadata file paths, and picks out the unstable copyright file
  for each package that has one.

  Args:
    filelist: The path to the xz-compressed YAML filelist.

  Returns:
    A dictionary where keys are package names and values are their
    'unstable_copyright' entries (packages without one are omitted),
    or None on any error.
  """
  try:
    with lzma.open(filelist, 'rt', encoding='utf-8') as f:
      data = yaml.safe_load(f)

    results = {}
    for package, versions in data.items():
      if 'unstable' not in versions:
        continue
      for entry in versions['unstable']:
        if entry.endswith('unstable_copyright'):
          results[package] = entry
          break  # Found it, no need to continue checking this package.

    return results

  except FileNotFoundError:
    print(f"Error: File not found at {filelist}")
    return None
  except lzma.LZMAError as e:
    print(f"Error: LZMA decompression failed: {e}")
    return None
  except yaml.YAMLError as e:
    print(f"Error: YAML parsing failed: {e}")
    return None
  except Exception as e:  # Keep the mirror job alive on unexpected errors.
    print(f"An unexpected error occurred: {e}")
    return None


def generate_curl_configuration(filelist: Iterable[str],
                                config_path: str = '/tmp/curl_configuration'):
  """Generates a curl configuration to download all of the files in filelist.

  Each entry produces two lines:
    --output filename
    url = https://url

  Args:
    filelist: relative paths (under the changelogs tree) of the files to
      download. Any iterable is accepted, including a generator.
    config_path: where to write the configuration. Defaults to
      /tmp/curl_configuration for backward compatibility with main().
  """

  url_base = 'https://metadata.ftp-master.debian.org/changelogs'

  with open(config_path, 'w') as curl_config:
    # Stream entry by entry so a generator input is never materialized.
    for path in filelist:
      curl_config.write(f'--output {path}\n')
      curl_config.write(f'url = {os.path.join(url_base, path)}\n')


def execute_curl(configuration: str, directory: str):
  """Execute curl with the supplied configuration in the specified directory.

  Args:
    configuration: path to a curl configuration file.
    directory: path to set the current working directory to. Created
      (including parents) if absent; curl's --create-dirs handles the
      per-file subdirectories below it.

  Raises:
    subprocess.CalledProcessError: if curl exits non-zero (check=True).
  """

  # exist_ok: the shell wrapper may already have created the work
  # directory (mkdir -p), and reruns must not crash here.
  os.makedirs(directory, exist_ok=True)
  subprocess.run(
      ['curl', '--parallel', '--create-dirs', '--config', configuration],
      cwd=directory,
      check=True)


def main():
  """Mirrors every unstable_copyright file for packages in main."""
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument('work_dir')
  args = parser.parse_args()

  filelist = download_url_to_directory(
      'https://metadata.ftp-master.debian.org/changelogs/filelist.yaml.xz',
      '/tmp')
  # download_url_to_directory() signals failure by returning None; don't
  # proceed against a missing (or stale) filelist.
  if filelist is None:
    raise Error('Failed to download the filelist')
  unstable_package_copyright_files = extract_unstable_copyright(filelist)
  if unstable_package_copyright_files is None:
    raise Error('Unexpected result determining files to download')
  # Only the "main" component is mirrored.
  generate_curl_configuration(
      f for f in unstable_package_copyright_files.values()
      if f.startswith('main/'))
  # Sanity check: a correct filelist yields tens of thousands of config
  # lines (two per file); anything smaller suggests a truncated parse.
  with open('/tmp/curl_configuration') as curl_configuration:
    if len(curl_configuration.readlines()) < 80000:
      raise Error('Unexpectedly small curl configuration')
  execute_curl('/tmp/curl_configuration', args.work_dir)


if __name__ == '__main__':
  main()
13 changes: 1 addition & 12 deletions vulnfeeds/cmd/debian-copyright-mirror/debian-copyright-mirror.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,7 @@

mkdir -p "${WORK_DIR}" || true

if gsutil --quiet stat "${GCS_PATH}"; then
gsutil ${BE_VERBOSE="--quiet"} cp "${GCS_PATH}" "${WORK_DIR}"
tar -C "${WORK_DIR}" -xf "${WORK_DIR}/$(basename ${GCS_PATH})"
fi

wget \
${BE_VERBOSE="--quiet"} \
--directory "${WORK_DIR}" \
--mirror \
--accept unstable_copyright \
--accept index.html \
https://metadata.ftp-master.debian.org/changelogs/main
python debian-copyright-mirror.py "${WORK_DIR}/metadata.ftp-master.debian.org/changelogs/"

tar -C "${WORK_DIR}" -cf "${WORK_DIR}/$(basename ${GCS_PATH})" .

Expand Down

0 comments on commit 839a981

Please sign in to comment.