Add a high-level repository writing API

The existing API is frustratingly verbose and low-level. This change adds a RepositoryWriter class that makes it trivial to create a repository in only a few lines of code.
rpm-software-management · Dec 3, 2023 · 9a9928d · 9a9928d
1 parent 0930a82
commit 9a9928d
Show file tree

Hide file tree

Showing 5 changed files with 270 additions and 11 deletions.
diff --git a/examples/python/simple_createrepo.py → examples/python/manual_createrepo.py b/examples/python/simple_createrepo.py → examples/python/manual_createrepo.py
@@ -6,7 +6,7 @@
 import os.path
 import createrepo_c as cr
 
-def do_repodata(path):
+def manual_method(path):
     # Prepare repodata/ directory
     repodata_path = os.path.join(path, "repodata")
     if os.path.exists(repodata_path):
@@ -37,10 +37,10 @@ def do_repodata(path):
 
     # List directory and prepare list of files to process
     pkg_list = []
-    for filename in os.listdir(path):
-        filename = os.path.join(path, filename)
-        if os.path.isfile(filename) and filename.endswith(".rpm"):
-            pkg_list.append(filename)
+    with os.scandir(path) as entries:
+        for entry in entries:
+            if entry.is_file() and entry.path.endswith(".rpm"):
+                pkg_list.append(entry.path)
 
     pri_xml.set_num_of_pkgs(len(pkg_list))
     fil_xml.set_num_of_pkgs(len(pkg_list))
@@ -93,6 +93,6 @@ def do_repodata(path):
         print("Usage: %s <directory>" % (sys.argv[0]))
         sys.exit(1)
 
-    do_repodata(sys.argv[1])
+    manual_method(sys.argv[1])
 
     print("Repository created in %s" % sys.argv[1])
diff --git a/examples/python/simple_repository_writing.py b/examples/python/simple_repository_writing.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import createrepo_c as cr
+
+
+def write_repository_v1(path):
+    # List directory and prepare list of files to process
+    pkg_list = []
+
+    with os.scandir(path) as entries:
+        for entry in entries:
+            if entry.is_file() and entry.path.endswith(".rpm"):
+                pkg_list.append(entry.path)
+
+    # create a RepositoryWriter with a context manager - finish() is called automatically
+    # let's just use the default options
+    with cr.RepositoryWriter(path) as writer:
+        writer.repomd.add_repo_tag("Fedora 34")
+        writer.repomd.set_revision("1628310033")
+        # we have to set the number of packages we will add, before we add them
+        writer.set_num_of_pkgs(len(pkg_list))
+
+        for filename in pkg_list:
+            pkg = writer.add_pkg_from_file(filename)
+            print("Added: %s" % pkg.nevra())
+
+
+def write_repository_v2(path):
+    # List directory and prepare list of files to process
+    pkg_list = []
+
+    with os.scandir(path) as entries:
+        for entry in entries:
+            if entry.is_file() and entry.path.endswith(".rpm"):
+                pkg_list.append(entry.path)
+
+    # create a writer without a context manager - you need to manually call finish()
+    # change a couple of the defaults too
+    writer = cr.RepositoryWriter(
+        path,
+        unique_md_filenames=False,
+        changelog_limit=4,
+        checksum_type=cr.SHA512,
+        compression=cr.GZ_COMPRESSION,
+    )
+    writer.repomd.set_num_of_pkgs(len(pkg_list))
+    writer.repomd.add_repo_tag("Fedora 34")
+    writer.set_revision("1628310033")
+
+    for filename in pkg_list:
+        pkg = writer.add_pkg_from_file(filename)
+        print("Added: %s" % pkg.nevra())
+
+    writer.finish()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2 or not os.path.isdir(sys.argv[1]):
+        print("Usage: %s <directory>" % (sys.argv[0]))
+        sys.exit(1)
+
+    create_repo(sys.argv[1])
+
+    print("Repository created in %s" % sys.argv[1])
diff --git a/src/compression_wrapper.h b/src/compression_wrapper.h
@@ -54,7 +54,7 @@ typedef enum {
     CR_CW_MODE_SENTINEL,        /*!< Sentinel of the list */
 } cr_OpenMode;
 
-/** Stat build about open content during compression (writting).
+/** Stat build about open content during compression (writing).
  */
 typedef struct {
     gint64          size;               /*!< Size of content */
@@ -87,7 +87,7 @@ typedef struct {
     void                *INNERFILE;     /*!< Pointer to underlying FILE */
     cr_OpenMode         mode;           /*!< Mode */
     cr_ContentStat      *stat;          /*!< Content stats */
-    cr_ChecksumCtx      *checksum_ctx;  /*!< Checksum contenxt */
+    cr_ChecksumCtx      *checksum_ctx;  /*!< Checksum context */
 } CR_FILE;
 
 #define CR_CW_ERR       -1      /*!< Return value - Error */

diff --git a/src/createrepo_c.c b/src/createrepo_c.c
@@ -1153,7 +1153,7 @@ main(int argc, char **argv)
                 fex_db_filename = g_strconcat(tmp_out_repo, "/filelists-ext.sqlite", NULL);
             oth_db_filename = g_strconcat(tmp_out_repo, "/other.sqlite", NULL);
         } else {
-            g_debug("Creating databases localy");
+            g_debug("Creating databases locally");
             const gchar *tmpdir = g_get_tmp_dir();
             pri_db_filename = g_build_filename(tmpdir, "primary.XXXXXX.sqlite", NULL);
             fil_db_filename = g_build_filename(tmpdir, "filelists.XXXXXX.sqlite", NULL);

diff --git a/src/python/createrepo_c/__init__.py b/src/python/createrepo_c/__init__.py
@@ -3,8 +3,11 @@
 
 import collections
 import os
+from pathlib import Path
+import shutil
 import subprocess
 import sys
+import tempfile
 
 from . import _createrepo_c
 from ._createrepo_c import *
@@ -196,7 +199,7 @@ def __init__(self, type=None, path=None):
         _createrepo_c.RepomdRecord.__init__(self, type, path)
 
     def compress_and_fill(self, hashtype, compresstype):
-        rec = RepomdRecord(self.type + "_gz", None)
+        rec = RepomdRecord(self.type, None)
         _createrepo_c.RepomdRecord.compress_and_fill(self,
                                                      rec,
                                                      hashtype,
@@ -446,7 +449,7 @@ def package_count(self):
         # But there's no way to do that. This gets fuzzy around the topic of duplicates.
         # If the same package is listed more than once, is that counted as more than one package?
         # Currently, no.
-        return len(self.parse_packages(only_primary=True))
+        return len(self.parse_packages(only_primary=True)[0])
 
     def iter_packages(self, warningcb=None):
         """
@@ -545,6 +548,196 @@ def newpkgcb(pkgId, name, arch):
         return packages, warnings
 
 
+# Pairs the path of a metadata file with the *XmlFile object that writes it. Both need to be
+# tracked together because there's no way to get the path back from the *XmlFile objects.
+MetadataInfoHolder = collections.namedtuple("MetadataInfoHolder", ["path", "writer"])
+
+class RepositoryWriter:
+
+    _FINISHED_ERR_MSG = "Cannot perform action after the repository has already finished being written"
+
+    def __init__(self,
+                 destination,
+                 num_packages=None,
+                 unique_md_filenames=True,
+                 changelog_limit=10,
+                 compression=ZSTD_COMPRESSION,
+                 checksum_type=SHA256,
+                ):
+        # TODO: with_zchunk option?
+
+        if changelog_limit:
+            assert isinstance(changelog_limit, int) and changelog_limit >= 0, "changelog_limit must be an integer >= 0"
+
+        self.repomd = Repomd()
+        self._destination_repo_path = Path(destination)
+
+        self._unique_md_filenames = unique_md_filenames
+        self._changelog_limit = changelog_limit
+        self._preserve_existing_metadata = preserve_existing_metadata
+        self._checksum_type = checksum_type
+
+        self._has_set_num_pkgs = False
+        self._finished = False
+
+        os.makedirs(self.path, exist_ok=True)
+        os.makedirs(self.repodata_dir, exist_ok=True)
+
+        def _compression_suffix(compressiontype):
+            suffix = compression_suffix(compressiontype)
+            return suffix if suffix else ""
+
+        self._compression = compression
+        self._compression_suffix = _compression_suffix(compression)
+
+        pri_xml_path = self.repodata_dir / ("primary.xml" + self._compression_suffix)
+        fil_xml_path = self.repodata_dir / ("filelists.xml" + self._compression_suffix)
+        oth_xml_path = self.repodata_dir / ("other.xml" + self._compression_suffix)
+
+        self.working_metadata_files = {
+            "primary": MetadataInfoHolder(
+                pri_xml_path, PrimaryXmlFile(str(pri_xml_path), compressiontype=compression)
+            ),
+            "filelists": MetadataInfoHolder(
+                fil_xml_path, FilelistsXmlFile(str(fil_xml_path), compressiontype=compression)
+            ),
+            "other": MetadataInfoHolder(
+                oth_xml_path, OtherXmlFile(str(oth_xml_path), compressiontype=compression)
+            ),
+        }
+        self.additional_metadata_files = {}
+
+        if num_packages is not None:
+            self.set_num_of_pkgs(num_packages)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        # TODO: if there's an error do we actually want to finish()?
+        self.finish()
+
+    @property
+    def path(self):
+        return self._destination_repo_path
+
+    @property
+    def repodata_dir(self):
+        return self.path / "repodata"
+
+    def set_num_of_pkgs(self, num):
+        """Set the number of packages that will be added - this has to be done before adding any packages."""
+        assert not self._has_set_num_pkgs, "The number of packages has already been set"
+        self._has_set_num_pkgs = True
+
+        self.working_metadata_files["primary"].writer.set_num_of_pkgs(num)
+        self.working_metadata_files["filelists"].writer.set_num_of_pkgs(num)
+        self.working_metadata_files["other"].writer.set_num_of_pkgs(num)
+
+    def add_pkg_from_file(self, path, output_dir=None):
+        """Add a package to the repo from a provided path."""
+        assert self._has_set_num_pkgs, "Must set the number of packages before adding packages"
+        assert not self._finished, self._FINISHED_ERR_MSG
+
+        try:
+            relative_path = Path(path).relative_to(self.path)  # raises a ValueError if path is not relative
+        except ValueError:
+            if output_dir:
+                os.makedirs(output_dir, exist_ok=True)
+                relative_path = Path(output_dir) / os.path.basename(path)
+                shutil.copy2(path, relative_path)
+            else:
+                raise
+
+        pkg = package_from_rpm(
+            path,
+            checksum_type=self._checksum_type,
+            location_href=str(relative_path),
+            location_base=None,
+            changelog_limit=self._changelog_limit
+        )
+
+        self.add_pkg(pkg)
+        return pkg
+
+    def add_pkg(self, pkg):
+        """Add a package to the repo from a pre-created Package object."""
+        assert self._has_set_num_pkgs, "Must set the number of packages before adding packages"
+        assert not self._finished, self._FINISHED_ERR_MSG
+
+        self.working_metadata_files["primary"].writer.add_pkg(pkg)
+        self.working_metadata_files["filelists"].writer.add_pkg(pkg)
+        self.working_metadata_files["other"].writer.add_pkg(pkg)
+
+    def add_repomd_metadata(self, name, path, compressiontype=None):
+        """Add an additional metadata file to the final repomd."""
+        assert not self._finished, self._FINISHED_ERR_MSG
+
+        if not compressiontype:
+            shutil.copy2(path, self.repodata_dir)
+            self.additional_metadata_files[name] = path
+        else:
+            dst = self.repodata_dir / (os.path.basename(path) + compression_suffix(compressiontype))
+            compress_file(path, str(dst), compressiontype=compressiontype)
+            self.additional_metadata_files[name] = path
+
+    def add_update_record(self, rec):
+        """Add an advisory (update record) to the repository."""
+        assert not self._finished, self._FINISHED_ERR_MSG
+
+        # lazily create the updateinfo entry
+        if "updateinfo" not in self.working_metadata_files:
+            upd_xml_path = Path(self.repodata_dir) / (self.upd_xml_name + self._compression_suffix)
+            self.working_metadata_files["updateinfo"] = MetadataInfoHolder(
+                upd_xml_path, UpdateInfoXmlFile(str(upd_xml_path), compressiontype=self._compression)
+            )
+
+        self.working_metadata_files["updateinfo"].writer.append(rec)
+
+    def finish(self):
+        """Finish writing metadata."""
+        assert not self._finished, self._FINISHED_ERR_MSG
+        self._finished = True
+
+        # if the user hasn't added any packages we can let them skip this step
+        if not self._has_set_num_pkgs:
+            self.set_num_of_pkgs(0)
+
+        records = {}
+
+        # fail if the user used add_repomd_metadata() for one of "primary", "filelists", "other",
+        # "updateinfo" (if updaterecords added also), etc.
+        created_record_names = set(self.working_metadata_files.keys())
+        added_record_names = set(self.additional_metadata_files.keys())
+        overlapping_records = created_record_names.intersection(added_record_names)
+        assert not overlapping_records, "Added repomd metadata {} conflicts with created metadata".format(overlapping_records)
+
+        # Create all the repomdrecords for the standard metadata
+        for record_name, metadata_info in self.working_metadata_files.items():
+            # Close all of the metadata files being actively edited
+            metadata_info.writer.close()
+            record = RepomdRecord(record_name, str(metadata_info.path))
+            record.fill(self._checksum_type)
+            records[record_name] = record
+
+        # Create all the repomdrecords for the externally-added metadata
+        for record_name, path in self.additional_metadata_files.items():
+            # if the user tried to add the same record twice, last one wins I guess?
+            record = RepomdRecord(record_name, str(path))
+            record.fill(self._checksum_type)
+            records[record_name] = record
+
+        # Rename the files (if requested) and then add all the repomdrecords to the repomd.xml
+        for record in records.values():
+            if self._unique_md_filenames:
+                record.rename_file()
+            self.repomd.set_record(record)
+
+        # Write repomd.xml
+        repomd_path = self.repodata_dir / "repomd.xml"
+        with open(repomd_path, "w") as repomd_xml_file:
+            repomd_xml_file.write(self.repomd.xml_dump())
+
 # If we have been built as a Python package, e.g. "setup.py", this is where the binaries
 # will be located.
 _DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')