From c3078a9f5a4e32e0676ff979b0ddc41249a8b2a9 Mon Sep 17 00:00:00 2001 From: Grant Gainey Date: Wed, 26 Jul 2023 16:41:45 -0400 Subject: [PATCH] Taught export to ensure de-duplicated Artifact.json. Along the way taught export to operate on a QuerySet of Artifacts instead of (prematurely) hydrating all affected Artifacts into a list. fixes #4159. (cherry picked from commit 617888748a85fa994be9258742d83ae3a7d4b034) --- CHANGES/4159.bugfix | 4 ++++ pulpcore/app/importexport.py | 5 +++-- pulpcore/app/tasks/export.py | 13 +++++++++---- 3 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 CHANGES/4159.bugfix diff --git a/CHANGES/4159.bugfix b/CHANGES/4159.bugfix new file mode 100644 index 0000000000..15c94b8a9a --- /dev/null +++ b/CHANGES/4159.bugfix @@ -0,0 +1,4 @@ +Taught the Artifact.json of an export to hold the minimum unique set of Artifact entries. + +In highly-duplicated-content export scenarios, this can mean a significant decrease +in export-size, and a significant improvement in import-performance. 
diff --git a/pulpcore/app/importexport.py b/pulpcore/app/importexport.py index 66105dd80f..0f97107794 100644 --- a/pulpcore/app/importexport.py +++ b/pulpcore/app/importexport.py @@ -94,14 +94,14 @@ def export_artifacts(export, artifacts): Args: export (django.db.models.PulpExport): export instance that's doing the export - artifacts (django.db.models.Artifacts): list of artifacts in all repos being exported + artifacts (django.db.models.Artifacts): QuerySet of artifacts in all repos being exported Raises: ValidationError: When path is not in the ALLOWED_EXPORT_PATHS setting """ data = dict(message="Exporting Artifacts", code="export.artifacts", total=len(artifacts)) with ProgressReport(**data) as pb: - for artifact in pb.iter(artifacts): + for artifact in artifacts.iterator(): # chunk_size= defaults to 2000 at a fetch dest = artifact.file.name if settings.DEFAULT_FILE_STORAGE != "pulpcore.app.models.storage.FileSystem": with tempfile.TemporaryDirectory(dir=".") as temp_dir: @@ -112,6 +112,7 @@ def export_artifacts(export, artifacts): export.tarfile.add(temp_file.name, dest) else: export.tarfile.add(artifact.file.path, dest) + pb.increment() resource = ArtifactResource() resource.queryset = artifacts diff --git a/pulpcore/app/tasks/export.py b/pulpcore/app/tasks/export.py index 2b04343498..f4efbaea68 100644 --- a/pulpcore/app/tasks/export.py +++ b/pulpcore/app/tasks/export.py @@ -492,7 +492,7 @@ def _do_export(pulp_exporter, tar, the_export): starting_versions = _get_starting_versions(do_incremental, pulp_exporter, the_export) vers_match = _version_match(ending_versions, starting_versions) # Gather up versions and artifacts - artifacts = [] + artifacts = None # Will be a QuerySet selecting the Artifacts that need to be exported for version in ending_versions: # Check version-content to make sure we're not being asked to export # an on_demand repo @@ -501,10 +501,15 @@ def _do_export(pulp_exporter, tar, the_export): raise RuntimeError(_("Remote artifacts cannot be 
exported.")) if do_incremental: - vers_artifacts = version.artifacts.difference(vers_match[version].artifacts).all() + vers_artifacts = version.artifacts.difference(vers_match[version].artifacts) else: - vers_artifacts = version.artifacts.all() - artifacts.extend(vers_artifacts) + vers_artifacts = version.artifacts + + if artifacts: + artifacts.union(vers_artifacts) + else: + artifacts = vers_artifacts + # export plugin-version-info export_versions(the_export, plugin_version_info) # Export the top-level entities (artifacts and repositories)