Skip to content

Commit 64fb8e7

Browse files
committed
Add last_serial sync optimization
fixes: #351
1 parent ecdb2df commit 64fb8e7

File tree

8 files changed

+148
-9
lines changed

8 files changed

+148
-9
lines changed

Diff for: CHANGES/351.feature

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Added ``last_serial`` sync optimization to Python repositories.
2+
Subsequent syncs will use ``last_serial`` to get the changed packages since the previous sync.

Diff for: docs/tech-preview.rst

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ The following features are currently being released as part of a tech preview
88
* PyPI’s json API at content endpoint ‘/pypi/{package-name}/json’. Allows for basic Pulp-to-Pulp syncing.
99
* Fully mirror Python repositories like PyPI.
1010
* ``Twine`` upload packages to indexes at endpoints '/simple' or '/legacy'.
11+
* Auto-optimize subsequent full mirror syncs using PyPI's last_serial field.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 3.2.8 on 2021-10-21 20:25
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the integer ``last_serial`` field (default 0) to ``pythonrepository``."""

    # Must run after the previous migration of the ``python`` app.
    dependencies = [("python", "0010_update_json_field")]

    operations = [
        migrations.AddField(
            model_name="pythonrepository",
            name="last_serial",
            field=models.IntegerField(default=0),
        ),
    ]

Diff for: pulp_python/app/models.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
from aiohttp.web import json_response
44
from django.contrib.postgres.fields import ArrayField
55
from django.core.exceptions import ObjectDoesNotExist
6-
from django.db import models
6+
from django.db import models, transaction
77
from django.conf import settings
8+
from django_lifecycle import hook, BEFORE_UPDATE
89
from yarl import URL
910

1011
from pulpcore.plugin.models import (
@@ -216,6 +217,30 @@ class PythonRemote(Remote):
216217
class Meta:
217218
default_related_name = "%(app_label)s_%(model_name)s"
218219

220+
@hook(
221+
BEFORE_UPDATE,
222+
when_any=[
223+
"excludes",
224+
"prereleases",
225+
"package_types",
226+
"keep_latest_packages",
227+
"exclude_platforms",
228+
"url",
229+
"policy"
230+
],
231+
has_changed=True
232+
)
233+
def clear_last_serial(self):
    """
    Reset `last_serial` to 0 on every repository that uses this remote.

    Only repositories whose `last_serial` is greater than 0 are touched. The reset is
    issued as one queryset ``update()`` instead of loading each repository, mutating
    it in Python and writing it back with ``bulk_update``.
    """
    # Like `bulk_update`, `update()` writes directly to the database and does not
    # call `save()` on the affected repositories.
    with transaction.atomic():
        PythonRepository.objects.filter(remote_id=self.pk, last_serial__gt=0).update(
            last_serial=0
        )
243+
219244

220245
class PythonRepository(Repository):
221246
"""
@@ -227,6 +252,7 @@ class PythonRepository(Repository):
227252
REMOTE_TYPES = [PythonRemote]
228253

229254
autopublish = models.BooleanField(default=False)
255+
last_serial = models.IntegerField(default=0)
230256

231257
class Meta:
232258
default_related_name = "%(app_label)s_%(model_name)s"
@@ -252,3 +278,10 @@ def finalize_new_version(self, new_version):
252278
"""
253279
remove_duplicates(new_version)
254280
validate_repo_version(new_version)
281+
282+
@hook(BEFORE_UPDATE, when="remote", has_changed=True)
283+
def clear_last_serial(self):
    """Set `last_serial` back to 0; hooked to run before an update that changes `remote`."""
    self.last_serial = 0

Diff for: pulp_python/app/serializers.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,17 @@ class PythonRepositorySerializer(core_serializers.RepositorySerializer):
2828
default=False,
2929
required=False,
3030
)
31+
# read_only=True: the field is part of the API output but is not accepted as input.
last_serial = serializers.IntegerField(
    help_text=_(
        # Adjacent literals are concatenated, so each piece must end with a space;
        # the previous text rendered as "bypassthis optimization.".
        "The serial number from the last successful sync. Used in the sync process to "
        "optimize the sync based on changes from previous sync. Use mirror=True to "
        "bypass this optimization."
    ),
    read_only=True,
)
3139

3240
class Meta:
33-
fields = core_serializers.RepositorySerializer.Meta.fields + ("autopublish",)
41+
fields = core_serializers.RepositorySerializer.Meta.fields + ("autopublish", "last_serial")
3442
model = python_models.PythonRepository
3543

3644

Diff for: pulp_python/app/tasks/sync.py

+17-6
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from rest_framework import serializers
77

8-
from pulpcore.plugin.models import Artifact, ProgressReport, Remote, Repository
8+
from pulpcore.plugin.models import Artifact, ProgressReport, Remote
99
from pulpcore.plugin.stages import (
1010
DeclarativeArtifact,
1111
DeclarativeContent,
@@ -16,6 +16,7 @@
1616
from pulp_python.app.models import (
1717
PythonPackageContent,
1818
PythonRemote,
19+
PythonRepository,
1920
)
2021
from pulp_python.app.utils import parse_metadata
2122

@@ -43,15 +44,21 @@ def sync(remote_pk, repository_pk, mirror):
4344
4445
"""
4546
remote = PythonRemote.objects.get(pk=remote_pk)
46-
repository = Repository.objects.get(pk=repository_pk)
47+
repository = PythonRepository.objects.get(pk=repository_pk)
4748

4849
if not remote.url:
4950
raise serializers.ValidationError(
5051
detail=_("A remote must have a url attribute to sync.")
5152
)
5253

53-
first_stage = PythonBanderStage(remote)
54-
DeclarativeVersion(first_stage, repository, mirror).create()
54+
same_remote = getattr(repository.remote, "pk", None) == remote_pk
55+
serial = repository.last_serial if same_remote else 0
56+
first_stage = PythonBanderStage(remote, mirror, serial)
57+
version = DeclarativeVersion(first_stage, repository, mirror).create()
58+
if version is not None and same_remote:
59+
if first_stage.next_serial and first_stage.next_serial != repository.last_serial:
60+
repository.last_serial = first_stage.next_serial
61+
repository.save()
5562

5663

5764
def create_bandersnatch_config(remote):
@@ -97,10 +104,13 @@ class PythonBanderStage(Stage):
97104
Python Package Syncing Stage using Bandersnatch
98105
"""
99106

100-
def __init__(self, remote):
107+
def __init__(self, remote, mirror, last_serial):
    """
    Initialize the stage and Bandersnatch config.

    Args:
        remote: The remote to sync from; also handed to `create_bandersnatch_config`.
        mirror: When truthy, `last_serial` is ignored and the stage starts from 0.
        last_serial: Serial to start from when `mirror` is falsy.
    """
    super().__init__()
    self.remote = remote
    # If mirror=True, then sync everything, don't use serial
    self.serial = 0 if mirror else last_serial
    # Stays None until `run()` records the mirror's target serial.
    self.next_serial = None
    create_bandersnatch_config(remote)
105115

106116
async def run(self):
@@ -119,7 +129,7 @@ async def run(self):
119129
message="Fetching Project Metadata", code="sync.fetching.project"
120130
) as p:
121131
pmirror = PulpMirror(
122-
serial=0, # Serial currently isn't supported by Pulp
132+
serial=self.serial,
123133
master=master,
124134
workers=workers,
125135
deferred_download=deferred_download,
@@ -132,6 +142,7 @@ async def run(self):
132142
Requirement(pkg).name for pkg in self.remote.includes
133143
]
134144
await pmirror.synchronize(packages_to_sync)
145+
self.next_serial = pmirror.target_serial
135146

136147

137148
class PulpMirror(Mirror):

Diff for: pulp_python/app/viewsets.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
AsyncOperationResponseSerializer,
1212
RepositorySyncURLSerializer,
1313
)
14-
from pulpcore.plugin.tasking import dispatch
14+
from pulpcore.plugin.tasking import dispatch, general_update
1515

1616
from pulp_python.app import models as python_models
1717
from pulp_python.app import serializers as python_serializers
@@ -135,6 +135,26 @@ class PythonRemoteViewSet(core_viewsets.RemoteViewSet):
135135
queryset = python_models.PythonRemote.objects.all()
136136
serializer_class = python_serializers.PythonRemoteSerializer
137137

138+
@extend_schema(
139+
description="Trigger an asynchronous update task",
140+
responses={202: AsyncOperationResponseSerializer},
141+
)
142+
def update(self, request, pk, **kwargs):
    """
    Validate the request and dispatch an asynchronous update of the remote.

    The task takes an exclusive lock on the remote and on every repository that
    uses it and has a non-zero `last_serial`, because the remote's before-update
    hook may reset `last_serial` on those repositories.

    Args:
        request: The incoming request; `request.data` is validated here and then
            forwarded to the task.
        pk: Primary key of the remote being updated.
        **kwargs: May contain `partial` to request a partial update.

    Returns:
        An `OperationPostponedResponse` wrapping the dispatched task.
    """
    partial = kwargs.pop("partial", False)
    remote = self.get_object()
    serializer = self.get_serializer(remote, data=request.data, partial=partial)
    # Fail fast on invalid input instead of only discovering it inside the task.
    serializer.is_valid(raise_exception=True)
    repos = python_models.PythonRepository.objects.filter(remote_id=pk, last_serial__gt=0)
    lock = [remote, *repos]
    async_result = dispatch(
        general_update,
        exclusive_resources=lock,
        # general_update's positional arguments are
        # (instance_id, app_label, serializer_name); the repositories are only
        # needed for locking and must not be passed as task arguments.
        args=(pk, remote._meta.app_label, serializer.__class__.__name__),
        kwargs={"data": request.data, "partial": partial},
    )
    return core_viewsets.OperationPostponedResponse(async_result, request)
157+
138158
@extend_schema(
139159
summary="Create from Bandersnatch",
140160
responses={201: python_serializers.PythonRemoteSerializer},

Diff for: pulp_python/tests/unit/test_models.py

+46
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,55 @@
11
from django.test import TestCase
22

3+
from pulp_python.app.models import PythonRemote, PythonRepository
4+
from pulp_python.app.tasks import update_remote
5+
6+
7+
DEFAULT_SERIAL = 10000
8+
MAX_SERIAL = 20000
9+
310

411
class TestNothing(TestCase):
    """Placeholder test case; it only shows that the test runner executes."""

    def test_nothing_at_all(self):
        """Assert a constant truth so the suite has at least one passing test."""
        self.assertTrue(True)
17+
18+
19+
class TestRepositoryLastSerial(TestCase):
    """Check when a repository's `last_serial` is kept and when it is reset."""

    def setUp(self):
        """Create a remote and a repository on it that already carries a serial."""
        self.remote = PythonRemote.objects.create(name="test", url="https://pypi.org")
        self.repo = PythonRepository.objects.create(
            name="test", remote=self.remote, last_serial=DEFAULT_SERIAL
        )

    def _assert_initial_state(self):
        """Assert the repository still points at the remote with the default serial."""
        self.assertEqual(self.repo.remote.pk, self.remote.pk)
        self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL)

    def _update_remote_and_reload(self, body):
        """Apply a partial update to the remote, then reload the repository."""
        # Remote is only updated through update task
        # NOTE(review): `update_remote` is imported from pulp_python.app.tasks;
        # confirm that task exists and accepts (remote_pk, repo_pks, data=, partial=).
        update_remote(self.remote.pk, (self.repo.pk,), data=body, partial=True)
        self.repo.refresh_from_db()

    def test_remote_change(self):
        """Detaching the remote from the repository resets `last_serial`."""
        self._assert_initial_state()
        self.repo.remote = None
        self.repo.save()
        self.repo.refresh_from_db()
        self.assertEqual(self.repo.last_serial, 0)

    def test_remote_update(self):
        """Changing the remote's url resets `last_serial`."""
        self._assert_initial_state()
        self._update_remote_and_reload({"url": "https://test.pypi.org"})
        self.assertEqual(self.repo.last_serial, 0)

    def test_remote_update_no_change(self):
        """Changing only 'includes' leaves `last_serial` untouched."""
        self._assert_initial_state()
        self._update_remote_and_reload({"includes": ["shelf-reader"]})
        self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL)

0 commit comments

Comments
 (0)