Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

45: Add possibility to ignore filename pairs #90

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/hope_dedup_engine/apps/api/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,8 @@
DUPLICATE = "duplicate"
DUPLICATE_LIST = f"{DUPLICATE}s"

IGNORED_KEYS = "ignored_key"
IGNORED_KEYS_LIST = f"{IGNORED_KEYS}s"
IGNORED = "ignored"
REFERENCE_PK = "reference_pk"
FILENAME = "filename"
IGNORED_REFERENCE_PK_LIST = f"{IGNORED}/{REFERENCE_PK}s"
IGNORED_FILENAME_LIST = f"{IGNORED}/{FILENAME}s"
42 changes: 31 additions & 11 deletions src/hope_dedup_engine/apps/api/deduplication/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,40 @@ def _sort_keys(pair: DuplicateKeyPair) -> DuplicateKeyPair:
def _save_duplicates(
finder: DuplicateFinder,
deduplication_set: DeduplicationSet,
ignored_key_pairs: frozenset[tuple[str, str]],
lock_enabled: bool,
lock: DeduplicationSetLock,
) -> None:
reference_pk_to_filename_mapping = dict(
deduplication_set.image_set.values_list("reference_pk", "filename")
)
ignored_filename_pairs = frozenset(
map(
tuple,
map(
sorted,
deduplication_set.ignoredfilenamepair_set.values_list(
"first", "second"
),
),
)
)

ignored_reference_pk_pairs = frozenset(
deduplication_set.ignoredreferencepkpair_set.values_list("first", "second")
)

for first, second, score in map(_sort_keys, finder.run()):
if (first, second) not in ignored_key_pairs:
first_filename, second_filename = sorted(
(
reference_pk_to_filename_mapping[first],
reference_pk_to_filename_mapping[second],
)
)
ignored = (first, second) in ignored_reference_pk_pairs or (
first_filename,
second_filename,
) in ignored_filename_pairs
if not ignored:
duplicate, _ = Duplicate.objects.get_or_create(
deduplication_set=deduplication_set,
first_reference_pk=first,
Expand Down Expand Up @@ -54,17 +82,9 @@ def find_duplicates(deduplication_set_id: str, serialized_lock: str) -> None:
# clean results
Duplicate.objects.filter(deduplication_set=deduplication_set).delete()

ignored_key_pairs = frozenset(
deduplication_set.ignoredkeypair_set.values_list(
"first_reference_pk", "second_reference_pk"
)
)

weight_total = 0
for finder in get_finders(deduplication_set):
_save_duplicates(
finder, deduplication_set, ignored_key_pairs, lock_enabled, lock
)
_save_duplicates(finder, deduplication_set, lock_enabled, lock)
weight_total += finder.weight

for duplicate in deduplication_set.duplicate_set.all():
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Generated by Django 5.0.7 on 2024-09-25 10:29

from django.db import migrations


# Schema migration for the "api" app: renames the IgnoredKeyPair model to
# IgnoredReferencePkPair and renames its two key columns to "first"/"second".
class Migration(migrations.Migration):

# Must be applied after migration 0006 of the "api" app.
dependencies = [
("api", "0006_alter_deduplicationset_state_and_more"),
]

# The operations run in the order listed.
operations = [
# Rename the model first; every later operation refers to it by its new
# lower-cased name ("ignoredreferencepkpair").
migrations.RenameModel(
old_name="IgnoredKeyPair",
new_name="IgnoredReferencePkPair",
),
# first_reference_pk -> first
migrations.RenameField(
model_name="ignoredreferencepkpair",
old_name="first_reference_pk",
new_name="first",
),
# second_reference_pk -> second
migrations.RenameField(
model_name="ignoredreferencepkpair",
old_name="second_reference_pk",
new_name="second",
),
# Declare the uniqueness constraint in terms of the renamed fields.
migrations.AlterUniqueTogether(
name="ignoredreferencepkpair",
unique_together={("deduplication_set", "first", "second")},
),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Generated by Django 5.0.7 on 2024-09-25 11:24

import django.db.models.deletion
from django.db import migrations, models


# Schema migration for the "api" app: creates the IgnoredFilenamePair model,
# a (first, second) pair of strings attached to a deduplication set.
class Migration(migrations.Migration):

# Must be applied after the 0007 migration that renames IgnoredKeyPair.
dependencies = [
("api", "0007_rename_ignoredkeypair_ignoredreferencepkpair_and_more"),
]

operations = [
migrations.CreateModel(
name="IgnoredFilenamePair",
fields=[
# Auto-created surrogate primary key.
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
# The two members of the ignored pair.
# NOTE(review): the length is hard-coded to 100 here, while the model
# declares these fields with REFERENCE_PK_LENGTH - confirm they match.
("first", models.CharField(max_length=100)),
("second", models.CharField(max_length=100)),
# Owning deduplication set; CASCADE removes the pair rows together
# with the set.
(
"deduplication_set",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to="api.deduplicationset",
),
),
],
# A given (first, second) pair may be stored only once per deduplication set.
options={
"unique_together": {("deduplication_set", "first", "second")},
},
),
]
37 changes: 26 additions & 11 deletions src/hope_dedup_engine/apps/api/models/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,21 +115,36 @@ class Duplicate(models.Model):
score = models.FloatField(default=0)


class IgnoredKeyPair(models.Model):
class IgnoredPair(models.Model):
deduplication_set = models.ForeignKey(DeduplicationSet, on_delete=models.CASCADE)
first_reference_pk = models.CharField(max_length=REFERENCE_PK_LENGTH)
second_reference_pk = models.CharField(max_length=REFERENCE_PK_LENGTH)

class Meta:
unique_together = (
"deduplication_set",
"first_reference_pk",
"second_reference_pk",
)
abstract = True

@override
def save(self, **kwargs: Any) -> None:
self.first_reference_pk, self.second_reference_pk = sorted(
(self.first_reference_pk, self.second_reference_pk)
)
self.first, self.second = sorted((self.first, self.second))
super().save(**kwargs)


# Field names used as Meta.unique_together by the concrete ignored-pair models
# below, so that both declare the same uniqueness rule.
UNIQUE_FOR_IGNORED_PAIR = (
"deduplication_set",
"first",
"second",
)


# Pair of reference pks, stored as a concrete subclass of IgnoredPair.
# Both members are strings limited to REFERENCE_PK_LENGTH characters.
class IgnoredReferencePkPair(IgnoredPair):
first = models.CharField(max_length=REFERENCE_PK_LENGTH)
second = models.CharField(max_length=REFERENCE_PK_LENGTH)

class Meta:
# Unique on (deduplication_set, first, second).
unique_together = UNIQUE_FOR_IGNORED_PAIR


# Pair of filenames, stored as a concrete subclass of IgnoredPair.
# NOTE(review): the filename fields are sized with REFERENCE_PK_LENGTH, a
# constant named for reference pks - confirm this is the intended limit for
# filenames and that it matches the max_length=100 used in the migration.
class IgnoredFilenamePair(IgnoredPair):
first = models.CharField(max_length=REFERENCE_PK_LENGTH)
second = models.CharField(max_length=REFERENCE_PK_LENGTH)

class Meta:
# Unique on (deduplication_set, first, second).
unique_together = UNIQUE_FOR_IGNORED_PAIR
31 changes: 24 additions & 7 deletions src/hope_dedup_engine/apps/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from hope_dedup_engine.apps.api.models.deduplication import (
Config,
Duplicate,
IgnoredKeyPair,
IgnoredFilenamePair,
IgnoredReferencePkPair,
Image,
)

Expand Down Expand Up @@ -96,16 +97,32 @@ class Meta:
fields = "first", "second", "score"


class IgnoredKeyPairSerializer(serializers.ModelSerializer):
CREATE_PAIR_FIELDS = "first", "second"
PAIR_FIELDS = ("id", "deduplication_set") + CREATE_PAIR_FIELDS


# Full representation of an IgnoredReferencePkPair: exposes the fields listed
# in PAIR_FIELDS ("id", "deduplication_set", "first", "second").
class IgnoredReferencePkPairSerializer(serializers.ModelSerializer):
class Meta:
model = IgnoredReferencePkPair
fields = PAIR_FIELDS


# Reduced representation of an IgnoredReferencePkPair: exposes only the fields
# in CREATE_PAIR_FIELDS ("first", "second"), i.e. without "id" and
# "deduplication_set".
class CreateIgnoredReferencePkPairSerializer(serializers.ModelSerializer):
class Meta:
model = IgnoredReferencePkPair
fields = CREATE_PAIR_FIELDS


class IgnoredFilenamePairSerializer(serializers.ModelSerializer):
class Meta:
model = IgnoredKeyPair
fields = "__all__"
model = IgnoredFilenamePair
fields = PAIR_FIELDS


class CreateIgnoredKeyPairSerializer(serializers.ModelSerializer):
class CreateIgnoredFilenamePairSerializer(serializers.ModelSerializer):
class Meta:
model = IgnoredKeyPair
fields = ("first_reference_pk", "second_reference_pk")
model = IgnoredFilenamePair
fields = CREATE_PAIR_FIELDS


class EmptySerializer(serializers.Serializer):
Expand Down
13 changes: 10 additions & 3 deletions src/hope_dedup_engine/apps/api/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,16 @@
DEDUPLICATION_SET,
DEDUPLICATION_SET_LIST,
DUPLICATE_LIST,
IGNORED_KEYS_LIST,
IGNORED_FILENAME_LIST,
IGNORED_REFERENCE_PK_LIST,
IMAGE_LIST,
)
from hope_dedup_engine.apps.api.views import (
BulkImageViewSet,
DeduplicationSetViewSet,
DuplicateViewSet,
IgnoredKeyPairViewSet,
IgnoredFilenamePairViewSet,
IgnoredReferencePkPairViewSet,
ImageViewSet,
)

Expand All @@ -40,7 +42,12 @@
DUPLICATE_LIST, DuplicateViewSet, basename=DUPLICATE_LIST
)
deduplication_sets_router.register(
IGNORED_KEYS_LIST, IgnoredKeyPairViewSet, basename=IGNORED_KEYS_LIST
IGNORED_FILENAME_LIST, IgnoredFilenamePairViewSet, basename=IGNORED_FILENAME_LIST
)
deduplication_sets_router.register(
IGNORED_REFERENCE_PK_LIST,
IgnoredReferencePkPairViewSet,
basename=IGNORED_REFERENCE_PK_LIST,
)

urlpatterns = [
Expand Down
46 changes: 36 additions & 10 deletions src/hope_dedup_engine/apps/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,20 @@
from hope_dedup_engine.apps.api.models import DeduplicationSet
from hope_dedup_engine.apps.api.models.deduplication import (
Duplicate,
IgnoredKeyPair,
IgnoredFilenamePair,
IgnoredReferencePkPair,
Image,
)
from hope_dedup_engine.apps.api.serializers import (
CreateDeduplicationSetSerializer,
CreateIgnoredKeyPairSerializer,
CreateIgnoredFilenamePairSerializer,
CreateIgnoredReferencePkPairSerializer,
CreateImageSerializer,
DeduplicationSetSerializer,
DuplicateSerializer,
EmptySerializer,
IgnoredKeyPairSerializer,
IgnoredFilenamePairSerializer,
IgnoredReferencePkPairSerializer,
ImageSerializer,
)
from hope_dedup_engine.apps.api.utils import delete_model_data, start_processing
Expand Down Expand Up @@ -272,8 +275,8 @@ def list(self, request: Request, *args: Any, **kwargs: Any) -> Response:
return super().list(request, *args, **kwargs)


class IgnoredKeyPairViewSet(
nested_viewsets.NestedViewSetMixin[IgnoredKeyPair],
class IgnoredPairViewSet[T](
nested_viewsets.NestedViewSetMixin[T],
mixins.ListModelMixin,
mixins.CreateModelMixin,
viewsets.GenericViewSet,
Expand All @@ -284,8 +287,6 @@ class IgnoredKeyPairViewSet(
AssignedToExternalSystem,
UserAndDeduplicationSetAreOfTheSameSystem,
)
serializer_class = IgnoredKeyPairSerializer
queryset = IgnoredKeyPair.objects.all()
parent_lookup_kwargs = {
DEDUPLICATION_SET_PARAM: DEDUPLICATION_SET_FILTER,
}
Expand All @@ -297,13 +298,38 @@ def perform_create(self, serializer: Serializer) -> None:
deduplication_set.updated_by = self.request.user
deduplication_set.save()

@extend_schema(description="List all ignored key pairs for the deduplication set")

# View set for ignored filename pairs. It specialises IgnoredPairViewSet with
# the IgnoredFilenamePair model and its serializer.
class IgnoredFilenamePairViewSet(IgnoredPairViewSet[IgnoredFilenamePair]):
serializer_class = IgnoredFilenamePairSerializer
queryset = IgnoredFilenamePair.objects.all()

# list() only delegates to the parent; it is overridden so that extend_schema
# can attach a description specific to filename pairs.
@extend_schema(
description="List all ignored filename pairs for the deduplication set"
)
def list(self, request: Request, *args: Any, **kwargs: Any) -> Response:
return super().list(request, *args, **kwargs)

# create() also only delegates to the parent. The schema documents the request
# body with CreateIgnoredFilenamePairSerializer, while serializer_class above
# stays IgnoredFilenamePairSerializer.
@extend_schema(
request=CreateIgnoredFilenamePairSerializer,
description="Add ignored filename pair for the deduplication set",
)
def create(self, request: Request, *args: Any, **kwargs: Any) -> Response:
return super().create(request, *args, **kwargs)


class IgnoredReferencePkPairViewSet(IgnoredPairViewSet[IgnoredReferencePkPair]):
serializer_class = IgnoredReferencePkPairSerializer
queryset = IgnoredReferencePkPair.objects.all()

@extend_schema(
description="List all ignored reference pk pairs for the deduplication set"
)
def list(self, request: Request, *args: Any, **kwargs: Any) -> Response:
return super().list(request, *args, **kwargs)

@extend_schema(
request=CreateIgnoredKeyPairSerializer,
description="Add ignored key pair for the deduplication set",
request=CreateIgnoredReferencePkPairSerializer,
description="Add ignored reference pk pair for the deduplication set",
)
def create(self, request: Request, *args: Any, **kwargs: Any) -> Response:
return super().create(request, *args, **kwargs)
6 changes: 4 additions & 2 deletions tests/api/api_const.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
BULK_IMAGE_LIST,
DEDUPLICATION_SET_LIST,
DUPLICATE_LIST,
IGNORED_KEYS_LIST,
IGNORED_FILENAME_LIST,
IGNORED_REFERENCE_PK_LIST,
IMAGE_LIST,
)

Expand All @@ -17,4 +18,5 @@
BULK_IMAGE_LIST_VIEW = f"{BULK_IMAGE_LIST}-{LIST}"
BULK_IMAGE_CLEAR_VIEW = f"{BULK_IMAGE_LIST}-clear"
DUPLICATE_LIST_VIEW = f"{DUPLICATE_LIST}-{LIST}"
IGNORED_KEYS_LIST_VIEW = f"{IGNORED_KEYS_LIST}-{LIST}"
IGNORED_REFERENCE_PK_LIST_VIEW = f"{IGNORED_REFERENCE_PK_LIST}-{LIST}"
IGNORED_FILENAME_LIST_VIEW = f"{IGNORED_FILENAME_LIST}-{LIST}"
Loading
Loading