From 30862e38b20c2510d6123a7b29a31ff3d4577591 Mon Sep 17 00:00:00 2001 From: Sergey Misuk Date: Tue, 24 Sep 2024 00:37:25 +0300 Subject: [PATCH 1/3] Add possibility to configure face distance threshold per deduplication set --- .../apps/api/deduplication/adapters.py | 10 +++++- .../0004_config_deduplicationset_config.py | 36 +++++++++++++++++++ .../apps/api/models/deduplication.py | 5 +++ src/hope_dedup_engine/apps/api/serializers.py | 23 +++++++++++- .../faces/services/duplication_detector.py | 13 ++++--- .../apps/faces/services/image_processor.py | 4 +-- tests/api/conftest.py | 4 ++- tests/api/test_adapters.py | 3 +- tests/api/test_deduplication_set_create.py | 31 ++++++++++++---- tests/extras/testutils/factories/api.py | 9 +++++ tests/faces/conftest.py | 5 +-- tests/faces/faces_const.py | 1 + tests/faces/test_duplication_detector.py | 11 ++++-- tests/faces/test_image_processor.py | 3 +- 14 files changed, 134 insertions(+), 24 deletions(-) create mode 100644 src/hope_dedup_engine/apps/api/migrations/0004_config_deduplicationset_config.py diff --git a/src/hope_dedup_engine/apps/api/deduplication/adapters.py b/src/hope_dedup_engine/apps/api/deduplication/adapters.py index 47478d40..9deaf9e0 100644 --- a/src/hope_dedup_engine/apps/api/deduplication/adapters.py +++ b/src/hope_dedup_engine/apps/api/deduplication/adapters.py @@ -1,5 +1,7 @@ from collections.abc import Generator +from constance import config + from hope_dedup_engine.apps.api.deduplication.registry import DuplicateKeyPair from hope_dedup_engine.apps.api.models import DeduplicationSet from hope_dedup_engine.apps.faces.services.duplication_detector import ( @@ -20,8 +22,14 @@ def run(self) -> Generator[DuplicateKeyPair, None, None]: "reference_pk", "filename" ) } + face_distance_threshold: float = ( + self.deduplication_set.config + and self.deduplication_set.config.face_distance_threshold + ) or config.FACE_DISTANCE_THRESHOLD # ignored key pairs are not handled correctly in DuplicationDetector - detector = DuplicationDetector(tuple[str](filename_to_reference_pk.keys()), ()) + detector = DuplicationDetector( + tuple[str](filename_to_reference_pk.keys()), face_distance_threshold + ) for first_filename, second_filename, distance in detector.find_duplicates(): yield filename_to_reference_pk[first_filename], filename_to_reference_pk[ second_filename diff --git a/src/hope_dedup_engine/apps/api/migrations/0004_config_deduplicationset_config.py b/src/hope_dedup_engine/apps/api/migrations/0004_config_deduplicationset_config.py new file mode 100644 index 00000000..b421b0f2 --- /dev/null +++ b/src/hope_dedup_engine/apps/api/migrations/0004_config_deduplicationset_config.py @@ -0,0 +1,36 @@ +# Generated by Django 5.0.7 on 2024-09-22 21:12 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("api", "0003_remove_deduplicationset_name"), + ] + + operations = [ + migrations.CreateModel( + name="Config", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("face_distance_threshold", models.FloatField(null=True)), + ], + ), + migrations.AddField( + model_name="deduplicationset", + name="config", + field=models.OneToOneField( + null=True, on_delete=django.db.models.deletion.SET_NULL, to="api.config" + ), + ), + ] diff --git a/src/hope_dedup_engine/apps/api/models/deduplication.py b/src/hope_dedup_engine/apps/api/models/deduplication.py index a0b78455..97903227 100644 --- a/src/hope_dedup_engine/apps/api/models/deduplication.py +++ b/src/hope_dedup_engine/apps/api/models/deduplication.py @@ -9,6 +9,10 @@ REFERENCE_PK_LENGTH: Final[int] = 100 +class Config(models.Model): + face_distance_threshold = models.FloatField(null=True) + + class DeduplicationSet(models.Model): """ Bucket for entries we want to deduplicate @@ -52,6 +56,7 @@ class State(models.IntegerChoices): ) updated_at = models.DateTimeField(auto_now=True) notification_url = models.CharField(max_length=255, null=True, blank=True) + config = models.OneToOneField(Config, null=True, on_delete=models.SET_NULL) def __str__(self) -> str: return f"ID: {self.pk}" if not self.name else f"{self.name}" diff --git a/src/hope_dedup_engine/apps/api/serializers.py b/src/hope_dedup_engine/apps/api/serializers.py index f86d777c..b77cd9c5 100644 --- a/src/hope_dedup_engine/apps/api/serializers.py +++ b/src/hope_dedup_engine/apps/api/serializers.py @@ -4,14 +4,24 @@ from hope_dedup_engine.apps.api.models import DeduplicationSet from hope_dedup_engine.apps.api.models.deduplication import ( + Config, Duplicate, IgnoredKeyPair, Image, ) +CONFIG = "config" + + +class ConfigSerializer(serializers.ModelSerializer): + class Meta: + model = Config + exclude = ("id",) + class DeduplicationSetSerializer(serializers.ModelSerializer): state = serializers.CharField(source="get_state_display", read_only=True) + config = ConfigSerializer(required=False) class Meta: model = DeduplicationSet @@ -25,11 +35,22 @@ class Meta: "updated_by", ) + def create(self, validated_data) -> DeduplicationSet: + config_data = validated_data.get(CONFIG) and validated_data.pop(CONFIG) + config = Config.objects.create(**config_data) if config_data else None + return DeduplicationSet.objects.create(config=config, **validated_data) + + +class CreateConfigSerializer(ConfigSerializer): + pass + class CreateDeduplicationSetSerializer(serializers.ModelSerializer): + config = CreateConfigSerializer(required=False) + class Meta: model = DeduplicationSet - fields = ("reference_pk", "notification_url") + fields = ("config", "reference_pk", "notification_url") class ImageSerializer(serializers.ModelSerializer): diff --git a/src/hope_dedup_engine/apps/faces/services/duplication_detector.py b/src/hope_dedup_engine/apps/faces/services/duplication_detector.py index 0792b8c4..d7b60a35 100644 --- a/src/hope_dedup_engine/apps/faces/services/duplication_detector.py +++ b/src/hope_dedup_engine/apps/faces/services/duplication_detector.py @@ -5,7 +5,6 @@ import face_recognition import numpy as np -from constance import config from hope_dedup_engine.apps.faces.managers import StorageManager from hope_dedup_engine.apps.faces.services.image_processor import ImageProcessor @@ -20,7 +19,10 @@ class DuplicationDetector: logger: logging.Logger = logging.getLogger(__name__) def __init__( - self, filenames: tuple[str], ignore_pairs: tuple[tuple[str, str], ...] = tuple() + self, + filenames: tuple[str], + face_distance_threshold: float, + ignore_pairs: tuple[tuple[str, str], ...] = (), ) -> None: """ Initialize the DuplicationDetector with the given filenames and ignore pairs. @@ -31,9 +33,10 @@ def __init__( The pairs of filenames to ignore. Defaults to an empty tuple. """ self.filenames = filenames + self.face_distance_threshold = face_distance_threshold self.ignore_set = IgnorePairsValidator.validate(ignore_pairs) self.storages = StorageManager() - self.image_processor = ImageProcessor() + self.image_processor = ImageProcessor(face_distance_threshold) def _encodings_filename(self, filename: str) -> str: """ @@ -122,7 +125,7 @@ def find_duplicates(self) -> Generator[tuple[str, str, float], None, None]: encodings_all = self._load_encodings_all() for path1, path2 in combinations(existed_images_name, 2): - min_distance = config.FACE_DISTANCE_THRESHOLD + min_distance = self.face_distance_threshold encodings1 = encodings_all.get(path1) encodings2 = encodings_all.get(path2) if encodings1 is None or encodings2 is None: @@ -136,7 +139,7 @@ def find_duplicates(self) -> Generator[tuple[str, str, float], None, None]: ) < min_distance: min_distance = current_min - if min_distance < config.FACE_DISTANCE_THRESHOLD: + if min_distance < self.face_distance_threshold: yield (path1, path2, round(min_distance, 5)) except Exception as e: self.logger.exception( diff --git a/src/hope_dedup_engine/apps/faces/services/image_processor.py b/src/hope_dedup_engine/apps/faces/services/image_processor.py index 0d268806..7e3a0bad 100644 --- a/src/hope_dedup_engine/apps/faces/services/image_processor.py +++ b/src/hope_dedup_engine/apps/faces/services/image_processor.py @@ -56,7 +56,7 @@ class ImageProcessor: logger: logging.Logger = logging.getLogger(__name__) - def __init__(self) -> None: + def __init__(self, face_distance_threshold: float) -> None: """ Initialize the ImageProcessor with the required configurations. """ @@ -75,7 +75,7 @@ def __init__(self) -> None: model=config.FACE_ENCODINGS_MODEL, ) self.face_detection_confidence: float = config.FACE_DETECTION_CONFIDENCE - self.distance_threshold: float = config.FACE_DISTANCE_THRESHOLD + self.distance_threshold: float = face_distance_threshold self.nms_threshold: float = config.NMS_THRESHOLD def _get_face_detections_dnn( diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 29395b48..94d58d65 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -11,6 +11,7 @@ NoDuplicateFinder, ) from testutils.factories.api import ( + ConfigFactory, DeduplicationSetFactory, DuplicateFactory, IgnoredKeyPairFactory, @@ -26,7 +27,7 @@ register(ExternalSystemFactory) register(UserFactory) register(DeduplicationSetFactory, external_system=LazyFixture("external_system")) -register(ImageFactory, deduplication_Set=LazyFixture("deduplication_set")) +register(ImageFactory, deduplication_set=LazyFixture("deduplication_set")) register( ImageFactory, _name="second_image", @@ -34,6 +35,7 @@ ) register(DuplicateFactory, deduplication_set=LazyFixture("deduplication_set")) register(IgnoredKeyPairFactory, deduplication_set=LazyFixture("deduplication_set")) +register(ConfigFactory) @fixture diff --git a/tests/api/test_adapters.py b/tests/api/test_adapters.py index 2db888b3..54596507 100644 --- a/tests/api/test_adapters.py +++ b/tests/api/test_adapters.py @@ -27,7 +27,8 @@ def test_duplicate_face_finder_uses_duplication_detector( found_pairs = tuple(finder.run()) duplication_detector.assert_called_once_with( - (image.filename, second_image.filename), () + (image.filename, second_image.filename), + deduplication_set.config.face_distance_threshold, ) duplication_detector.return_value.find_duplicates.assert_called_once() assert len(found_pairs) == 1 diff --git a/tests/api/test_deduplication_set_create.py b/tests/api/test_deduplication_set_create.py index 617acb5f..b36365b3 100644 --- a/tests/api/test_deduplication_set_create.py +++ b/tests/api/test_deduplication_set_create.py @@ -1,20 +1,22 @@ from api_const import DEDUPLICATION_SET_LIST_VIEW, JSON +from pytest import mark from rest_framework import status from rest_framework.reverse import reverse from rest_framework.test import APIClient from testutils.factories.api import DeduplicationSetFactory from hope_dedup_engine.apps.api.models import DeduplicationSet -from hope_dedup_engine.apps.api.serializers import DeduplicationSetSerializer +from hope_dedup_engine.apps.api.serializers import CreateDeduplicationSetSerializer def test_can_create_deduplication_set(api_client: APIClient) -> None: previous_amount = DeduplicationSet.objects.count() - data = DeduplicationSetSerializer(DeduplicationSetFactory.build()).data + data = CreateDeduplicationSetSerializer(DeduplicationSetFactory.build()).data response = api_client.post( reverse(DEDUPLICATION_SET_LIST_VIEW), data=data, format=JSON ) + assert response.status_code == status.HTTP_201_CREATED assert DeduplicationSet.objects.count() == previous_amount + 1 data = response.json() @@ -22,25 +24,40 @@ def test_can_create_deduplication_set(api_client: APIClient) -> None: def test_missing_fields_handling(api_client: APIClient) -> None: - data = DeduplicationSetSerializer(DeduplicationSetFactory.build()).data + data = CreateDeduplicationSetSerializer(DeduplicationSetFactory.build()).data del data["reference_pk"] response = api_client.post( reverse(DEDUPLICATION_SET_LIST_VIEW), data=data, format=JSON ) + assert response.status_code == status.HTTP_400_BAD_REQUEST errors = response.json() assert len(errors) == 1 assert "reference_pk" in errors -def test_invalid_values_handling(api_client: APIClient) -> None: - data = DeduplicationSetSerializer(DeduplicationSetFactory.build()).data - data["reference_pk"] = None +@mark.parametrize("field", ("reference_pk", "config")) +def test_invalid_values_handling(field: str, api_client: APIClient) -> None: + data = CreateDeduplicationSetSerializer(DeduplicationSetFactory.build()).data + data[field] = None + response = api_client.post( reverse(DEDUPLICATION_SET_LIST_VIEW), data=data, format=JSON ) + assert response.status_code == status.HTTP_400_BAD_REQUEST errors = response.json() assert len(errors) == 1 - assert "reference_pk" in errors + assert field in errors + + +def test_can_set_deduplication_set_without_config(api_client: APIClient) -> None: + data = CreateDeduplicationSetSerializer(DeduplicationSetFactory.build()).data + del data["config"] + + response = api_client.post( + reverse(DEDUPLICATION_SET_LIST_VIEW), data=data, format=JSON + ) + + assert response.status_code == status.HTTP_201_CREATED diff --git a/tests/extras/testutils/factories/api.py b/tests/extras/testutils/factories/api.py index 77d0549e..5f189b3c 100644 --- a/tests/extras/testutils/factories/api.py +++ b/tests/extras/testutils/factories/api.py @@ -4,6 +4,7 @@ from hope_dedup_engine.apps.api.models import DeduplicationSet, HDEToken from hope_dedup_engine.apps.api.models.deduplication import ( + Config, Duplicate, IgnoredKeyPair, Image, @@ -17,11 +18,19 @@ class Meta: model = HDEToken +class ConfigFactory(DjangoModelFactory): + face_distance_threshold = fuzzy.FuzzyFloat(low=0.1, high=1.0) + + class Meta: + model = Config + + class DeduplicationSetFactory(DjangoModelFactory): reference_pk = fuzzy.FuzzyText() external_system = SubFactory(ExternalSystemFactory) state = DeduplicationSet.State.CLEAN notification_url = fuzzy.FuzzyText(prefix="https://") + config = SubFactory(ConfigFactory) class Meta: model = DeduplicationSet diff --git a/tests/faces/conftest.py b/tests/faces/conftest.py index b5c46b2e..a2bb53e2 100644 --- a/tests/faces/conftest.py +++ b/tests/faces/conftest.py @@ -14,6 +14,7 @@ DEPLOY_PROTO_SHAPE, DNN_FILE, FACE_DETECTIONS, + FACE_DISTANCE_THRESHOLD, FACE_REGIONS_VALID, FILENAMES, IGNORE_PAIRS, @@ -116,7 +117,7 @@ def mock_image_processor( mocker.patch.object( BlobFromImageConfig, "_get_shape", return_value=DEPLOY_PROTO_SHAPE ) - mock_processor = ImageProcessor() + mock_processor = ImageProcessor(FACE_DISTANCE_THRESHOLD) mocker.patch.object( mock_processor.storages.get_storage("images"), "open", @@ -160,7 +161,7 @@ def mock_net(): @pytest.fixture def mock_dd(mock_image_processor, mock_net_manager, mock_storage_manager): - detector = DuplicationDetector(FILENAMES, IGNORE_PAIRS) + detector = DuplicationDetector(FILENAMES, FACE_DISTANCE_THRESHOLD, IGNORE_PAIRS) yield detector diff --git a/tests/faces/faces_const.py b/tests/faces/faces_const.py index 0e49c099..89ac631e 100644 --- a/tests/faces/faces_const.py +++ b/tests/faces/faces_const.py @@ -8,6 +8,7 @@ ["ignore_file.jpg", "ignore_file2.jpg"], ["ignore_file4.jpg", "ignore_file3.jpg"], ] +FACE_DISTANCE_THRESHOLD = 0.4 DNN_FILE = { "name": FILENAME, diff --git a/tests/faces/test_duplication_detector.py b/tests/faces/test_duplication_detector.py index bf0ad6f1..c55c1016 100644 --- a/tests/faces/test_duplication_detector.py +++ b/tests/faces/test_duplication_detector.py @@ -7,7 +7,12 @@ import numpy as np import pytest from constance import config -from faces_const import FILENAME, FILENAME_ENCODED_FORMAT, FILENAMES +from faces_const import ( + FACE_DISTANCE_THRESHOLD, + FILENAME, + FILENAME_ENCODED_FORMAT, + FILENAMES, +) from hope_dedup_engine.apps.faces.managers import StorageManager from hope_dedup_engine.apps.faces.services import DuplicationDetector @@ -65,7 +70,7 @@ def test_init_successful(mock_dd): def test_get_pairs_to_ignore_success( mock_storage_manager, mock_image_processor, ignore_input, expected_output ): - dd = DuplicationDetector(FILENAMES, ignore_input) + dd = DuplicationDetector(FILENAMES, FACE_DISTANCE_THRESHOLD, ignore_input) assert dd.ignore_set == expected_output @@ -86,7 +91,7 @@ def test_get_pairs_to_ignore_exception_handling( mock_storage_manager, mock_image_processor, ignore_input ): with pytest.raises(ValidationError): - DuplicationDetector(filenames=FILENAMES, ignore_pairs=ignore_input) + DuplicationDetector(FILENAMES, 0.2, ignore_pairs=ignore_input) def test_encodings_filename(mock_dd): diff --git a/tests/faces/test_image_processor.py b/tests/faces/test_image_processor.py index 75d5e616..1347eb75 100644 --- a/tests/faces/test_image_processor.py +++ b/tests/faces/test_image_processor.py @@ -10,6 +10,7 @@ BLOB_FROM_IMAGE_MEAN_VALUES, BLOB_FROM_IMAGE_SCALE_FACTOR, DEPLOY_PROTO_SHAPE, + FACE_DISTANCE_THRESHOLD, FACE_REGIONS_INVALID, FACE_REGIONS_VALID, FILENAME, @@ -43,7 +44,7 @@ def test_init_creates_expected_attributes( mock_image_processor.face_detection_confidence == config.FACE_DETECTION_CONFIDENCE ) - assert mock_image_processor.distance_threshold == config.FACE_DISTANCE_THRESHOLD + assert mock_image_processor.distance_threshold == FACE_DISTANCE_THRESHOLD assert mock_image_processor.nms_threshold == config.NMS_THRESHOLD From ee69503fe21a8db850a0f00533e356252464f895 Mon Sep 17 00:00:00 2001 From: Sergey Misuk Date: Tue, 24 Sep 2024 11:27:18 +0300 Subject: [PATCH 2/3] Check face_distance_threshold value passed to DuplicationDetector --- tests/api/test_adapters.py | 45 ++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/tests/api/test_adapters.py b/tests/api/test_adapters.py index 54596507..45f81309 100644 --- a/tests/api/test_adapters.py +++ b/tests/api/test_adapters.py @@ -1,18 +1,27 @@ +from random import random +from unittest.mock import MagicMock + +from constance.test.unittest import override_config +from pytest import fixture from pytest_mock import MockerFixture from hope_dedup_engine.apps.api.deduplication.adapters import DuplicateFaceFinder from hope_dedup_engine.apps.api.models import DeduplicationSet, Image +@fixture +def duplication_detector(mocker: MockerFixture) -> MagicMock: + yield mocker.patch( + "hope_dedup_engine.apps.api.deduplication.adapters.DuplicationDetector" + ) + + def test_duplicate_face_finder_uses_duplication_detector( deduplication_set: DeduplicationSet, image: Image, second_image: Image, - mocker: MockerFixture, + duplication_detector: MagicMock, ) -> None: - duplication_detector = mocker.patch( - "hope_dedup_engine.apps.api.deduplication.adapters.DuplicationDetector" - ) duplication_detector.return_value.find_duplicates.return_value = iter( ( ( @@ -37,3 +46,31 @@ def test_duplicate_face_finder_uses_duplication_detector( second_image.reference_pk, 1 - distance, ) + + +def _run_duplicate_face_finder(deduplication_set: DeduplicationSet) -> None: + finder = DuplicateFaceFinder(deduplication_set) + tuple(finder.run()) # tuple is used to make generator finish execution + + +def test_duplication_detector_is_initiated_with_correct_face_distance_threshold_value( + deduplication_set: DeduplicationSet, + duplication_detector: MagicMock, +) -> None: + # deduplication set face_distance_threshold config value is used + _run_duplicate_face_finder(deduplication_set) + duplication_detector.assert_called_once_with( + (), deduplication_set.config.face_distance_threshold + ) + face_distance_threshold = random() + with override_config(FACE_DISTANCE_THRESHOLD=face_distance_threshold): + # value from global config is used when face_distance_threshold is not set in deduplication set config + duplication_detector.reset_mock() + deduplication_set.config.face_distance_threshold = None + _run_duplicate_face_finder(deduplication_set) + duplication_detector.assert_called_once_with((), face_distance_threshold) + # value from global config is used when deduplication set has no config + duplication_detector.reset_mock() + deduplication_set.config = None + _run_duplicate_face_finder(deduplication_set) + duplication_detector.assert_called_once_with((), face_distance_threshold) From 22bfbbb3fa112c978253b87ada7d1a88a7dc4a0c Mon Sep 17 00:00:00 2001 From: Sergey Misuk Date: Tue, 24 Sep 2024 12:06:11 +0300 Subject: [PATCH 3/3] Resolve merge issues --- ...onset_config.py => 0005_config_deduplicationset_config.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename src/hope_dedup_engine/apps/api/migrations/{0004_config_deduplicationset_config.py => 0005_config_deduplicationset_config.py} (88%) diff --git a/src/hope_dedup_engine/apps/api/migrations/0004_config_deduplicationset_config.py b/src/hope_dedup_engine/apps/api/migrations/0005_config_deduplicationset_config.py similarity index 88% rename from src/hope_dedup_engine/apps/api/migrations/0004_config_deduplicationset_config.py rename to src/hope_dedup_engine/apps/api/migrations/0005_config_deduplicationset_config.py index b421b0f2..ddd414cd 100644 --- a/src/hope_dedup_engine/apps/api/migrations/0004_config_deduplicationset_config.py +++ b/src/hope_dedup_engine/apps/api/migrations/0005_config_deduplicationset_config.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.7 on 2024-09-22 21:12 +# Generated by Django 5.0.7 on 2024-09-24 09:05 import django.db.models.deletion from django.db import migrations, models @@ -7,7 +7,7 @@ class Migration(migrations.Migration): dependencies = [ - ("api", "0003_remove_deduplicationset_name"), + ("api", "0004_remove_deduplicationset_error_and_more"), ] operations = [