From a13c97f4571aceaf980b2cf169f8c9301a6ce80c Mon Sep 17 00:00:00 2001
From: Stefano Apostolico
Date: Tue, 28 May 2024 18:11:16 +0200
Subject: [PATCH 1/7] Feature/ci rc (#23)

* updates CI
* updates CI
* updates CI
* updates CI
* updates CI
* updates CI
* updates CI
* updates CI
* updates CI
---
 .github/actions/hash/action.yml         |  24 ++++
 .github/actions/image_exists/action.yml |  43 ++++++++++
 .github/workflows/ci.yml                | 108 +++++------------------
 .github/workflows/test.yml              |  62 ++++++++++++
 .gitignore                              |   3 +-
 5 files changed, 153 insertions(+), 87 deletions(-)
 create mode 100644 .github/actions/hash/action.yml
 create mode 100644 .github/actions/image_exists/action.yml
 create mode 100644 .github/workflows/test.yml

diff --git a/.github/actions/hash/action.yml b/.github/actions/hash/action.yml
new file mode 100644
index 00000000..00da682f
--- /dev/null
+++ b/.github/actions/hash/action.yml
@@ -0,0 +1,24 @@
+# ref: https://docs.github.com/en/actions/creating-actions/creating-a-docker-container-action
+name: 'Calculate version hash'
+description: 'Calculate deps and OS hash'
+inputs:
+  files:
+    description: 'Files to use to calculate the hash'
+    required: true
+    default: "pdm.lock docker/bin/* docker/conf/* docker/Dockerfile"
+outputs:
+  hash: # id of output
+    description: 'The calculated short hash of the input files'
+    value: ${{ steps.calc.outputs.hash }}
+
+runs:
+  using: 'composite'
+#  args:
+#    - ${{ inputs.files }}
+  steps:
+    - name: Calculate release hash
+      id: calc
+      shell: bash --noprofile --norc -eo pipefail -ux {0}
+      run: |
+        LOCK_SHA=$(sha1sum ${{ inputs.files }} | sha1sum | awk '{print $1}' | cut -c 1-8)
+        echo "hash=$LOCK_SHA" >> "$GITHUB_OUTPUT"
diff --git a/.github/actions/image_exists/action.yml b/.github/actions/image_exists/action.yml
new file mode 100644
index 00000000..9c0c528d
--- /dev/null
+++ b/.github/actions/image_exists/action.yml
@@ -0,0 +1,43 @@
+# ref: https://docs.github.com/en/actions/creating-actions/creating-a-docker-container-action
+name: 'Check if image exists in DockerHub'
+description: 'Check whether a Docker image already exists in DockerHub'
+inputs:
+  image:
+    description: 'Docker image'
+    required: true
+  username:
+    description: 'DockerHub username'
+    required: true
+  password:
+    description: 'DockerHub password'
+    required: true
+
+outputs:
+  exists:
+    description: 'Whether the image exists ("true"/"false")'
+    value: ${{ steps.check.outputs.exists }}
+
+runs:
+  using: 'composite'
+#  args:
+#    - ${{ inputs.files }}
+  steps:
+    - name: DockerHub login
+      uses: docker/login-action@v3
+      with:
+        username: ${{ inputs.username }}
+        password: ${{ inputs.password }}
+    - name: Check Image Exists
+      id: check
+      continue-on-error: true
+      shell: bash --noprofile --norc -eo pipefail -ux {0}
+      run: |
+        set +e
+        exists=$(docker manifest inspect ${{inputs.image}} > /dev/null 2>&1 && echo "exists" || echo "not_found")
+        if [ "$exists" = "exists" ]; then
+          echo "exists=true" >> "$GITHUB_OUTPUT"
+          echo "Image ${{inputs.image}} found"
+        else
+          echo "exists=false" >> "$GITHUB_OUTPUT"
+          echo "Image ${{inputs.image}} does not exist"
+        fi
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f643f682..23c014f2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -8,52 +8,51 @@ on:
     branches:
       - develop
 env:
-  HASH_SEEDS: pdm.lock docker/bin/* docker/conf/* docker/Dockerfile
   DOCKER_CACHE_IMAGE: ${{ vars.DOCKERHUB_ORGANIZATION }}/hope-support-images
-  BUILD_DATE: $(date +"%Y-%m-%d %H:%M")
   DOCKER_DEFAULT_PLATFORM: linux/amd64
 jobs:
   build:
     runs-on: ubuntu-20.04
+    outputs:
docker-image: ${{ steps.image_name.outputs.name }} steps: - name: Checkout code uses: actions/checkout@v4 - - name: DockerHub login - uses: docker/login-action@v3 + + - uses: ./.github/actions/hash + id: release_hash + + - name: Image name + id: image_name + run: | + image_name="$DOCKER_CACHE_IMAGE:hde-dev-${{ steps.release_hash.outputs.hash }}" + image_name_latest="$DOCKER_CACHE_IMAGE:hde-dev-latest" + echo "name=$image_name" >> $GITHUB_OUTPUT + echo "latest=$image_name_latest" >> $GITHUB_OUTPUT + - name: Check if image exists + uses: ./.github/actions/image_exists + id: image_exists with: + image: ${{ steps.image_name.outputs.name }} username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Check Dev Image - continue-on-error: true - id: image_exists - run: | - set +e - LOCK_SHA=$(echo sha1sum ${{env.HASH_SEEDS}}| sha1sum | awk '{print $1}' | cut -c 1-8) - exists=$(docker manifest inspect ${DOCKER_CACHE_IMAGE}:hde-dev-${LOCK_SHA} > /dev/null 2>&1 && echo "exists" || echo "not_found") - exists="exists" - echo "result=$exists" >> "$GITHUB_OUTPUT" - echo "SHA=$LOCK_SHA" >> "$GITHUB_OUTPUT" - if [ $exists = "exists" ];then - echo "Tag ${{ steps.image_exists.outputs.SHA }} found" - else - echo "Tag ${{ steps.image_exists.outputs.SHA }} does not exist" - fi - name: Build Dev Image - if: ${{ steps.image_exists.outputs.result == 'not_found' || contains(github.event.head_commit.message, 'rebuild') }} + if: ${{ !steps.image_exists.outputs.exists || contains(github.event.head_commit.message, 'rebuild') }} + id: docker_build run: | - LOCK_SHA=${{ steps.image_exists.outputs.SHA }} + BUILD_DATE=$(date +"%Y-%m-%d %H:%M") docker buildx create --use --platform x86_64 --name builder --driver docker-container - docker buildx build \ + docker buildx build \ --platform x86_64 \ --builder builder \ --build-arg BUILD_DATE="${BUILD_DATE}" \ --progress=plain \ --cache-to type=local,ref=${DOCKER_CACHE_IMAGE}:hde-dev-latest,dest=./.AAA \ --cache-from ${DOCKER_CACHE_IMAGE}:hde-dev-latest \ - -t ${DOCKER_CACHE_IMAGE}:hde-dev-${LOCK_SHA} \ - -t ${DOCKER_CACHE_IMAGE}:hde-dev-latest \ + -t ${{ steps.image_name.outputs.name }} \ + -t ${{ steps.image_name.outputs.latest }} \ -f ./docker/Dockerfile \ --push \ --target python_dev_deps . 
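
For reference, the caching contract the two composite actions implement above is: hash the dependency seeds, look the resulting tag up on DockerHub, and rebuild only on a miss or an explicit "rebuild" commit message. A minimal Python sketch of that flow follows; the seed list and tag scheme are taken from this diff, the `unicef` organization is assumed from the test job, and hashing raw file bytes here stands in for the action's `sha1sum`-listing approach (equivalent in effect: any content change yields a new tag). It is an illustration, not part of the patch:

    import hashlib
    import subprocess
    from pathlib import Path

    # Seed files mirror the hash action's default `files` input.
    SEEDS = [
        "pdm.lock",
        *map(str, Path("docker/bin").glob("*")),
        *map(str, Path("docker/conf").glob("*")),
        "docker/Dockerfile",
    ]

    def release_hash(paths: list[str]) -> str:
        # Any change to the dependency lock or Docker context produces a new
        # short hash, and therefore a new cache tag.
        digest = hashlib.sha1()
        for path in paths:
            digest.update(Path(path).read_bytes())
        return digest.hexdigest()[:8]

    def image_exists(image: str) -> bool:
        # Same probe as the image_exists action: `docker manifest inspect`
        # succeeds only if the tag is already on the registry.
        result = subprocess.run(["docker", "manifest", "inspect", image], capture_output=True)
        return result.returncode == 0

    tag = f"unicef/hope-support-images:hde-dev-{release_hash(SEEDS)}"
    if not image_exists(tag):
        subprocess.run(
            ["docker", "buildx", "build", "-f", "docker/Dockerfile", "-t", tag, "--push", "."],
            check=True,
        )
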
@@ -73,64 +72,3 @@ jobs: run: black --check src/ - name: Flake8 run: flake8 src/ - - test: - runs-on: ubuntu-20.04 - needs: [build] - container: - image: unicef/hope-support-images:hde-dev-latest - credentials: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - ports: - - 8000:8000 - services: - redis: - image: redis - db: - image: postgres:14 - env: - POSTGRES_DATABASE: dedupe - POSTGRES_PASSWORD: postgres - POSTGRES_USERNAME: postgres - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - env: - DATABASE_URL: postgres://postgres:postgres@db:5432/dedupe - SECRET_KEY: secret_key - CACHE_URL: redis://redis:6379/0 - CELERY_BROKER_URL: redis://redis:6379/0 - PYTHONPATH: "/hde/code/src:/hde/__pypackages__/3.12/lib" - steps: - - name: Checkout code - uses: actions/checkout@v2 - - name: Run tests - run: | - pytest tests -# echo "===================================" -# ls -al -# docker run --rm \ -# -e PYTHONPATH=/hde/code/src:/hde/__pypackages__/3.12/lib \ -# -e CACHE_URL="${CACHE_URL}" \ -# -e DATABASE_URL="${DATABASE_URL}" \ -# -e SECRET_KEY="${SECRET_KEY}" \ -# -e CELERY_BROKER_URL="${CELERY_BROKER_URL}" \ -# -v ${PWD}:/hde/code/ \ -# -w /hde/code/ \ -# -t ${DOCKER_CACHE_IMAGE}:hde-dev-latest \ -# pytest tests/ --create-db -v --cov --cov-report xml:coverage.xml -# -# - name: Upload coverage to Codecov -# uses: codecov/codecov-action@v4 -# with: -# directory: ./coverage/reports/ -# env_vars: OS,PYTHON -# fail_ci_if_error: true -# files: /hde/code/coverage1.xml -# flags: unittests -# name: codecov-umbrella -# token: ${{ secrets.CODECOV_TOKEN }} -# verbose: true \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..44e92204 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,62 @@ +name: Test + +on: + push: + branches: + - develop + pull_request: + branches: + - develop + workflow_run: + workflows: [ci] + types: + - completed + +jobs: + + test: + runs-on: ubuntu-20.04 + container: + image: unicef/hope-support-images:hde-dev-latest + credentials: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + ports: + - 8000:8000 + services: + redis: + image: redis + db: + image: postgres:14 + env: + POSTGRES_DATABASE: dedupe + POSTGRES_PASSWORD: postgres + POSTGRES_USERNAME: postgres + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + env: + DATABASE_URL: postgres://postgres:postgres@db:5432/dedupe + SECRET_KEY: secret_key + CACHE_URL: redis://redis:6379/0 + CELERY_BROKER_URL: redis://redis:6379/0 + PYTHONPATH: "/hde/code/src:/hde/__pypackages__/3.12/lib" + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Run tests + run: | + pytest tests --create-db -v --cov --cov-report xml:coverage.xml +# - name: Upload coverage to Codecov +# uses: codecov/codecov-action@v4 +# with: +# directory: ./coverage/reports/ +# env_vars: OS,PYTHON +# fail_ci_if_error: true +# files: /hde/code/coverage1.xml +# flags: unittests +# name: codecov-umbrella +# token: ${{ secrets.CODECOV_TOKEN }} +# verbose: true diff --git a/.gitignore b/.gitignore index e72af819..72e9d96d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,10 +8,9 @@ __pycache__/ !.dockerignore !.flake8 !.gitignore -!.gitlab/ +!.github/* !.tx/config !.mypy.ini -!.gitlab-ci.yml !.pre-commit-config.yaml !.bumpversion.cfg !.trivyignore From 82ed4154980edb2837520fe7eb53d044f2babb40 Mon 
Sep 17 00:00:00 2001 From: vitali-yanushchyk-valor <168179384+vitali-yanushchyk-valor@users.noreply.github.com> Date: Tue, 28 May 2024 16:51:41 -0400 Subject: [PATCH 2/7] fix ! typo (#22) --- compose.yml | 6 +++--- docker/bin/docker-entrypoint.sh | 5 +++-- docker/entrypoint.sh | 38 --------------------------------- 3 files changed, 6 insertions(+), 43 deletions(-) delete mode 100755 docker/entrypoint.sh diff --git a/compose.yml b/compose.yml index da8235d4..06a6274d 100644 --- a/compose.yml +++ b/compose.yml @@ -20,7 +20,7 @@ services: <<: *common ports: - 8000:8000 - command: ["entrypoint.sh", "dev"] + command: ["docker-entrypoint.sh", "dev"] healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/healthcheck"] interval: 10s @@ -29,11 +29,11 @@ services: celery_worker: <<: *common - command: ["entrypoint.sh", "celery_worker"] + command: ["docker-entrypoint.sh", "worker"] celery_beat: <<: *common - command: ["entrypoint.sh", "celery_beat"] + command: ["docker-entrypoint.sh", "beat"] db: image: postgres:15 diff --git a/docker/bin/docker-entrypoint.sh b/docker/bin/docker-entrypoint.sh index 0ce09c78..d76cfe34 100755 --- a/docker/bin/docker-entrypoint.sh +++ b/docker/bin/docker-entrypoint.sh @@ -37,10 +37,11 @@ case "$1" in exec uwsgi --ini /conf/uwsgi.ini ;; worker) - exec celery -A hope_dedup_engine.celery worker -E --loglevel=ERROR --concurrency=4 + export C_FORCE_ROOT=1 + exec celery -A hope_dedup_engine.config.celery worker -E --loglevel=ERROR --concurrency=4 ;; beat) - exec celery -A hope_dedup_engine.celery beat -E --loglevel=ERROR ---scheduler django_celery_beat.schedulers:DatabaseScheduler + exec celery -A hope_dedup_engine.config.celery beat --loglevel=ERROR --scheduler django_celery_beat.schedulers:DatabaseScheduler ;; dev) until pg_isready -h db -p 5432; diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh deleted file mode 100755 index 88299244..00000000 --- a/docker/entrypoint.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -set -eou pipefail - -production() { - uwsgi \ - --http :8000 \ - --master \ - --module=hope_dedup_engine.config.wsgi \ - --processes=2 \ - --buffer-size=8192 -} - -if [ $# -eq 0 ]; then - production -fi - -case "$1" in - dev) - wait-for-it.sh db:5432 - ./manage.py upgrade - ./manage.py runserver 0.0.0.0:8000 - ;; - prd) - tail -f /dev/null - production - ;; - celery_worker) - export C_FORCE_ROOT=1 - celery -A hope_dedup_engine.config.celery worker -l info - ;; - celery_beat) - celery -A hope_dedup_engine.config.celery beat -l info - ;; - *) - exec "$@" - ;; -esac \ No newline at end of file From fdb25da38aa1d33ff14b368fd4a4a25ccca0030c Mon Sep 17 00:00:00 2001 From: vitali-yanushchyk-valor <168179384+vitali-yanushchyk-valor@users.noreply.github.com> Date: Wed, 29 May 2024 07:05:58 -0400 Subject: [PATCH 3/7] Feature/celery results (#24) add ! 
django_celery_results --- compose.yml | 1 + pdm.lock | 117 ++++++++++-------- pyproject.toml | 6 +- src/hope_dedup_engine/__init__.py | 5 + src/hope_dedup_engine/apps/faces/admin.py | 16 --- .../apps/faces/celery_tasks.py | 13 +- .../apps/faces/migrations/0001_initial.py | 49 -------- .../apps/faces/migrations/__init__.py | 0 .../apps/faces/models/__init__.py | 1 - .../apps/faces/models/task_model.py | 24 ---- .../apps/faces/utils/celery_utils.py | 19 +-- .../apps/faces/utils/duplication_detector.py | 2 - src/hope_dedup_engine/config/celery.py | 4 + .../config/fragments/celery.py | 10 +- src/hope_dedup_engine/config/settings.py | 3 +- tests/conftest.py | 2 +- .../fixtures/{tasks.py => celery_tasks.py} | 15 +-- .../{test_tasks.py => test_celery_tasks.py} | 40 ++---- tests/faces/test_duplication_detector.py | 2 +- 19 files changed, 116 insertions(+), 213 deletions(-) delete mode 100644 src/hope_dedup_engine/apps/faces/admin.py delete mode 100644 src/hope_dedup_engine/apps/faces/migrations/0001_initial.py delete mode 100644 src/hope_dedup_engine/apps/faces/migrations/__init__.py delete mode 100644 src/hope_dedup_engine/apps/faces/models/__init__.py delete mode 100644 src/hope_dedup_engine/apps/faces/models/task_model.py rename tests/faces/fixtures/{tasks.py => celery_tasks.py} (58%) rename tests/faces/{test_tasks.py => test_celery_tasks.py} (54%) diff --git a/compose.yml b/compose.yml index 06a6274d..5d6fc8e5 100644 --- a/compose.yml +++ b/compose.yml @@ -70,6 +70,7 @@ services: azurite: image: mcr.microsoft.com/azure-storage/azurite command: "azurite -l /workspace -d /workspace/debug.log --blobPort 10000 --blobHost 0.0.0.0 --loose" + restart: always ports: - "10000:10000" # Blob service volumes: diff --git a/pdm.lock b/pdm.lock index 5827246b..235662af 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.1" -content_hash = "sha256:7d3f2042a8da96d2ff7757f480ddc05f36918c8b03c9275b2a838fae8c0d5278" +content_hash = "sha256:165cd62ebad1eac8924f3c14694414f6d7f2fd991008a97d353bd18b9b3e08ee" [[package]] name = "amqp" @@ -317,48 +317,48 @@ files = [ [[package]] name = "coverage" -version = "7.5.1" +version = "7.5.2" requires_python = ">=3.8" summary = "Code coverage measurement for Python" groups = ["dev"] files = [ - {file = "coverage-7.5.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b6cf3764c030e5338e7f61f95bd21147963cf6aa16e09d2f74f1fa52013c1206"}, - {file = "coverage-7.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ec92012fefebee89a6b9c79bc39051a6cb3891d562b9270ab10ecfdadbc0c34"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16db7f26000a07efcf6aea00316f6ac57e7d9a96501e990a36f40c965ec7a95d"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:beccf7b8a10b09c4ae543582c1319c6df47d78fd732f854ac68d518ee1fb97fa"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8748731ad392d736cc9ccac03c9845b13bb07d020a33423fa5b3a36521ac6e4e"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7352b9161b33fd0b643ccd1f21f3a3908daaddf414f1c6cb9d3a2fd618bf2572"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:7a588d39e0925f6a2bff87154752481273cdb1736270642aeb3635cb9b4cad07"}, - {file = 
"coverage-7.5.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:68f962d9b72ce69ea8621f57551b2fa9c70509af757ee3b8105d4f51b92b41a7"}, - {file = "coverage-7.5.1-cp312-cp312-win32.whl", hash = "sha256:f152cbf5b88aaeb836127d920dd0f5e7edff5a66f10c079157306c4343d86c19"}, - {file = "coverage-7.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:5a5740d1fb60ddf268a3811bcd353de34eb56dc24e8f52a7f05ee513b2d4f596"}, - {file = "coverage-7.5.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:6537e7c10cc47c595828b8a8be04c72144725c383c4702703ff4e42e44577312"}, - {file = "coverage-7.5.1.tar.gz", hash = "sha256:54de9ef3a9da981f7af93eafde4ede199e0846cd819eb27c88e2b712aae9708c"}, + {file = "coverage-7.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9a42970ce74c88bdf144df11c52c5cf4ad610d860de87c0883385a1c9d9fa4ab"}, + {file = "coverage-7.5.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26716a1118c6ce2188283b4b60a898c3be29b480acbd0a91446ced4fe4e780d8"}, + {file = "coverage-7.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60b66b0363c5a2a79fba3d1cd7430c25bbd92c923d031cae906bdcb6e054d9a2"}, + {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5d22eba19273b2069e4efeff88c897a26bdc64633cbe0357a198f92dca94268"}, + {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3bb5b92a0ab3d22dfdbfe845e2fef92717b067bdf41a5b68c7e3e857c0cff1a4"}, + {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1aef719b6559b521ae913ddeb38f5048c6d1a3d366865e8b320270b7bc4693c2"}, + {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8809c0ea0e8454f756e3bd5c36d04dddf222989216788a25bfd6724bfcee342c"}, + {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1acc2e2ef098a1d4bf535758085f508097316d738101a97c3f996bccba963ea5"}, + {file = "coverage-7.5.2-cp312-cp312-win32.whl", hash = "sha256:97de509043d3f0f2b2cd171bdccf408f175c7f7a99d36d566b1ae4dd84107985"}, + {file = "coverage-7.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:8941e35a0e991a7a20a1fa3e3182f82abe357211f2c335a9e6007067c3392fcf"}, + {file = "coverage-7.5.2-pp38.pp39.pp310-none-any.whl", hash = "sha256:40dbb8e7727560fe8ab65efcddfec1ae25f30ef02e2f2e5d78cfb52a66781ec5"}, + {file = "coverage-7.5.2.tar.gz", hash = "sha256:13017a63b0e499c59b5ba94a8542fb62864ba3016127d1e4ef30d354fc2b00e9"}, ] [[package]] name = "coverage" -version = "7.5.1" +version = "7.5.2" extras = ["toml"] requires_python = ">=3.8" summary = "Code coverage measurement for Python" groups = ["dev"] dependencies = [ - "coverage==7.5.1", + "coverage==7.5.2", ] files = [ - {file = "coverage-7.5.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b6cf3764c030e5338e7f61f95bd21147963cf6aa16e09d2f74f1fa52013c1206"}, - {file = "coverage-7.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ec92012fefebee89a6b9c79bc39051a6cb3891d562b9270ab10ecfdadbc0c34"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16db7f26000a07efcf6aea00316f6ac57e7d9a96501e990a36f40c965ec7a95d"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:beccf7b8a10b09c4ae543582c1319c6df47d78fd732f854ac68d518ee1fb97fa"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:8748731ad392d736cc9ccac03c9845b13bb07d020a33423fa5b3a36521ac6e4e"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7352b9161b33fd0b643ccd1f21f3a3908daaddf414f1c6cb9d3a2fd618bf2572"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:7a588d39e0925f6a2bff87154752481273cdb1736270642aeb3635cb9b4cad07"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:68f962d9b72ce69ea8621f57551b2fa9c70509af757ee3b8105d4f51b92b41a7"}, - {file = "coverage-7.5.1-cp312-cp312-win32.whl", hash = "sha256:f152cbf5b88aaeb836127d920dd0f5e7edff5a66f10c079157306c4343d86c19"}, - {file = "coverage-7.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:5a5740d1fb60ddf268a3811bcd353de34eb56dc24e8f52a7f05ee513b2d4f596"}, - {file = "coverage-7.5.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:6537e7c10cc47c595828b8a8be04c72144725c383c4702703ff4e42e44577312"}, - {file = "coverage-7.5.1.tar.gz", hash = "sha256:54de9ef3a9da981f7af93eafde4ede199e0846cd819eb27c88e2b712aae9708c"}, + {file = "coverage-7.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9a42970ce74c88bdf144df11c52c5cf4ad610d860de87c0883385a1c9d9fa4ab"}, + {file = "coverage-7.5.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26716a1118c6ce2188283b4b60a898c3be29b480acbd0a91446ced4fe4e780d8"}, + {file = "coverage-7.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60b66b0363c5a2a79fba3d1cd7430c25bbd92c923d031cae906bdcb6e054d9a2"}, + {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5d22eba19273b2069e4efeff88c897a26bdc64633cbe0357a198f92dca94268"}, + {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3bb5b92a0ab3d22dfdbfe845e2fef92717b067bdf41a5b68c7e3e857c0cff1a4"}, + {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1aef719b6559b521ae913ddeb38f5048c6d1a3d366865e8b320270b7bc4693c2"}, + {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8809c0ea0e8454f756e3bd5c36d04dddf222989216788a25bfd6724bfcee342c"}, + {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1acc2e2ef098a1d4bf535758085f508097316d738101a97c3f996bccba963ea5"}, + {file = "coverage-7.5.2-cp312-cp312-win32.whl", hash = "sha256:97de509043d3f0f2b2cd171bdccf408f175c7f7a99d36d566b1ae4dd84107985"}, + {file = "coverage-7.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:8941e35a0e991a7a20a1fa3e3182f82abe357211f2c335a9e6007067c3392fcf"}, + {file = "coverage-7.5.2-pp38.pp39.pp310-none-any.whl", hash = "sha256:40dbb8e7727560fe8ab65efcddfec1ae25f30ef02e2f2e5d78cfb52a66781ec5"}, + {file = "coverage-7.5.2.tar.gz", hash = "sha256:13017a63b0e499c59b5ba94a8542fb62864ba3016127d1e4ef30d354fc2b00e9"}, ] [[package]] @@ -527,6 +527,20 @@ files = [ {file = "django-celery-beat-2.6.0.tar.gz", hash = "sha256:f75b2d129731f1214be8383e18fae6bfeacdb55dffb2116ce849222c0106f9ad"}, ] +[[package]] +name = "django-celery-results" +version = "2.5.1" +summary = "Celery result backends for Django." 
+groups = ["default"] +dependencies = [ + "Django>=3.2.18", + "celery<6.0,>=5.2.7", +] +files = [ + {file = "django_celery_results-2.5.1-py3-none-any.whl", hash = "sha256:0da4cd5ecc049333e4524a23fcfc3460dfae91aa0a60f1fae4b6b2889c254e01"}, + {file = "django_celery_results-2.5.1.tar.gz", hash = "sha256:3ecb7147f773f34d0381bac6246337ce4cf88a2ea7b82774ed48e518b67bb8fd"}, +] + [[package]] name = "django-concurrency" version = "2.5" @@ -579,17 +593,17 @@ files = [ [[package]] name = "django-debug-toolbar" -version = "4.3.0" +version = "4.4.2" requires_python = ">=3.8" summary = "A configurable set of panels that display various debug information about the current request/response." groups = ["default"] dependencies = [ - "django>=3.2.4", + "django>=4.2.9", "sqlparse>=0.2", ] files = [ - {file = "django_debug_toolbar-4.3.0-py3-none-any.whl", hash = "sha256:e09b7dcb8417b743234dfc57c95a7c1d1d87a88844abd13b4c5387f807b31bf6"}, - {file = "django_debug_toolbar-4.3.0.tar.gz", hash = "sha256:0b0dddee5ea29b9cb678593bc0d7a6d76b21d7799cb68e091a2148341a80f3c4"}, + {file = "django_debug_toolbar-4.4.2-py3-none-any.whl", hash = "sha256:5d7afb2ea5f8730241e5b0735396e16cd1fd8c6b53a2f3e1e30bbab9abb23728"}, + {file = "django_debug_toolbar-4.4.2.tar.gz", hash = "sha256:9204050fcb1e4f74216c5b024bc76081451926a6303993d6c513f5e142675927"}, ] [[package]] @@ -1016,20 +1030,6 @@ files = [ {file = "graphene_stubs-0.16-py3-none-any.whl", hash = "sha256:7fae3ff663344db1b3ee5b187f054a1d018bb63c364c3624890fe02960c6affe"}, ] -[[package]] -name = "gunicorn" -version = "22.0.0" -requires_python = ">=3.7" -summary = "WSGI HTTP Server for UNIX" -groups = ["default"] -dependencies = [ - "packaging", -] -files = [ - {file = "gunicorn-22.0.0-py3-none-any.whl", hash = "sha256:350679f91b24062c86e386e198a15438d53a7a8207235a78ba1b53df4c4378d9"}, - {file = "gunicorn-22.0.0.tar.gz", hash = "sha256:4a0b436239ff76fb33f11c07a16482c521a7e09c1ce3cc293c2330afe01bec63"}, -] - [[package]] name = "identify" version = "2.5.36" @@ -1462,7 +1462,7 @@ name = "packaging" version = "24.0" requires_python = ">=3.7" summary = "Core utilities for Python packages" -groups = ["default", "dev"] +groups = ["dev"] files = [ {file = "packaging-24.0-py3-none-any.whl", hash = "sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5"}, {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"}, @@ -1582,7 +1582,7 @@ files = [ [[package]] name = "prompt-toolkit" -version = "3.0.43" +version = "3.0.45" requires_python = ">=3.7.0" summary = "Library for building powerful interactive command lines in Python" groups = ["default", "dev"] @@ -1590,8 +1590,8 @@ dependencies = [ "wcwidth", ] files = [ - {file = "prompt_toolkit-3.0.43-py3-none-any.whl", hash = "sha256:a11a29cb3bf0a28a387fe5122cdb649816a957cd9261dcedf8c9f1fef33eacf6"}, - {file = "prompt_toolkit-3.0.43.tar.gz", hash = "sha256:3527b7af26106cbc65a040bcc84839a3566ec1b051bb0bfe953631e704b0ff7d"}, + {file = "prompt_toolkit-3.0.45-py3-none-any.whl", hash = "sha256:a29b89160e494e3ea8622b09fa5897610b437884dcdcd054fdc1308883326c2a"}, + {file = "prompt_toolkit-3.0.45.tar.gz", hash = "sha256:07c60ee4ab7b7e90824b61afa840c8f5aad2d46b3e2e10acc33d8ecc94a49089"}, ] [[package]] @@ -2114,6 +2114,23 @@ files = [ {file = "sentry_sdk-2.3.1.tar.gz", hash = "sha256:139a71a19f5e9eb5d3623942491ce03cf8ebc14ea2e39ba3e6fe79560d8a5b1f"}, ] +[[package]] +name = "sentry-sdk" +version = "2.3.1" +extras = ["celery", "django"] +requires_python = ">=3.6" +summary = "Python 
client for Sentry (https://sentry.io)" +groups = ["default"] +dependencies = [ + "celery>=3", + "django>=1.8", + "sentry-sdk==2.3.1", +] +files = [ + {file = "sentry_sdk-2.3.1-py2.py3-none-any.whl", hash = "sha256:c5aeb095ba226391d337dd42a6f9470d86c9fc236ecc71cfc7cd1942b45010c6"}, + {file = "sentry_sdk-2.3.1.tar.gz", hash = "sha256:139a71a19f5e9eb5d3623942491ce03cf8ebc14ea2e39ba3e6fe79560d8a5b1f"}, +] + [[package]] name = "setuptools" version = "70.0.0" diff --git a/pyproject.toml b/pyproject.toml index f30e2ff8..f1868a97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,11 +31,15 @@ dependencies = [ "face-recognition>=1.3.0", "opencv-python>=4.9.0.80", "psycopg2-binary>=2.9.9", - "sentry-sdk", + "sentry-sdk[celery,django]>=2.2.1", "social-auth-app-django", "social-auth-core", "unicef-security", "uwsgi>=2.0.25.1", + "drf-nested-routers>=0.94.1", + "face-recognition>=1.3.0", + "opencv-python>=4.9.0.80", + "django-celery-results>=2.5.1", ] [tool.pdm.build] diff --git a/src/hope_dedup_engine/__init__.py b/src/hope_dedup_engine/__init__.py index 428345e8..0e2df472 100644 --- a/src/hope_dedup_engine/__init__.py +++ b/src/hope_dedup_engine/__init__.py @@ -1 +1,6 @@ +from hope_dedup_engine.config.celery import app as celery_app + + VERSION = __version__ = "0.1.0" + +__all__ = ("celery_app",) diff --git a/src/hope_dedup_engine/apps/faces/admin.py b/src/hope_dedup_engine/apps/faces/admin.py deleted file mode 100644 index d845c120..00000000 --- a/src/hope_dedup_engine/apps/faces/admin.py +++ /dev/null @@ -1,16 +0,0 @@ -from django.contrib import admin -from django.http.request import HttpRequest - -from .models import TaskModel - - -@admin.register(TaskModel) -class TaskModelAdmin(admin.ModelAdmin): - def has_add_permission(self, request: HttpRequest) -> bool: - return False - - def has_change_permission(self, request: HttpRequest, obj=None) -> bool: - return False - - def has_delete_permission(self, request: HttpRequest, obj=None) -> bool: - return False diff --git a/src/hope_dedup_engine/apps/faces/celery_tasks.py b/src/hope_dedup_engine/apps/faces/celery_tasks.py index 32a1954c..aaf28981 100644 --- a/src/hope_dedup_engine/apps/faces/celery_tasks.py +++ b/src/hope_dedup_engine/apps/faces/celery_tasks.py @@ -1,4 +1,6 @@ -from celery import shared_task +import traceback + +from celery import shared_task, states from hope_dedup_engine.apps.faces.utils.celery_utils import task_lifecycle from hope_dedup_engine.apps.faces.utils.duplication_detector import DuplicationDetector @@ -8,6 +10,9 @@ @task_lifecycle(name="Deduplicate", ttl=1 * 60 * 60) # TODO: Use DeduplicationSet objects as input to deduplication pipeline def deduplicate(self, filename: str): - # deduplicate.delay(filename=filename) - dd = DuplicationDetector(filename) - return dd.find_duplicates() + try: + dd = DuplicationDetector(filename) + return dd.find_duplicates() + except Exception as e: + self.update_state(state=states.FAILURE, meta={"exc_message": str(e), "traceback": traceback.format_exc()}) + raise e diff --git a/src/hope_dedup_engine/apps/faces/migrations/0001_initial.py b/src/hope_dedup_engine/apps/faces/migrations/0001_initial.py deleted file mode 100644 index 00c8b4c2..00000000 --- a/src/hope_dedup_engine/apps/faces/migrations/0001_initial.py +++ /dev/null @@ -1,49 +0,0 @@ -# Generated by Django 4.2.11 on 2024-05-06 17:41 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - initial = True - - dependencies = [] - - operations = [ - migrations.CreateModel( - name="TaskModel", - fields=[ 
- ( - "id", - models.AutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("name", models.CharField(max_length=100)), - ("celery_task_id", models.UUIDField(blank=True, null=True)), - ( - "status", - models.CharField( - choices=[ - ("PROCESSING", "Processing"), - ("COMPLETED_SUCCESS", "Completed Successfully"), - ("FAILED", "Failed"), - ], - default="PROCESSING", - max_length=20, - ), - ), - ("is_success", models.BooleanField(default=False)), - ("error", models.TextField(blank=True, null=True)), - ("created_at", models.DateTimeField(auto_now_add=True)), - ("completed_at", models.DateTimeField(blank=True, null=True)), - ], - options={ - "verbose_name": "Task", - "ordering": ["-created_at"], - }, - ), - ] diff --git a/src/hope_dedup_engine/apps/faces/migrations/__init__.py b/src/hope_dedup_engine/apps/faces/migrations/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/hope_dedup_engine/apps/faces/models/__init__.py b/src/hope_dedup_engine/apps/faces/models/__init__.py deleted file mode 100644 index c3c2f051..00000000 --- a/src/hope_dedup_engine/apps/faces/models/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .task_model import TaskModel # noqa F401 diff --git a/src/hope_dedup_engine/apps/faces/models/task_model.py b/src/hope_dedup_engine/apps/faces/models/task_model.py deleted file mode 100644 index 55d5b5a4..00000000 --- a/src/hope_dedup_engine/apps/faces/models/task_model.py +++ /dev/null @@ -1,24 +0,0 @@ -from django.db import models -from django.utils.translation import gettext_lazy as _ - - -class TaskModel(models.Model): - class StatusChoices(models.TextChoices): - PROCESSING = "PROCESSING", _("Processing") - COMPLETED_SUCCESS = "COMPLETED_SUCCESS", _("Completed Successfully") - FAILED = "FAILED", _("Failed") - - name = models.CharField(max_length=100) - celery_task_id = models.UUIDField(null=True, blank=True) - status = models.CharField(max_length=20, choices=StatusChoices.choices, default=StatusChoices.PROCESSING) - is_success = models.BooleanField(default=False) - error = models.TextField(null=True, blank=True) - created_at = models.DateTimeField(auto_now_add=True) - completed_at = models.DateTimeField(null=True, blank=True) - - def __str__(self): - return f"{self.name} - {self.status} - {self.created_at} - {self.completed_at}" - - class Meta: - ordering = ["-created_at"] - verbose_name = "Task" diff --git a/src/hope_dedup_engine/apps/faces/utils/celery_utils.py b/src/hope_dedup_engine/apps/faces/utils/celery_utils.py index 8e45ce04..27ee145d 100644 --- a/src/hope_dedup_engine/apps/faces/utils/celery_utils.py +++ b/src/hope_dedup_engine/apps/faces/utils/celery_utils.py @@ -2,12 +2,9 @@ from functools import wraps from django.conf import settings -from django.utils import timezone import redis -from hope_dedup_engine.apps.faces.models import TaskModel - redis_client = redis.Redis.from_url(settings.CELERY_BROKER_URL) @@ -17,6 +14,7 @@ def decorator(func): def wrapper(self, *args, **kwargs): logger = logging.getLogger(func.__module__) logger.info(f"{name} task started") + result = None filename: str = args[0] if args else kwargs.get("filename") lock_name: str = f"{name}_{filename}" @@ -24,26 +22,13 @@ def wrapper(self, *args, **kwargs): logger.info(f"Task {name} with brocker lock {lock_name} is already running.") return None - task: TaskModel = None - result = None - try: - task = TaskModel.objects.create(name=name, celery_task_id=self.request.id) result = func(self, *args, **kwargs) - task.status = 
TaskModel.StatusChoices.COMPLETED_SUCCESS - task.completed_at = timezone.now() - task.is_success = True except Exception as e: logger.exception(f"{name} task failed", exc_info=e) - if task: - task.status = TaskModel.StatusChoices.FAILED - task.completed_at = timezone.now() - task.is_success = False - task.error = str(e) + raise e finally: _release_lock(lock_name) - if task: - task.save(update_fields=["status", "completed_at", "is_success", "error"]) logger.info(f"{name} task ended") return result diff --git a/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py b/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py index 0312fb99..a0618035 100644 --- a/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py +++ b/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py @@ -13,7 +13,6 @@ class DuplicationDetector: def __init__(self, filename: str) -> None: - print() self.logger = logging.getLogger(__name__) self.storages = { @@ -110,7 +109,6 @@ def find_duplicates(self) -> Tuple[str]: for encoding1 in encodings1: for encoding2 in encodings2: distance = face_recognition.face_distance([encoding1], encoding2) - print(f"{distance.item():10.8f}\t{path1} vs {path2}") if distance < settings.DISTANCE_THRESHOLD: duplicated_images.update([path1, path2]) break diff --git a/src/hope_dedup_engine/config/celery.py b/src/hope_dedup_engine/config/celery.py index 503f09ff..e4249a60 100644 --- a/src/hope_dedup_engine/config/celery.py +++ b/src/hope_dedup_engine/config/celery.py @@ -4,10 +4,14 @@ import sentry_sdk from celery import Celery, signals +from hope_dedup_engine.config import settings + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "hope_dedup_engine.config.settings") + app = Celery("hde") app.config_from_object("django.conf:settings", namespace="CELERY") +app.autodiscover_tasks(lambda: settings.INSTALLED_APPS, related_name="celery_tasks") @signals.celeryd_init.connect diff --git a/src/hope_dedup_engine/config/fragments/celery.py b/src/hope_dedup_engine/config/fragments/celery.py index 94d6f80f..ee15ce52 100644 --- a/src/hope_dedup_engine/config/fragments/celery.py +++ b/src/hope_dedup_engine/config/fragments/celery.py @@ -18,7 +18,6 @@ CELERY_TASK_TIME_LIMIT = None CELERY_TASK_TRACK_STARTED = True - CELERY_WORKER_DISABLE_RATE_LIMITS = False CELERY_WORKER_PREFETCH_MULTIPLIER = 1 @@ -26,10 +25,11 @@ CELERY_CACHE_BACKEND = "django-cache" -# CELERY_RESULT_BACKEND = "django-db" -# CELERY_RESULT_EXPIRES = None -# CELERY_RESULT_EXTENDED = True -# CELERY_RESULT_SERIALIZER = "json" +CELERY_RESULT_BACKEND = "django-db" +CELERY_RESULT_PERSISTENT = True +CELERY_RESULT_EXPIRES = None +CELERY_RESULT_EXTENDED = True +CELERY_RESULT_SERIALIZER = "json" CELERY_BROKER_CONNECTION_RETRY = False CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP = False diff --git a/src/hope_dedup_engine/config/settings.py b/src/hope_dedup_engine/config/settings.py index 9c115e80..d1e75080 100644 --- a/src/hope_dedup_engine/config/settings.py +++ b/src/hope_dedup_engine/config/settings.py @@ -18,7 +18,6 @@ "hope_dedup_engine.web", "hope_dedup_engine.apps.core.apps.Config", "hope_dedup_engine.apps.security.apps.Config", - "hope_dedup_engine.apps.faces.apps.Config", # "unicef_security", "django.contrib.contenttypes", "django.contrib.auth", @@ -39,9 +38,11 @@ "constance", "rest_framework", "django_celery_beat", + "django_celery_results", "drf_spectacular", "drf_spectacular_sidecar", "hope_dedup_engine.apps.api", + "hope_dedup_engine.apps.faces", "storages", ) diff --git a/tests/conftest.py b/tests/conftest.py 
index 6823372f..3734f277 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -64,5 +64,5 @@ def mocked_responses(): yield rsps +from faces.fixtures.celery_tasks import * # noqa: E402, F401, F403 from faces.fixtures.duplication_detector import * # noqa: E402, F401, F403 -from faces.fixtures.tasks import * # noqa: E402, F401, F403 diff --git a/tests/faces/fixtures/tasks.py b/tests/faces/fixtures/celery_tasks.py similarity index 58% rename from tests/faces/fixtures/tasks.py rename to tests/faces/fixtures/celery_tasks.py index 15570f28..1bcd89c0 100644 --- a/tests/faces/fixtures/tasks.py +++ b/tests/faces/fixtures/celery_tasks.py @@ -1,12 +1,12 @@ -from unittest.mock import MagicMock, patch +from unittest.mock import patch import pytest +import docker + @pytest.fixture(scope="session") def docker_client(): - import docker - client = docker.from_env() yield client client.close() @@ -24,12 +24,3 @@ def mock_duplication_detector(): "hope_dedup_engine.apps.faces.utils.duplication_detector.DuplicationDetector.find_duplicates" ) as mock_find: yield mock_find - - -@pytest.fixture -def mock_task_model(): - with patch("hope_dedup_engine.apps.faces.models.TaskModel.objects.create") as mock_create: - mock_instance = MagicMock() - mock_create.return_value = mock_instance - mock_instance.save = MagicMock() - yield mock_create, mock_instance diff --git a/tests/faces/test_tasks.py b/tests/faces/test_celery_tasks.py similarity index 54% rename from tests/faces/test_tasks.py rename to tests/faces/test_celery_tasks.py index 0851809b..dc986b64 100644 --- a/tests/faces/test_tasks.py +++ b/tests/faces/test_celery_tasks.py @@ -1,16 +1,13 @@ from unittest.mock import patch +from celery import states from faces_const import FILENAME, FILENAMES from hope_dedup_engine.apps.faces.celery_tasks import deduplicate -from hope_dedup_engine.apps.faces.models import TaskModel -def test_deduplicate_task_already_running( - mock_redis_client, mock_duplication_detector, mock_task_model, celery_app, celery_worker -): +def test_deduplicate_task_already_running(mock_redis_client, mock_duplication_detector, celery_app, celery_worker): mock_set, mock_delete = mock_redis_client - mock_create, _ = mock_task_model mock_set.return_value = False # Lock is not acquired task = deduplicate.apply(args=[FILENAME]) @@ -19,52 +16,37 @@ def test_deduplicate_task_already_running( mock_duplication_detector.assert_not_called() # DeduplicationDetector is not called mock_set.assert_called_once_with(f"Deduplicate_{FILENAME}", "true", nx=True, ex=3600) mock_delete.assert_not_called() # Lock is not released - mock_create.assert_not_called() # TaskModel is not created -def test_deduplicate_task_success( - dd, mock_redis_client, mock_duplication_detector, mock_task_model, celery_app, celery_worker -): +def test_deduplicate_task_success(dd, mock_redis_client, mock_duplication_detector, celery_app, celery_worker): mock_set, mock_delete = mock_redis_client - mock_create, mock_task_instance = mock_task_model mock_find = mock_duplication_detector - mock_set.return_value = True # Lock is acquired mock_find.return_value = set(FILENAMES[:2]) # Assuming the first two are duplicates based on mock data with patch("hope_dedup_engine.apps.faces.celery_tasks.DuplicationDetector", return_value=dd): task_result = deduplicate.apply(args=[FILENAME]).get() + assert task_result == set(FILENAMES[:2]) # Assuming the first two are duplicates based on mock data mock_set.assert_called_once_with(f"Deduplicate_{FILENAME}", "true", nx=True, ex=3600) 
mock_delete.assert_called_once_with(f"Deduplicate_{FILENAME}") # Lock is released - mock_create.assert_called_once() # TaskModel is created - mock_task_instance.save.assert_called_once() # TaskModel is saved - assert mock_task_instance.status == TaskModel.StatusChoices.COMPLETED_SUCCESS - assert mock_task_instance.is_success is True - assert mock_task_instance.completed_at is not None - def test_deduplicate_task_exception_handling( - dd, mock_redis_client, mock_task_model, mock_duplication_detector, celery_app, celery_worker + dd, mock_redis_client, mock_duplication_detector, celery_app, celery_worker ): mock_set, mock_delete = mock_redis_client - mock_create, mock_task_instance = mock_task_model mock_find = mock_duplication_detector mock_find.side_effect = Exception("Simulated task failure") with patch("hope_dedup_engine.apps.faces.celery_tasks.DuplicationDetector", return_value=dd): task = deduplicate.apply(args=[FILENAME]) - assert task.result is None # Task is not executed - mock_duplication_detector.assert_called_once() # DeduplicationDetector is called - - mock_create.assert_called_once_with(name="Deduplicate", celery_task_id=task.id) - mock_task_instance.save.assert_called_once() - assert mock_task_instance.status == TaskModel.StatusChoices.FAILED - assert not mock_task_instance.is_success - assert mock_task_instance.error == "Simulated task failure" + assert task.state == states.FAILURE + assert isinstance(task.result, Exception) + assert str(task.result) == "Simulated task failure" + assert task.traceback is not None - # Check that the Redis lock was acquired and then released + mock_find.assert_called_once() mock_set.assert_called_once_with(f"Deduplicate_{FILENAME}", "true", nx=True, ex=3600) - mock_delete.assert_called_once_with(f"Deduplicate_{FILENAME}") + mock_delete.assert_called_once_with(f"Deduplicate_{FILENAME}") # Lock is released diff --git a/tests/faces/test_duplication_detector.py b/tests/faces/test_duplication_detector.py index 48c49d29..d63d6c7a 100644 --- a/tests/faces/test_duplication_detector.py +++ b/tests/faces/test_duplication_detector.py @@ -118,7 +118,6 @@ def test_load_encodings_all_exception_handling(dd): try: dd._load_encodings_all() except Exception: - print(f"\n{dd.logger.exception.assert_called_once()=}") ... 
dd.logger.reset_mock() @@ -149,6 +148,7 @@ def test_encode_face_invalid_region(dd, image_bytes_io): # Check that the error was logged with the correct message mock_error_logger.assert_called_once_with(f"Invalid face region {(0, 0, 10)}") + dd.logger.reset_mock() def test_encode_face_exception_handling(dd): From 21d1de6b99571c1b3d278ea8abc0b045f9280ca8 Mon Sep 17 00:00:00 2001 From: sergey-misuk-valor <168101676+sergey-misuk-valor@users.noreply.github.com> Date: Mon, 10 Jun 2024 14:45:45 +0400 Subject: [PATCH 4/7] Add status callback url (#28) * Add status callback url * Fix failing test --- pdm.lock | 8 +++--- pyproject.toml | 1 + .../apps/api/migrations/0001_initial.py | 3 +- .../apps/api/models/deduplication.py | 1 + src/hope_dedup_engine/apps/api/utils.py | 10 +++++++ tests/api/test_utils.py | 28 +++++++++++++++++++ tests/extras/testutils/factories/api.py | 1 + 7 files changed, 47 insertions(+), 5 deletions(-) create mode 100644 tests/api/test_utils.py diff --git a/pdm.lock b/pdm.lock index 235662af..be5ced83 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.1" -content_hash = "sha256:165cd62ebad1eac8924f3c14694414f6d7f2fd991008a97d353bd18b9b3e08ee" +content_hash = "sha256:07d1e2f3b569e6b60d3825c075de0d525c36024c441e7d4fcbace92bb8cd931b" [[package]] name = "amqp" @@ -1969,7 +1969,7 @@ files = [ [[package]] name = "requests" -version = "2.32.2" +version = "2.32.3" requires_python = ">=3.8" summary = "Python HTTP for Humans." groups = ["default", "dev"] @@ -1980,8 +1980,8 @@ dependencies = [ "urllib3<3,>=1.21.1", ] files = [ - {file = "requests-2.32.2-py3-none-any.whl", hash = "sha256:fc06670dd0ed212426dfeb94fc1b983d917c4f9847c863f313c9dfaaffb7c23c"}, - {file = "requests-2.32.2.tar.gz", hash = "sha256:dd951ff5ecf3e3b3aa26b40703ba77495dab41da839ae72ef3c8e5d8e2433289"}, + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index f1868a97..dca935bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ "face-recognition>=1.3.0", "opencv-python>=4.9.0.80", "django-celery-results>=2.5.1", + "requests>=2.32.3", ] [tool.pdm.build] diff --git a/src/hope_dedup_engine/apps/api/migrations/0001_initial.py b/src/hope_dedup_engine/apps/api/migrations/0001_initial.py index 9a1a30a3..37ad2d18 100644 --- a/src/hope_dedup_engine/apps/api/migrations/0001_initial.py +++ b/src/hope_dedup_engine/apps/api/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.6 on 2024-05-24 11:45 +# Generated by Django 5.0.6 on 2024-06-04 13:30 import django.db.models.deletion import uuid @@ -32,6 +32,7 @@ class Migration(migrations.Migration): ("error", models.CharField(blank=True, max_length=255, null=True)), ("created_at", models.DateTimeField(auto_now_add=True)), ("updated_at", models.DateTimeField(auto_now=True)), + ("notification_url", models.CharField(blank=True, max_length=255, null=True)), ( "created_by", models.ForeignKey( diff --git a/src/hope_dedup_engine/apps/api/models/deduplication.py b/src/hope_dedup_engine/apps/api/models/deduplication.py index 817d6a80..77da90e4 100644 --- a/src/hope_dedup_engine/apps/api/models/deduplication.py +++ b/src/hope_dedup_engine/apps/api/models/deduplication.py @@ -33,6 +33,7 @@ class 
State(models.IntegerChoices): settings.AUTH_USER_MODEL, on_delete=models.CASCADE, null=True, blank=True, related_name="+" ) updated_at = models.DateTimeField(auto_now=True) + notification_url = models.CharField(max_length=255, null=True, blank=True) class Image(models.Model): diff --git a/src/hope_dedup_engine/apps/api/utils.py b/src/hope_dedup_engine/apps/api/utils.py index fbb30c28..af45893e 100644 --- a/src/hope_dedup_engine/apps/api/utils.py +++ b/src/hope_dedup_engine/apps/api/utils.py @@ -1,3 +1,5 @@ +import requests + from hope_dedup_engine.apps.api.models import DeduplicationSet @@ -7,3 +9,11 @@ def start_processing(_: DeduplicationSet) -> None: def delete_model_data(_: DeduplicationSet) -> None: pass + + +REQUEST_TIMEOUT = 5 + + +def send_notification(deduplication_set: DeduplicationSet) -> None: + if url := deduplication_set.notification_url: + requests.get(url, timeout=REQUEST_TIMEOUT) diff --git a/tests/api/test_utils.py b/tests/api/test_utils.py new file mode 100644 index 00000000..64385448 --- /dev/null +++ b/tests/api/test_utils.py @@ -0,0 +1,28 @@ +from unittest.mock import MagicMock + +from pytest import fixture, mark +from pytest_mock import MockerFixture + +from hope_dedup_engine.apps.api.models import DeduplicationSet +from hope_dedup_engine.apps.api.utils import REQUEST_TIMEOUT, send_notification + + +@fixture +def requests_get_mock(mocker: MockerFixture) -> MagicMock: + return mocker.patch("hope_dedup_engine.apps.api.utils.requests.get") + + +@mark.parametrize("deduplication_set__notification_url", ("https://example.com",)) +def test_notification_is_sent_when_url_is_set( + requests_get_mock: MagicMock, deduplication_set: DeduplicationSet +) -> None: + send_notification(deduplication_set) + requests_get_mock.assert_called_once_with(deduplication_set.notification_url, timeout=REQUEST_TIMEOUT) + + +@mark.parametrize("deduplication_set__notification_url", (None,)) +def test_notification_is_not_sent_when_url_is_not_set( + requests_get_mock: MagicMock, deduplication_set: DeduplicationSet +) -> None: + send_notification(deduplication_set) + requests_get_mock.assert_not_called() diff --git a/tests/extras/testutils/factories/api.py b/tests/extras/testutils/factories/api.py index 66bef975..931a7823 100644 --- a/tests/extras/testutils/factories/api.py +++ b/tests/extras/testutils/factories/api.py @@ -16,6 +16,7 @@ class DeduplicationSetFactory(DjangoModelFactory): reference_pk = fuzzy.FuzzyText() external_system = SubFactory(ExternalSystemFactory) state = DeduplicationSet.State.CLEAN + notification_url = fuzzy.FuzzyText() class Meta: model = DeduplicationSet From c3eae1b75522e850711472df023cc1779af7c1fa Mon Sep 17 00:00:00 2001 From: sergey-misuk-valor <168101676+sergey-misuk-valor@users.noreply.github.com> Date: Mon, 10 Jun 2024 14:48:17 +0400 Subject: [PATCH 5/7] Register API models in admin (#27) --- src/hope_dedup_engine/apps/api/admin.py | 8 ++++++++ .../apps/api/models/__init__.py | 2 +- tests/extras/testutils/factories/__init__.py | 16 +++++++++++++++- tests/extras/testutils/factories/api.py | 4 +++- 4 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 src/hope_dedup_engine/apps/api/admin.py diff --git a/src/hope_dedup_engine/apps/api/admin.py b/src/hope_dedup_engine/apps/api/admin.py new file mode 100644 index 00000000..9f384e0a --- /dev/null +++ b/src/hope_dedup_engine/apps/api/admin.py @@ -0,0 +1,8 @@ +from django.contrib import admin + +from hope_dedup_engine.apps.api.models import DeduplicationSet, Duplicate, HDEToken, Image + 
+admin.site.register(DeduplicationSet) +admin.site.register(Duplicate) +admin.site.register(HDEToken) +admin.site.register(Image) diff --git a/src/hope_dedup_engine/apps/api/models/__init__.py b/src/hope_dedup_engine/apps/api/models/__init__.py index 7ae749fe..571a4bfd 100644 --- a/src/hope_dedup_engine/apps/api/models/__init__.py +++ b/src/hope_dedup_engine/apps/api/models/__init__.py @@ -1,2 +1,2 @@ from hope_dedup_engine.apps.api.models.auth import HDEToken # noqa: F401 -from hope_dedup_engine.apps.api.models.deduplication import DeduplicationSet # noqa: F401 +from hope_dedup_engine.apps.api.models.deduplication import DeduplicationSet, Duplicate, Image # noqa: F401 diff --git a/tests/extras/testutils/factories/__init__.py b/tests/extras/testutils/factories/__init__.py index 80ea8a38..2a687069 100644 --- a/tests/extras/testutils/factories/__init__.py +++ b/tests/extras/testutils/factories/__init__.py @@ -1,3 +1,8 @@ +import importlib +import pkgutil +from pathlib import Path + +from factory.django import DjangoModelFactory from pytest_factoryboy import register from .base import AutoRegisterModelFactory, TAutoRegisterModelFactory, factories_registry @@ -6,8 +11,14 @@ from .user import ExternalSystemFactory, GroupFactory, SuperUserFactory, User, UserFactory # noqa from .userrole import UserRole, UserRoleFactory # noqa +for _, name, _ in pkgutil.iter_modules([str(Path(__file__).parent)]): + importlib.import_module(f".{name}", __package__) + + +django_model_factories = {factory._meta.model: factory for factory in DjangoModelFactory.__subclasses__()} -def get_factory_for_model(_model) -> type[TAutoRegisterModelFactory]: + +def get_factory_for_model(_model) -> type[TAutoRegisterModelFactory] | type[DjangoModelFactory]: class Meta: model = _model @@ -15,4 +26,7 @@ class Meta: if _model in factories_registry: return factories_registry[_model] # noqa + if _model in django_model_factories: + return django_model_factories[_model] + return register(type(f"{_model._meta.model_name}AutoCreatedFactory", bases, {"Meta": Meta})) # noqa diff --git a/tests/extras/testutils/factories/api.py b/tests/extras/testutils/factories/api.py index 931a7823..a795e755 100644 --- a/tests/extras/testutils/factories/api.py +++ b/tests/extras/testutils/factories/api.py @@ -1,12 +1,14 @@ from factory import SubFactory, fuzzy from factory.django import DjangoModelFactory -from testutils.factories import ExternalSystemFactory +from testutils.factories import ExternalSystemFactory, UserFactory from hope_dedup_engine.apps.api.models import DeduplicationSet, HDEToken from hope_dedup_engine.apps.api.models.deduplication import Duplicate, Image class TokenFactory(DjangoModelFactory): + user = SubFactory(UserFactory) + class Meta: model = HDEToken From c84dd6162b0681786ca28debd9651801b31034cd Mon Sep 17 00:00:00 2001 From: vitali-yanushchyk-valor <168179384+vitali-yanushchyk-valor@users.noreply.github.com> Date: Mon, 10 Jun 2024 06:50:42 -0400 Subject: [PATCH 6/7] Feature/fc constance (#25) * chg ! comment * add ! processing case if face regions for image are not detected * chg ! tests * chg ! move face_recognition settings to constance * chg ! tests * fix ! int for cv2 backends * chg ! pdm lock * chg ! get some parametres for fr from proto file * chg ! refactor DuplicationDetector, add NMS * add ! validator for means * chg ! optimize find duplicates, tests * chg ! 
tests --- pdm.lock | 71 +++---- pdm.toml | 5 - .../apps/faces/utils/duplication_detector.py | 157 ++++++++++---- .../apps/faces/validators.py | 24 +++ .../config/fragments/constance.py | 127 ++++++++++- .../config/fragments/recognition.py | 7 - src/hope_dedup_engine/config/settings.py | 1 - tests/faces/faces_const.py | 25 +++ tests/faces/fixtures/duplication_detector.py | 90 +++----- tests/faces/test_celery_tasks.py | 1 + tests/faces/test_duplication_detector.py | 201 +++++++++++------- tests/faces/test_validators.py | 41 ++++ 12 files changed, 519 insertions(+), 231 deletions(-) delete mode 100644 pdm.toml create mode 100644 src/hope_dedup_engine/apps/faces/validators.py delete mode 100644 src/hope_dedup_engine/config/fragments/recognition.py create mode 100644 tests/faces/test_validators.py diff --git a/pdm.lock b/pdm.lock index be5ced83..f9eba18d 100644 --- a/pdm.lock +++ b/pdm.lock @@ -317,48 +317,48 @@ files = [ [[package]] name = "coverage" -version = "7.5.2" +version = "7.5.3" requires_python = ">=3.8" summary = "Code coverage measurement for Python" groups = ["dev"] files = [ - {file = "coverage-7.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9a42970ce74c88bdf144df11c52c5cf4ad610d860de87c0883385a1c9d9fa4ab"}, - {file = "coverage-7.5.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26716a1118c6ce2188283b4b60a898c3be29b480acbd0a91446ced4fe4e780d8"}, - {file = "coverage-7.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60b66b0363c5a2a79fba3d1cd7430c25bbd92c923d031cae906bdcb6e054d9a2"}, - {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5d22eba19273b2069e4efeff88c897a26bdc64633cbe0357a198f92dca94268"}, - {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3bb5b92a0ab3d22dfdbfe845e2fef92717b067bdf41a5b68c7e3e857c0cff1a4"}, - {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1aef719b6559b521ae913ddeb38f5048c6d1a3d366865e8b320270b7bc4693c2"}, - {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8809c0ea0e8454f756e3bd5c36d04dddf222989216788a25bfd6724bfcee342c"}, - {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1acc2e2ef098a1d4bf535758085f508097316d738101a97c3f996bccba963ea5"}, - {file = "coverage-7.5.2-cp312-cp312-win32.whl", hash = "sha256:97de509043d3f0f2b2cd171bdccf408f175c7f7a99d36d566b1ae4dd84107985"}, - {file = "coverage-7.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:8941e35a0e991a7a20a1fa3e3182f82abe357211f2c335a9e6007067c3392fcf"}, - {file = "coverage-7.5.2-pp38.pp39.pp310-none-any.whl", hash = "sha256:40dbb8e7727560fe8ab65efcddfec1ae25f30ef02e2f2e5d78cfb52a66781ec5"}, - {file = "coverage-7.5.2.tar.gz", hash = "sha256:13017a63b0e499c59b5ba94a8542fb62864ba3016127d1e4ef30d354fc2b00e9"}, + {file = "coverage-7.5.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:296a7d9bbc598e8744c00f7a6cecf1da9b30ae9ad51c566291ff1314e6cbbed8"}, + {file = "coverage-7.5.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:34d6d21d8795a97b14d503dcaf74226ae51eb1f2bd41015d3ef332a24d0a17b3"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e317953bb4c074c06c798a11dbdd2cf9979dbcaa8ccc0fa4701d80042d4ebf1"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:705f3d7c2b098c40f5b81790a5fedb274113373d4d1a69e65f8b68b0cc26f6db"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1196e13c45e327d6cd0b6e471530a1882f1017eb83c6229fc613cd1a11b53cd"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:015eddc5ccd5364dcb902eaecf9515636806fa1e0d5bef5769d06d0f31b54523"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:fd27d8b49e574e50caa65196d908f80e4dff64d7e592d0c59788b45aad7e8b35"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:33fc65740267222fc02975c061eb7167185fef4cc8f2770267ee8bf7d6a42f84"}, + {file = "coverage-7.5.3-cp312-cp312-win32.whl", hash = "sha256:7b2a19e13dfb5c8e145c7a6ea959485ee8e2204699903c88c7d25283584bfc08"}, + {file = "coverage-7.5.3-cp312-cp312-win_amd64.whl", hash = "sha256:0bbddc54bbacfc09b3edaec644d4ac90c08ee8ed4844b0f86227dcda2d428fcb"}, + {file = "coverage-7.5.3-pp38.pp39.pp310-none-any.whl", hash = "sha256:3538d8fb1ee9bdd2e2692b3b18c22bb1c19ffbefd06880f5ac496e42d7bb3884"}, + {file = "coverage-7.5.3.tar.gz", hash = "sha256:04aefca5190d1dc7a53a4c1a5a7f8568811306d7a8ee231c42fb69215571944f"}, ] [[package]] name = "coverage" -version = "7.5.2" +version = "7.5.3" extras = ["toml"] requires_python = ">=3.8" summary = "Code coverage measurement for Python" groups = ["dev"] dependencies = [ - "coverage==7.5.2", + "coverage==7.5.3", ] files = [ - {file = "coverage-7.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9a42970ce74c88bdf144df11c52c5cf4ad610d860de87c0883385a1c9d9fa4ab"}, - {file = "coverage-7.5.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26716a1118c6ce2188283b4b60a898c3be29b480acbd0a91446ced4fe4e780d8"}, - {file = "coverage-7.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60b66b0363c5a2a79fba3d1cd7430c25bbd92c923d031cae906bdcb6e054d9a2"}, - {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5d22eba19273b2069e4efeff88c897a26bdc64633cbe0357a198f92dca94268"}, - {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3bb5b92a0ab3d22dfdbfe845e2fef92717b067bdf41a5b68c7e3e857c0cff1a4"}, - {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1aef719b6559b521ae913ddeb38f5048c6d1a3d366865e8b320270b7bc4693c2"}, - {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8809c0ea0e8454f756e3bd5c36d04dddf222989216788a25bfd6724bfcee342c"}, - {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1acc2e2ef098a1d4bf535758085f508097316d738101a97c3f996bccba963ea5"}, - {file = "coverage-7.5.2-cp312-cp312-win32.whl", hash = "sha256:97de509043d3f0f2b2cd171bdccf408f175c7f7a99d36d566b1ae4dd84107985"}, - {file = "coverage-7.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:8941e35a0e991a7a20a1fa3e3182f82abe357211f2c335a9e6007067c3392fcf"}, - {file = "coverage-7.5.2-pp38.pp39.pp310-none-any.whl", hash = "sha256:40dbb8e7727560fe8ab65efcddfec1ae25f30ef02e2f2e5d78cfb52a66781ec5"}, - {file = "coverage-7.5.2.tar.gz", hash = "sha256:13017a63b0e499c59b5ba94a8542fb62864ba3016127d1e4ef30d354fc2b00e9"}, + {file = "coverage-7.5.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:296a7d9bbc598e8744c00f7a6cecf1da9b30ae9ad51c566291ff1314e6cbbed8"}, + {file = 
"coverage-7.5.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:34d6d21d8795a97b14d503dcaf74226ae51eb1f2bd41015d3ef332a24d0a17b3"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e317953bb4c074c06c798a11dbdd2cf9979dbcaa8ccc0fa4701d80042d4ebf1"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705f3d7c2b098c40f5b81790a5fedb274113373d4d1a69e65f8b68b0cc26f6db"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1196e13c45e327d6cd0b6e471530a1882f1017eb83c6229fc613cd1a11b53cd"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:015eddc5ccd5364dcb902eaecf9515636806fa1e0d5bef5769d06d0f31b54523"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:fd27d8b49e574e50caa65196d908f80e4dff64d7e592d0c59788b45aad7e8b35"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:33fc65740267222fc02975c061eb7167185fef4cc8f2770267ee8bf7d6a42f84"}, + {file = "coverage-7.5.3-cp312-cp312-win32.whl", hash = "sha256:7b2a19e13dfb5c8e145c7a6ea959485ee8e2204699903c88c7d25283584bfc08"}, + {file = "coverage-7.5.3-cp312-cp312-win_amd64.whl", hash = "sha256:0bbddc54bbacfc09b3edaec644d4ac90c08ee8ed4844b0f86227dcda2d428fcb"}, + {file = "coverage-7.5.3-pp38.pp39.pp310-none-any.whl", hash = "sha256:3538d8fb1ee9bdd2e2692b3b18c22bb1c19ffbefd06880f5ac496e42d7bb3884"}, + {file = "coverage-7.5.3.tar.gz", hash = "sha256:04aefca5190d1dc7a53a4c1a5a7f8568811306d7a8ee231c42fb69215571944f"}, ] [[package]] @@ -919,7 +919,7 @@ files = [ [[package]] name = "faker" -version = "25.2.0" +version = "25.3.0" requires_python = ">=3.8" summary = "Faker is a Python package that generates fake data for you." 
groups = ["dev"] @@ -927,8 +927,8 @@ dependencies = [ "python-dateutil>=2.4", ] files = [ - {file = "Faker-25.2.0-py3-none-any.whl", hash = "sha256:cfe97c4857c4c36ee32ea4aaabef884895992e209bae4cbd26807cf3e05c6918"}, - {file = "Faker-25.2.0.tar.gz", hash = "sha256:45b84f47ff1ef86e3d1a8d11583ca871ecf6730fad0660edadc02576583a2423"}, + {file = "Faker-25.3.0-py3-none-any.whl", hash = "sha256:0158d47e955b6ec22134c0a74ebb7ed34fe600896208bafbf1008db831b17f04"}, + {file = "Faker-25.3.0.tar.gz", hash = "sha256:bcbe31eee5ef4bbf87ce36c4eba53c01e2a1d912fde2a4d3528b430d2beb784f"}, ] [[package]] @@ -1358,16 +1358,13 @@ files = [ [[package]] name = "nodeenv" -version = "1.8.0" -requires_python = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*" +version = "1.9.0" +requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" summary = "Node.js virtual environment builder" groups = ["dev"] -dependencies = [ - "setuptools", -] files = [ - {file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"}, - {file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"}, + {file = "nodeenv-1.9.0-py2.py3-none-any.whl", hash = "sha256:508ecec98f9f3330b636d4448c0f1a56fc68017c68f1e7857ebc52acf0eb879a"}, + {file = "nodeenv-1.9.0.tar.gz", hash = "sha256:07f144e90dae547bf0d4ee8da0ee42664a42a04e02ed68e06324348dafe4bdb1"}, ] [[package]] diff --git a/pdm.toml b/pdm.toml deleted file mode 100644 index 0e5b99fa..00000000 --- a/pdm.toml +++ /dev/null @@ -1,5 +0,0 @@ -[python] -use_venv = true - -[venv] -in_project = true diff --git a/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py b/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py index a0618035..b1ec8fd1 100644 --- a/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py +++ b/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py @@ -1,21 +1,31 @@ import logging import os -from typing import Dict, List, Tuple +import re from django.conf import settings import cv2 import face_recognition import numpy as np +from constance import config from hope_dedup_engine.apps.core.storage import CV2DNNStorage, HDEAzureStorage, HOPEAzureStorage class DuplicationDetector: - def __init__(self, filename: str) -> None: - self.logger = logging.getLogger(__name__) + """ + A class to detect and process duplicate faces in images. + """ - self.storages = { + def __init__(self, filename: str) -> None: + """ + Initialize the DuplicationDetector with the given filename. + + Args: + filename (str): The filename of the image to process. 
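+
+        Raises:
+            FileNotFoundError: If the required prototxt or caffemodel file is missing from the cv2dnn storage.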
+        """
+        self.logger: logging.Logger = logging.getLogger(__name__)
+        self.storages: dict[str, CV2DNNStorage | HDEAzureStorage | HOPEAzureStorage] = {
             "images": HOPEAzureStorage(),
             "cv2dnn": CV2DNNStorage(settings.CV2DNN_PATH),
             "encoded": HDEAzureStorage(),
@@ -25,48 +35,88 @@ def __init__(self, filename: str) -> None:
             if not self.storages.get("cv2dnn").exists(file):
                 raise FileNotFoundError(f"File {file} does not exist in storage.")

-        self.net = cv2.dnn.readNetFromCaffe(
-            self.storages.get("cv2dnn").path(settings.PROTOTXT_FILE),
-            self.storages.get("cv2dnn").path(settings.CAFFEMODEL_FILE),
-        )
-
-        self.net.setPreferableBackend(settings.DNN_BACKEND)
-        self.net.setPreferableTarget(settings.DNN_TARGET)
+        self.shape: dict[str, int] = self._get_shape()
+        self.net: cv2.dnn_Net = self._set_net(self.storages.get("cv2dnn"))

         self.filename: str = filename
-        self.encodings_filename = f"{self.filename}.npy"
-
-        self.confidence: float = settings.FACE_DETECTION_CONFIDENCE
-        self.threshold: float = settings.DISTANCE_THRESHOLD
+        self.encodings_filename: str = f"{self.filename}.npy"
+        self.scale_factor: float = config.BLOB_FROM_IMAGE_SCALE_FACTOR
+        self.mean_values: tuple[float, float, float] = tuple(map(float, config.BLOB_FROM_IMAGE_MEAN_VALUES.split(", ")))
+        # self.mean_values: config.BLOB_FROM_IMAGE_MEAN_VALUES
+        self.face_detection_confidence: float = config.FACE_DETECTION_CONFIDENCE
+        self.face_encodings_model: str = config.FACE_ENCODINGS_MODEL
+        self.face_encodings_num_jitters: int = config.FACE_ENCODINGS_NUM_JITTERS
+        self.distance_threshold: float = config.FACE_DISTANCE_THRESHOLD
+        self.nms_threshold: float = config.NMS_THRESHOLD

     @property
     def has_encodings(self) -> bool:
         return self.storages["encoded"].exists(self.encodings_filename)

-    def _get_face_detections_dnn(self) -> List[Tuple[int, int, int, int]]:
-        # TODO: Implement case if face regions for image are not detected
-        face_regions: List[Tuple[int, int, int, int]] = []
+    def _set_net(self, storage: CV2DNNStorage) -> cv2.dnn_Net:
+        net = cv2.dnn.readNetFromCaffe(
+            storage.path(settings.PROTOTXT_FILE),
+            storage.path(settings.CAFFEMODEL_FILE),
+        )
+        net.setPreferableBackend(int(config.DNN_BACKEND))
+        net.setPreferableTarget(int(config.DNN_TARGET))
+        return net
+
+    def _get_shape(self) -> dict[str, int]:
+        pattern = r"input_shape\s*\{\s*" r"dim:\s*(\d+)\s*" r"dim:\s*(\d+)\s*" r"dim:\s*(\d+)\s*" r"dim:\s*(\d+)\s*\}"
+        with open(settings.PROTOTXT_FILE, "r") as file:
+            if match := re.search(pattern, file.read()):
+                return {
+                    "batch_size": int(match.group(1)),
+                    "channels": int(match.group(2)),
+                    "height": int(match.group(3)),
+                    "width": int(match.group(4)),
+                }
+            else:
+                raise ValueError("Could not find input_shape in prototxt file.")
+
+    def _get_face_detections_dnn(self) -> list[tuple[int, int, int, int]]:
+        face_regions: list[tuple[int, int, int, int]] = []
         try:
             with self.storages["images"].open(self.filename, "rb") as img_file:
                 img_array = np.frombuffer(img_file.read(), dtype=np.uint8)
+                # Decode image from binary buffer to a 3D numpy array (height, width, channels of the BGR color space)
                 image = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
             (h, w) = image.shape[:2]
+            # Create a blob (4D tensor) from the image
             blob = cv2.dnn.blobFromImage(
-                image=cv2.resize(image, dsize=(300, 300)), scalefactor=1.0, size=(300, 300), mean=(104.0, 177.0, 123.0)
+                image=cv2.resize(image, dsize=(self.shape["height"], self.shape["width"])),
+                size=(self.shape["height"], self.shape["width"]),
+                scalefactor=self.scale_factor,
+                mean=self.mean_values,
             )
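+            # The resulting blob is a 4D float32 tensor of shape (1, channels, height, width),
+            # resized, mean-subtracted and scaled according to the settings above.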
             self.net.setInput(blob)
+            # Forward pass to get output with shape (1, 1, N, 7),
+            # where N is the number of detections and 7 are the detection values:
+            # 1st: image index (always 0), 2nd: class label (always 0), 3rd: confidence (0-1),
+            # 4th-7th: normalized bounding-box corners (x1, y1, x2, y2)
             detections = self.net.forward()
-            for i in range(0, detections.shape[2]):
+            boxes, confidences = [], []
+            for i in range(detections.shape[2]):
                 confidence = detections[0, 0, i, 2]
-                if confidence > self.confidence:
-                    box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
-                    face_regions.append(tuple(box.astype("int").tolist()))
+                # Filter out weak detections: keep only those above the configured minimum confidence
+                if confidence > self.face_detection_confidence:
+                    box = (detections[0, 0, i, 3:7] * np.array([w, h, w, h])).astype("int")
+                    boxes.append(box)
+                    confidences.append(confidence)
+            if boxes:
+                # Apply non-maxima suppression to suppress weak, overlapping bounding boxes
+                indices = cv2.dnn.NMSBoxes(boxes, confidences, self.face_detection_confidence, self.nms_threshold)
+                if indices is not None:
+                    for i in indices:
+                        face_regions.append(tuple(boxes[i]))
         except Exception as e:
-            self.logger.exception(f"Error processing face detection for image {self.filename}", exc_info=e)
+            self.logger.exception("Error processing face detection for image %s", self.filename)
+            raise e
         return face_regions

-    def _load_encodings_all(self) -> Dict[str, List[np.ndarray]]:
-        data: Dict[str, List[np.ndarray]] = {}
+    def _load_encodings_all(self) -> dict[str, list[np.ndarray]]:
+        data: dict[str, list[np.ndarray]] = {}
         try:
             _, files = self.storages["encoded"].listdir("")
             for file in files:
@@ -74,7 +124,8 @@ def _load_encodings_all(self) -> Dict[str, List[np.ndarray]]:
                 with self.storages["encoded"].open(file, "rb") as f:
                     data[os.path.splitext(file)[0]] = np.load(f, allow_pickle=False)
         except Exception as e:
-            self.logger.exception(f"Error loading encodings: {e}", exc_info=True)
+            self.logger.exception("Error loading encodings.")
+            raise e
         return data

     def _encode_face(self) -> None:
@@ -83,20 +134,36 @@ def _encode_face(self) -> None:
             image = face_recognition.load_image_file(img_file)
             encodings: list = []
             face_regions = self._get_face_detections_dnn()
-            for region in face_regions:
-                if isinstance(region, (list, tuple)) and len(region) == 4:
-                    top, right, bottom, left = region
-                    face_encodings = face_recognition.face_encodings(image, [(top, right, bottom, left)], model="hog")
-                    encodings.extend(face_encodings)
-                else:
-                    self.logger.error(f"Invalid face region {region}")
-            with self.storages["encoded"].open(self.encodings_filename, "wb") as f:
-                np.save(f, encodings)
+            if not face_regions:
+                self.logger.error("No face regions detected in image %s", self.filename)
+            else:
+                for region in face_regions:
+                    if isinstance(region, (list, tuple)) and len(region) == 4:
+                        top, right, bottom, left = region
+                        # Compute the face encodings for the face regions in the image
+                        face_encodings = face_recognition.face_encodings(
+                            image,
+                            [(top, right, bottom, left)],
+                            num_jitters=self.face_encodings_num_jitters,
+                            model=self.face_encodings_model,
+                        )
+                        encodings.extend(face_encodings)
+                    else:
+                        self.logger.error("Invalid face region.")
+                with self.storages["encoded"].open(self.encodings_filename, "wb") as f:
+                    np.save(f, encodings)
         except Exception as e:
-            self.logger.exception(f"Error processing face encodings for image {self.filename}", exc_info=e)
+            self.logger.exception("Error processing face encodings for image %s", self.filename)
+            raise e
+
+    def find_duplicates(self) -> tuple[str]:
+        """
+        Find and return duplicate images based on face encodings.

-    def find_duplicates(self) -> Tuple[str]:
-        duplicated_images = set()
+        Returns:
+            tuple[str]: A tuple of filenames of duplicate images.
+        """
+        duplicated_images: set[str] = set()
         path1 = self.filename
         try:
             if not self.has_encodings:
@@ -104,17 +171,23 @@ def find_duplicates(self) -> Tuple[str]:
             encodings_all = self._load_encodings_all()
             encodings1 = encodings_all[path1]

+            checked_pairs = set()
             for path2, encodings2 in encodings_all.items():
                 if path1 != path2:
                     for encoding1 in encodings1:
                         for encoding2 in encodings2:
+                            if (path1, path2, tuple(encoding1), tuple(encoding2)) in checked_pairs:
+                                continue
+
                             distance = face_recognition.face_distance([encoding1], encoding2)
-                            if distance < settings.DISTANCE_THRESHOLD:
+                            if distance < self.distance_threshold:
                                 duplicated_images.update([path1, path2])
                                 break
+
+                            checked_pairs.add((path1, path2, tuple(encoding1), tuple(encoding2)))
                     if path2 in duplicated_images:
                         break
             return tuple(duplicated_images)
         except Exception as e:
-            self.logger.exception(f"Error finding duplicates for image {path1}", exc_info=e)
-            return tuple(duplicated_images)
+            self.logger.exception("Error finding duplicates for image %s", path1)
+            raise e
diff --git a/src/hope_dedup_engine/apps/faces/validators.py b/src/hope_dedup_engine/apps/faces/validators.py
new file mode 100644
index 00000000..893275ba
--- /dev/null
+++ b/src/hope_dedup_engine/apps/faces/validators.py
@@ -0,0 +1,24 @@
+from django.forms import CharField, ValidationError
+
+
+class MeanValuesTupleField(CharField):
+    def to_python(self, value):
+        try:
+            values = tuple(map(float, value.split(", ")))
+            if len(values) != 3:
+                raise ValueError("The tuple must have exactly three elements.")
+            if not all(-255 <= v <= 255 for v in values):
+                raise ValueError("Each value in the tuple must be between -255 and 255.")
+            return values
+        except Exception as e:
+            raise ValidationError(
+                """
+                Enter a valid tuple of three float values separated by commas and spaces, e.g. '0.0, 0.0, 0.0'.
+                Each value must be between -255 and 255.
+                """
+            ) from e
+
+    def prepare_value(self, value):
+        if isinstance(value, tuple):
+            return ", ".join(map(str, value))
+        return value
diff --git a/src/hope_dedup_engine/config/fragments/constance.py b/src/hope_dedup_engine/config/fragments/constance.py
index a4f2c8fe..555dbc49 100644
--- a/src/hope_dedup_engine/config/fragments/constance.py
+++ b/src/hope_dedup_engine/config/fragments/constance.py
@@ -1,15 +1,132 @@
+import cv2
+
 from hope_dedup_engine.apps.security.constants import DEFAULT_GROUP_NAME

 CONSTANCE_BACKEND = "constance.backends.database.DatabaseBackend"

+CONSTANCE_CONFIG = {
+    "NEW_USER_IS_STAFF": (False, "Set any new user as staff", bool),
+    "NEW_USER_DEFAULT_GROUP": (DEFAULT_GROUP_NAME, "Group to assign to any new user", str),
+    "DNN_BACKEND": (
+        cv2.dnn.DNN_BACKEND_OPENCV,
+        "Specifies the computation backend to be used by OpenCV for deep learning inference.",
+        "dnn_backend",
+    ),
+    "DNN_TARGET": (
+        cv2.dnn.DNN_TARGET_CPU,
+        "Specifies the target device on which OpenCV will perform the deep learning computations.",
+        "dnn_target",
+    ),
+    "BLOB_FROM_IMAGE_SCALE_FACTOR": (
+        1.0,
+        """Specifies the scaling factor applied to all pixel values when converting an image to a blob. Typically
+        it is 1.0 for no scaling, or 1.0/255.0 to normalize pixel values to the [0, 1] range.
+        Remember that the scaling factor is also applied to the mean values; both
        must be the same for training and inference to get correct results.
+        """,
+        float,
+    ),
+    "BLOB_FROM_IMAGE_MEAN_VALUES": (
+        "104.0, 177.0, 123.0",
+        """Specifies the mean BGR values used in image preprocessing to normalize pixel values by subtracting
+        the mean values of the training dataset. This helps in reducing model bias and improving accuracy.
+        The specified mean values are subtracted from each channel (Blue, Green, Red) of the input image.
+        Remember that the scaling factor is also applied to the mean values; both must be the same
+        for training and inference to get correct results.
+        """,
+        "tuple_field",
+    ),
+    "FACE_DETECTION_CONFIDENCE": (
+        0.7,
+        """
+        Specifies the minimum confidence score required for a detected face to be considered valid. Detections
+        with confidence scores below this threshold are discarded as likely false positives.
+        """,
+        float,
+    ),
+    "NMS_THRESHOLD": (
+        0.4,
+        """
+        Specifies the Intersection over Union (IoU) threshold used in Non-Maximum Suppression (NMS) to filter out
+        overlapping bounding boxes. If the IoU between two boxes exceeds this threshold, the box with the lower
+        confidence score is suppressed. Lower values result in fewer, more distinct boxes; higher values allow more
+        overlapping boxes to remain.
+        """,
+        float,
+    ),
+    "FACE_ENCODINGS_NUM_JITTERS": (
+        1,
+        """
+        Specifies the number of times to re-sample the face when calculating the encoding. Higher values increase
+        accuracy but are computationally more expensive and slower. For example, setting 'num_jitters' to 100 makes
+        the process 100 times slower.
+        """,
+        int,
+    ),
+    "FACE_ENCODINGS_MODEL": (
+        "small",
+        """
+        Specifies the model type used for encoding face landmarks. It can be either 'small', which is faster and
+        detects only 5 key facial landmarks, or 'large', which is more precise and identifies 68 key facial landmarks
+        but requires more computational resources.
+        """,
+        "face_encodings_model",
+    ),
+    "FACE_DISTANCE_THRESHOLD": (
+        0.5,
+        """
+        Specifies the maximum allowable distance between two face embeddings for them to be considered a match. It
+        helps determine if two faces belong to the same person by setting a threshold for similarity. Lower values
+        result in stricter matching, while higher values allow for more lenient matches.
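+        A commonly cited starting point for the 128-dimensional dlib-style encodings used by face_recognition
+        is 0.6; the stricter default of 0.5 here trades some recall for fewer false matches.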
+ """, + float, + ), +} + + +CONSTANCE_CONFIG_FIELDSETS = { + "User settings": { + "fields": ("NEW_USER_IS_STAFF", "NEW_USER_DEFAULT_GROUP"), + "collapse": False, + }, + "Face recognition settings": { + "fields": ( + "DNN_BACKEND", + "DNN_TARGET", + "BLOB_FROM_IMAGE_SCALE_FACTOR", + "BLOB_FROM_IMAGE_MEAN_VALUES", + "FACE_DETECTION_CONFIDENCE", + "NMS_THRESHOLD", + "FACE_ENCODINGS_NUM_JITTERS", + "FACE_ENCODINGS_MODEL", + "FACE_DISTANCE_THRESHOLD", + ), + "collapse": False, + }, +} + CONSTANCE_ADDITIONAL_FIELDS = { "email": [ "django.forms.EmailField", {}, ], -} - -CONSTANCE_CONFIG = { - "NEW_USER_IS_STAFF": (False, "Set any new user as staff", bool), - "NEW_USER_DEFAULT_GROUP": (DEFAULT_GROUP_NAME, "Group to assign to any new user", str), + "dnn_backend": [ + "django.forms.ChoiceField", + { + "choices": ((cv2.dnn.DNN_BACKEND_OPENCV, "DNN_BACKEND_OPENCV"),), + }, + ], + "dnn_target": [ + "django.forms.ChoiceField", + { + "choices": ((cv2.dnn.DNN_TARGET_CPU, "DNN_TARGET_CPU"),), + }, + ], + "face_encodings_model": [ + "django.forms.ChoiceField", + { + "choices": (("small", "SMALL"), ("large", "LARGE")), + }, + ], + "tuple_field": ["hope_dedup_engine.apps.faces.validators.MeanValuesTupleField", {}], } diff --git a/src/hope_dedup_engine/config/fragments/recognition.py b/src/hope_dedup_engine/config/fragments/recognition.py deleted file mode 100644 index 7b349441..00000000 --- a/src/hope_dedup_engine/config/fragments/recognition.py +++ /dev/null @@ -1,7 +0,0 @@ -import cv2 - -DNN_BACKEND = cv2.dnn.DNN_TARGET_CPU -DNN_TARGET = cv2.dnn.DNN_TARGET_CPU - -FACE_DETECTION_CONFIDENCE = 0.5 -DISTANCE_THRESHOLD = 0.4 diff --git a/src/hope_dedup_engine/config/settings.py b/src/hope_dedup_engine/config/settings.py index d1e75080..ef57a797 100644 --- a/src/hope_dedup_engine/config/settings.py +++ b/src/hope_dedup_engine/config/settings.py @@ -190,7 +190,6 @@ from .fragments.csp import * # noqa from .fragments.debug_toolbar import * # noqa from .fragments.flags import * # noqa -from .fragments.recognition import * # noqa from .fragments.rest_framework import * # noqa from .fragments.root import * # noqa from .fragments.sentry import * # noqa diff --git a/tests/faces/faces_const.py b/tests/faces/faces_const.py index 0d7df597..7a506a4e 100644 --- a/tests/faces/faces_const.py +++ b/tests/faces/faces_const.py @@ -2,3 +2,28 @@ FILENAME: Final[str] = "test_file.jpg" FILENAMES: Final[list[str]] = ["test_file.jpg", "test_file2.jpg"] +DEPLOY_PROTO_CONTENT: Final[str] = "input_shape { dim: 1 dim: 3 dim: 300 dim: 300 }" +DEPLOY_PROTO_SHAPE: Final[dict[str, int]] = {"batch_size": 1, "channels": 3, "height": 300, "width": 300} +FACE_REGIONS_INVALID: Final[list[list[tuple[int, int, int, int]]]] = [[], [(0, 0, 10)]] +FACE_REGIONS_VALID: Final[list[tuple[int, int, int, int]]] = [ + (10, 10, 20, 20), + (30, 30, 40, 40), +] +FACE_DETECTION_CONFIDENCE: Final[float] = 0.7 +FACE_DETECTIONS: Final[list[tuple[float]]] = [ + (0, 0, 0.95, 0.1, 0.1, 0.2, 0.2), # with confidence 0.95 -> valid detection + (0, 0, 0.75, 0.3, 0.3, 0.4, 0.4), # with confidence 0.75 -> valid detection + (0, 0, 0.15, 0.1, 0.1, 0.2, 0.2), # with confidence 0.15 -> invalid detection +] +IMAGE_SIZE: Final[tuple[int, int, int]] = (100, 100, 3) # Size of the image after decoding (h, w, number of channels) +RESIZED_IMAGE_SIZE: Final[tuple[int, int, int]] = ( + 300, + 300, + 3, +) # Size of the image after resizing for processing (h, w, number of channels) +BLOB_SHAPE: Final[tuple[int, int, int, int]] = ( + 1, + 3, + 300, + 300, +) # Shape of the blob (4D tensor) 
for input to the neural network (batch_size, channels, h, w) diff --git a/tests/faces/fixtures/duplication_detector.py b/tests/faces/fixtures/duplication_detector.py index 46a6f39a..af547724 100644 --- a/tests/faces/fixtures/duplication_detector.py +++ b/tests/faces/fixtures/duplication_detector.py @@ -1,6 +1,7 @@ from io import BytesIO -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, mock_open, patch +import cv2 import numpy as np import pytest from PIL import Image @@ -8,66 +9,55 @@ from hope_dedup_engine.apps.core.storage import CV2DNNStorage, HDEAzureStorage, HOPEAzureStorage from hope_dedup_engine.apps.faces.utils.duplication_detector import DuplicationDetector -from ..faces_const import FILENAME +from ..faces_const import ( + BLOB_SHAPE, + DEPLOY_PROTO_CONTENT, + FACE_DETECTIONS, + FACE_REGIONS_VALID, + FILENAME, + IMAGE_SIZE, + RESIZED_IMAGE_SIZE, +) -@pytest.fixture(scope="module", autouse=True) -def dd(mock_hope_azure_storage, mock_cv2dnn_storage, mock_hde_azure_storage): +@pytest.fixture +def dd(mock_hope_azure_storage, mock_cv2dnn_storage, mock_hde_azure_storage, mock_prototxt_file, db): with ( patch("hope_dedup_engine.apps.faces.utils.duplication_detector.CV2DNNStorage", mock_cv2dnn_storage), patch("hope_dedup_engine.apps.faces.utils.duplication_detector.HOPEAzureStorage", mock_hope_azure_storage), patch("hope_dedup_engine.apps.faces.utils.duplication_detector.HDEAzureStorage", mock_hde_azure_storage), + patch("builtins.open", mock_prototxt_file), ): - mock_cv2dnn_storage.exists.return_value = False - detector = DuplicationDetector(FILENAME) - mock_logger = MagicMock() - detector.logger = mock_logger - return detector + return DuplicationDetector(FILENAME) + + +@pytest.fixture +def mock_prototxt_file(): + return mock_open(read_data=DEPLOY_PROTO_CONTENT) -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture def mock_cv2dnn_storage(): - storage = MagicMock(spec=CV2DNNStorage) - storage.exists.return_value = True - storage.path.side_effect = lambda filename: FILENAME - return storage + return MagicMock(spec=CV2DNNStorage) -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture def mock_hde_azure_storage(): - storage = MagicMock(spec=HDEAzureStorage) - storage.exists.return_value = True - # storage.listdir.return_value = (None, FILENAMES) - storage.open.return_value.__enter__.return_value.read.return_value = b"binary image data" - return storage + return MagicMock(spec=HDEAzureStorage) -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture def mock_hope_azure_storage(): - storage = MagicMock(spec=HOPEAzureStorage) - storage.exists.return_value = True - storage.open.return_value.__enter__.return_value.read.return_value = b"binary image data" - return storage + return MagicMock(spec=HOPEAzureStorage) @pytest.fixture def image_bytes_io(dd): - # Create an image and save it to a BytesIO object - image = Image.new("RGB", (100, 100), color="red") img_byte_arr = BytesIO() + image = Image.new("RGB", (100, 100), color="red") image.save(img_byte_arr, format="JPEG") img_byte_arr.seek(0) - - def fake_open(file, mode="rb", *args, **kwargs): - if "rb" in mode and file == dd.filename: - # Return a new BytesIO object with image data each time to avoid file closure - return BytesIO(img_byte_arr.getvalue()) - else: - # Return a MagicMock for other cases to simulate other file behaviors - return MagicMock() - - img_byte_arr.fake_open = fake_open - + img_byte_arr.fake_open = lambda *_: BytesIO(img_byte_arr.getvalue()) return img_byte_arr @@ 
-80,21 +70,11 @@ def mock_open_context_manager(image_bytes_io): @pytest.fixture def mock_net(): - mock_net = MagicMock() # Mocking the neural network object - mock_detections = np.array( - [ - [ - [ - [0, 0, 0.95, 0.1, 0.1, 0.2, 0.2], # with confidence 0.95 - [0, 0, 0.15, 0.1, 0.1, 0.2, 0.2], # with confidence 0.15 - ] - ] - ], - dtype=np.float32, - ) # Mocking the detections array - expected_regions = [(10, 10, 20, 20)] # Mocking the expected regions + mock_net = MagicMock(spec=cv2.dnn_Net) # Mocking the neural network object + mock_detections = np.array([[FACE_DETECTIONS]], dtype=np.float32) # Mocking the detections array + mock_expected_regions = FACE_REGIONS_VALID mock_net.forward.return_value = mock_detections # Setting up the forward method of the mock network - mock_imdecode = MagicMock(return_value=np.ones((100, 100, 3), dtype=np.uint8)) - mock_resize = MagicMock(return_value=np.ones((300, 300, 3), dtype=np.uint8)) - mock_blob = np.zeros((1, 3, 300, 300)) - return mock_net, mock_imdecode, mock_resize, mock_blob, expected_regions + mock_imdecode = MagicMock(return_value=np.ones(IMAGE_SIZE, dtype=np.uint8)) + mock_resize = MagicMock(return_value=np.ones(RESIZED_IMAGE_SIZE, dtype=np.uint8)) + mock_blob = np.zeros(BLOB_SHAPE) + return mock_net, mock_imdecode, mock_resize, mock_blob, mock_expected_regions diff --git a/tests/faces/test_celery_tasks.py b/tests/faces/test_celery_tasks.py index dc986b64..a468ef11 100644 --- a/tests/faces/test_celery_tasks.py +++ b/tests/faces/test_celery_tasks.py @@ -48,5 +48,6 @@ def test_deduplicate_task_exception_handling( assert task.traceback is not None mock_find.assert_called_once() + # Check that the Redis lock was acquired and then released mock_set.assert_called_once_with(f"Deduplicate_{FILENAME}", "true", nx=True, ex=3600) mock_delete.assert_called_once_with(f"Deduplicate_{FILENAME}") # Lock is released diff --git a/tests/faces/test_duplication_detector.py b/tests/faces/test_duplication_detector.py index d63d6c7a..25b92b1f 100644 --- a/tests/faces/test_duplication_detector.py +++ b/tests/faces/test_duplication_detector.py @@ -6,43 +6,72 @@ import cv2 import numpy as np import pytest -from faces_const import FILENAME, FILENAMES +from constance import config +from faces_const import DEPLOY_PROTO_SHAPE, FACE_REGIONS_INVALID, FILENAME, FILENAMES from hope_dedup_engine.apps.faces.utils.duplication_detector import DuplicationDetector def test_duplication_detector_initialization(dd): assert isinstance(dd.net, cv2.dnn_Net) - assert isinstance(dd.logger, MagicMock) - assert dd.confidence == settings.FACE_DETECTION_CONFIDENCE - assert dd.threshold == settings.DISTANCE_THRESHOLD + assert dd.face_detection_confidence == config.FACE_DETECTION_CONFIDENCE + assert dd.distance_threshold == config.FACE_DISTANCE_THRESHOLD assert dd.filename == FILENAME assert dd.encodings_filename == f"{FILENAME}.npy" + assert dd.scale_factor == config.BLOB_FROM_IMAGE_SCALE_FACTOR + assert dd.mean_values == tuple(map(float, config.BLOB_FROM_IMAGE_MEAN_VALUES.split(", "))) + assert dd.face_encodings_model == config.FACE_ENCODINGS_MODEL + assert dd.face_encodings_num_jitters == config.FACE_ENCODINGS_NUM_JITTERS + assert dd.nms_threshold == config.NMS_THRESHOLD + assert dd.shape == DEPLOY_PROTO_SHAPE + + +def test_get_shape(dd, mock_prototxt_file): + with patch("builtins.open", mock_prototxt_file): + shape = dd._get_shape() + assert shape == DEPLOY_PROTO_SHAPE + + +def test_set_net(dd, mock_cv2dnn_storage, mock_net): + mock_net_instance, *_ = mock_net + with 
patch("cv2.dnn.readNetFromCaffe", return_value=mock_net_instance) as mock_read_net: + net = dd._set_net(mock_cv2dnn_storage) + mock_read_net.assert_called_once_with( + mock_cv2dnn_storage.path(settings.PROTOTXT_FILE), + mock_cv2dnn_storage.path(settings.CAFFEMODEL_FILE), + ) + + assert net == mock_net_instance + mock_net_instance.setPreferableBackend.assert_called_once_with(int(config.DNN_BACKEND)) + mock_net_instance.setPreferableTarget.assert_called_once_with(int(config.DNN_TARGET)) + for storage_name, storage in dd.storages.items(): assert isinstance(storage, MagicMock) if storage_name == "cv2dnn": storage.exists.assert_any_call(settings.PROTOTXT_FILE) storage.exists.assert_any_call(settings.CAFFEMODEL_FILE) - storage.path.assert_any_call(settings.CAFFEMODEL_FILE) + storage.path.assert_any_call(settings.PROTOTXT_FILE) storage.path.assert_any_call(settings.CAFFEMODEL_FILE) -def test_missing_files_in_storage(dd, mock_cv2dnn_storage): +@pytest.mark.parametrize("missing_file", [settings.PROTOTXT_FILE, settings.CAFFEMODEL_FILE]) +def test_initialization_missing_files_in_cv2dnn_storage(mock_cv2dnn_storage, missing_file): with patch( - "hope_dedup_engine.apps.faces.utils.duplication_detector.CV2DNNStorage", new=lambda _: mock_cv2dnn_storage + "hope_dedup_engine.apps.faces.utils.duplication_detector.CV2DNNStorage", return_value=mock_cv2dnn_storage ): - mock_cv2dnn_storage.exists.return_value = False + mock_cv2dnn_storage.exists.side_effect = lambda filename: filename != missing_file with pytest.raises(FileNotFoundError): DuplicationDetector(FILENAME) + mock_cv2dnn_storage.exists.assert_any_call(missing_file) def test_has_encodings_false(dd): - dd.storages["encoded"].exists = MagicMock(return_value=False) + dd.storages["encoded"].exists.return_value = False assert not dd.has_encodings def test_has_encodings_true(dd): - dd.storages["encoded"].exists = MagicMock(return_value=True) + dd.storages["encoded"].exists.return_value = True assert dd.has_encodings @@ -51,37 +80,36 @@ def test_get_face_detections_dnn_no_detections(dd, mock_open_context_manager): patch.object(dd.storages["images"], "open", return_value=mock_open_context_manager), patch.object(dd, "_get_face_detections_dnn", return_value=[]), ): - face_regions = dd._get_face_detections_dnn() - assert len(face_regions) == 0 # Assuming no faces are detected + assert len(face_regions) == 0 def test_get_face_detections_dnn_with_detections(dd, mock_net, mock_open_context_manager): - net, imdecode, resize, blob, expected_regions = mock_net + net, imdecode, resize, _, expected_regions = mock_net with ( patch.object(dd.storages["images"], "open", return_value=mock_open_context_manager), patch("cv2.imdecode", imdecode), patch("cv2.resize", resize), + patch.object(dd, "net", net), ): - - dd.net.setInput(blob) - dd.net = net face_regions = dd._get_face_detections_dnn() assert face_regions == expected_regions - assert len(face_regions) == 1 # Assuming one face is detected - assert isinstance(face_regions[0], tuple) # Each detected face region should be a tuple - assert len(face_regions[0]) == 4 # Each tuple should have four elements (coordinates of the bounding box) + for region in face_regions: + assert isinstance(region, tuple) + assert len(region) == 4 def test_get_face_detections_dnn_exception_handling(dd): - with patch("builtins.open", side_effect=Exception("Test exception")): - try: + with ( + patch.object(dd.storages["images"], "open", side_effect=Exception("Test exception")) as mock_storage_open, + patch.object(dd.logger, "exception") as 
mock_logger_exception, + ): + with pytest.raises(Exception, match="Test exception"): dd._get_face_detections_dnn() - except Exception: - ... - dd.logger.exception.assert_called_once() - dd.logger.reset_mock() + + mock_storage_open.assert_called_once_with(dd.filename, "rb") + mock_logger_exception.assert_called_once() def test_load_encodings_all_no_files(dd): @@ -91,77 +119,94 @@ def test_load_encodings_all_no_files(dd): def test_load_encodings_all_with_files(dd): - mock_encoded_data = {f"{filename}.npy": [np.array([1, 2, 3]), np.array([4, 5, 6])] for filename in FILENAMES} + mock_encoded_data = {f"{filename}.npy": np.array([1, 2, 3]) for filename in FILENAMES} encoded_data = {os.path.splitext(key)[0]: value for key, value in mock_encoded_data.items()} - print(f"\n{mock_encoded_data=}\n{encoded_data=}") - # Mock the storage's listdir method to return the file names with patch.object( - dd.storages["encoded"], - "listdir", - return_value=(None, [f"{filename}.npy" for filename in FILENAMES]), + dd.storages["encoded"], "listdir", return_value=(None, [f"{filename}.npy" for filename in FILENAMES]) + ): + with patch("builtins.open", mock_open()) as mocked_open: + for filename, data in mock_encoded_data.items(): + mocked_file = mock_open(read_data=data.tobytes()).return_value + mocked_open.side_effect = lambda f, mode="rb", mocked_file=mocked_file, filename=filename: ( + mocked_file if f.endswith(filename) else MagicMock() + ) + with patch("numpy.load", return_value=data): + result = dd._load_encodings_all() + + for key, value in encoded_data.items(): + assert np.array_equal(result[key], value) + + +def test_load_encodings_all_exception_handling_listdir(dd): + with ( + patch.object(dd.storages["encoded"], "listdir", side_effect=Exception("Test exception")) as mock_listdir, + patch.object(dd.logger, "exception") as mock_logger_exception, ): - print(f"{dd.storages['encoded'].listdir()[1]=}") - # Mock the storage's open method to return the data for each file - with patch( - "builtins.open", - side_effect=lambda f: mock_open(read_data=np.save(mock_encoded_data[f])).return_value, - ): + with pytest.raises(Exception, match="Test exception"): dd._load_encodings_all() - # Assert that the returned encodings match the expected data - # TODO: Fix - # assert all(np.array_equal(encodings[key], value) for key, value in encoded_data.items()) + mock_listdir.assert_called_once_with("") -def test_load_encodings_all_exception_handling(dd): - with patch("builtins.open", side_effect=Exception("Test exception")): - try: + mock_logger_exception.assert_called_once() + + +def test_load_encodings_all_exception_handling_open(dd): + with ( + patch.object(dd.storages["encoded"], "listdir", return_value=(None, [f"{FILENAME}.npy"])) as mock_listdir, + patch.object(dd.storages["encoded"], "open", side_effect=Exception("Test exception")) as mock_open, + patch.object(dd.logger, "exception") as mock_logger_exception, + ): + with pytest.raises(Exception, match="Test exception"): dd._load_encodings_all() - except Exception: - ... 
- dd.logger.reset_mock() + mock_listdir.assert_called_once_with("") + mock_open.assert_called_once_with(f"{FILENAME}.npy", "rb") -def test_encode_face_successful(dd, image_bytes_io): + mock_logger_exception.assert_called_once() + + +def test_encode_face_successful(dd, image_bytes_io, mock_net): + mock_net, *_ = mock_net with ( - patch("builtins.open", new_callable=lambda: image_bytes_io.fake_open), patch.object(dd.storages["images"], "open", side_effect=image_bytes_io.fake_open) as mocked_image_open, + patch.object(dd, "net", mock_net), ): dd._encode_face() - # Checks that the file was opened correctly and in binary read mode - print(f"{mocked_image_open.assert_called_with(dd.filename, 'rb')=}") - assert mocked_image_open.called, "The open function should be called" + mocked_image_open.assert_called_with(dd.filename, "rb") + assert mocked_image_open.side_effect == image_bytes_io.fake_open + assert mocked_image_open.called -def test_encode_face_invalid_region(dd, image_bytes_io): - # Mock _get_face_detections_dnn to return an invalid region +@pytest.mark.parametrize("face_regions", FACE_REGIONS_INVALID) +def test_encode_face_error(dd, image_bytes_io, face_regions): with ( - patch("builtins.open", new_callable=lambda: image_bytes_io.fake_open), - patch.object(dd.storages["images"], "open", side_effect=image_bytes_io.fake_open), - patch.object(dd, "_get_face_detections_dnn", return_value=[(0, 0, 10)]), + patch.object(dd.storages["images"], "open", side_effect=image_bytes_io.fake_open) as mock_storage_open, + patch.object(dd, "_get_face_detections_dnn", return_value=face_regions) as mock_get_face_detections_dnn, patch.object(dd.logger, "error") as mock_error_logger, ): - - # Invoke the _encode_face method, expecting an error log due to an invalid region dd._encode_face() - # Check that the error was logged with the correct message - mock_error_logger.assert_called_once_with(f"Invalid face region {(0, 0, 10)}") - dd.logger.reset_mock() + mock_storage_open.assert_called_with(dd.filename, "rb") + mock_get_face_detections_dnn.assert_called_once() + + mock_error_logger.assert_called_once() def test_encode_face_exception_handling(dd): - with patch("builtins.open", side_effect=Exception("Test exception")): - try: + with ( + patch.object(dd.storages["images"], "open", side_effect=Exception("Test exception")) as mock_storage_open, + patch.object(dd.logger, "exception") as mock_logger_exception, + ): + with pytest.raises(Exception, match="Test exception"): dd._encode_face() - except Exception: - ... 
- dd.logger.exception.assert_called_once() - dd.logger.reset_mock() + mock_storage_open.assert_called_with(dd.filename, "rb") + mock_logger_exception.assert_called_once() -def test_find_duplicates_successful(dd, mock_hde_azure_storage): + +def test_find_duplicates_successful_when_encoded(dd, mock_hde_azure_storage): # Generate mock return values dynamically based on FILENAMES mock_encodings = {filename: [np.array([0.1, 0.2, 0.3 + i * 0.001])] for i, filename in enumerate(FILENAMES)} @@ -185,23 +230,21 @@ def test_find_duplicates_successful(dd, mock_hde_azure_storage): def test_find_duplicates_calls_encode_face_when_no_encodings(dd): - # Prepare a mock for the 'exists' method used in the 'has_encodings' property with ( - patch.object(dd.storages["encoded"], "exists", return_value=False), patch.object(dd, "_encode_face") as mock_encode_face, + patch.object(dd, "_load_encodings_all", return_value={FILENAME: [MagicMock()]}), ): - + dd.storages["encoded"].exists.return_value = False dd.find_duplicates() - mock_encode_face.assert_called_once() - dd.logger.reset_mock() def test_find_duplicates_exception_handling(dd): - with patch.object(dd, "_load_encodings_all", side_effect=Exception("Test exception")): - try: + with ( + patch.object(dd, "_load_encodings_all", side_effect=Exception("Test exception")), + patch.object(dd.logger, "exception") as mock_logger_exception, + ): + with pytest.raises(Exception, match="Test exception"): dd.find_duplicates() - except Exception: - ... - dd.logger.exception.assert_called_once() - dd.logger.reset_mock() + + mock_logger_exception.assert_called_once() diff --git a/tests/faces/test_validators.py b/tests/faces/test_validators.py new file mode 100644 index 00000000..79b3e0df --- /dev/null +++ b/tests/faces/test_validators.py @@ -0,0 +1,41 @@ +from django.forms import ValidationError + +import pytest + +from hope_dedup_engine.apps.faces.validators import MeanValuesTupleField + + +def test_to_python_valid_tuple(): + field = MeanValuesTupleField() + assert field.to_python("104.0, 177.0, 123.0") == (104.0, 177.0, 123.0) + + +def test_to_python_invalid_length(): + field = MeanValuesTupleField() + with pytest.raises(ValidationError) as exc_info: + field.to_python("104.0, 177.0") + assert "Enter a valid tuple of three float values separated by commas and spaces" in str(exc_info.value) + + +def test_to_python_value_out_of_range(): + field = MeanValuesTupleField() + with pytest.raises(ValidationError) as exc_info: + field.to_python("104.0, 177.0, 256.0") + assert "Each value must be between -255 and 255." in str(exc_info.value) + + +def test_to_python_non_numeric_value(): + field = MeanValuesTupleField() + with pytest.raises(ValidationError) as exc_info: + field.to_python("104.0, abc, 123.0") + assert "Enter a valid tuple of three float values separated by commas and spaces" in str(exc_info.value) + + +def test_prepare_value_with_tuple(): + field = MeanValuesTupleField() + assert field.prepare_value((104.0, 177.0, 123.0)) == "104.0, 177.0, 123.0" + + +def test_prepare_value_with_string(): + field = MeanValuesTupleField() + assert field.prepare_value("104.0, 177.0, 123.0") == "104.0, 177.0, 123.0" From af9234c2ec9c0231f42bffe6d820dd6a1c8b17e4 Mon Sep 17 00:00:00 2001 From: vitali-yanushchyk-valor <168179384+vitali-yanushchyk-valor@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:34:28 -0400 Subject: [PATCH 7/7] Feature/fr batch image processing (#29) * add ! processing case if face regions for image are not detected * chg ! 
move face_recognition settings to constance * chg ! tests * chg ! get some parametres for fr from proto file * chg ! refactor DuplicationDetector, add NMS * chg ! optimize find duplicates, tests * chg ! tests * add ! batch processing * chg ! tests --- .../apps/faces/celery_tasks.py | 15 +- .../apps/faces/utils/celery_utils.py | 19 +- .../apps/faces/utils/duplication_detector.py | 184 ++++++++++------ .../apps/faces/validators.py | 2 +- tests/faces/faces_const.py | 16 +- tests/faces/fixtures/celery_tasks.py | 16 +- tests/faces/fixtures/duplication_detector.py | 5 +- tests/faces/test_celery_tasks.py | 92 ++++---- tests/faces/test_duplication_detector.py | 202 ++++++++++++------ 9 files changed, 369 insertions(+), 182 deletions(-) diff --git a/src/hope_dedup_engine/apps/faces/celery_tasks.py b/src/hope_dedup_engine/apps/faces/celery_tasks.py index aaf28981..2c156cfb 100644 --- a/src/hope_dedup_engine/apps/faces/celery_tasks.py +++ b/src/hope_dedup_engine/apps/faces/celery_tasks.py @@ -9,9 +9,20 @@ @shared_task(bind=True, soft_time_limit=0.5 * 60 * 60, time_limit=1 * 60 * 60) @task_lifecycle(name="Deduplicate", ttl=1 * 60 * 60) # TODO: Use DeduplicationSet objects as input to deduplication pipeline -def deduplicate(self, filename: str): +def deduplicate(self, filenames: tuple[str], ignore_pairs: tuple[tuple[str, str]] = tuple()) -> tuple[tuple[str]]: + """ + Deduplicate a set of filenames, ignoring any specified pairs of filenames. + + Args: + filenames (tuple[str]): A tuple of filenames to process. + ignore_pairs (tuple[tuple[str, str]]): A tuple of tuples, where each inner tuple contains + a pair of filenames to be ignored in the duplication check. + + Returns: + tuple[tuple[str]]: A tuple of tuples, where each inner tuple represents a group of duplicates. 
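+
+        Example (illustrative filenames only):
+            deduplicate.delay(("a.jpg", "b.jpg", "c.jpg"), (("a.jpg", "c.jpg"),))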
+    """
     try:
-        dd = DuplicationDetector(filename)
+        dd = DuplicationDetector(filenames, ignore_pairs)
         return dd.find_duplicates()
     except Exception as e:
         self.update_state(state=states.FAILURE, meta={"exc_message": str(e), "traceback": traceback.format_exc()})
diff --git a/src/hope_dedup_engine/apps/faces/utils/celery_utils.py b/src/hope_dedup_engine/apps/faces/utils/celery_utils.py
index 27ee145d..eec34e9a 100644
--- a/src/hope_dedup_engine/apps/faces/utils/celery_utils.py
+++ b/src/hope_dedup_engine/apps/faces/utils/celery_utils.py
@@ -1,3 +1,4 @@
+import hashlib
 import logging
 from functools import wraps
+from typing import Any, Callable

@@ -8,16 +9,17 @@
 redis_client = redis.Redis.from_url(settings.CELERY_BROKER_URL)


-def task_lifecycle(name: str, ttl: int):
-    def decorator(func):
+def task_lifecycle(name: str, ttl: int) -> Callable:
+    def decorator(func) -> Callable:
         @wraps(func)
-        def wrapper(self, *args, **kwargs):
+        def wrapper(self, *args, **kwargs) -> Any:
             logger = logging.getLogger(func.__module__)
             logger.info(f"{name} task started")
             result = None
-            filename: str = args[0] if args else kwargs.get("filename")
-            lock_name: str = f"{name}_{filename}"
+            filenames = args[0] if args else kwargs.get("filenames")
+            ignore_pairs = args[1] if len(args) > 1 else kwargs.get("ignore_pairs", tuple())
+            lock_name: str = f"{name}_{_get_hash(filenames, ignore_pairs)}"
             if not _acquire_lock(lock_name, ttl):
                 logger.info(f"Task {name} with broker lock {lock_name} is already running.")
                 return None
@@ -43,3 +45,10 @@ def _acquire_lock(lock_name: str, ttl: int = 1 * 60 * 60) -> bool:
 def _release_lock(lock_name: str) -> None:
     redis_client.delete(lock_name)
+
+
+def _get_hash(filenames: tuple[str], ignore_pairs: tuple[tuple[str, str]]) -> str:
+    fn_str: str = ",".join(sorted(filenames))
+    ip_sorted = sorted((min(item1, item2), max(item1, item2)) for item1, item2 in ignore_pairs)
+    ip_str = ",".join(f"{item1},{item2}" for item1, item2 in ip_sorted)
+    return hashlib.sha256(f"{fn_str}{ip_str}".encode()).hexdigest()
diff --git a/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py b/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py
index b1ec8fd1..c0683943 100644
--- a/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py
+++ b/src/hope_dedup_engine/apps/faces/utils/duplication_detector.py
@@ -1,6 +1,8 @@
 import logging
 import os
 import re
+from collections import defaultdict
+from dataclasses import dataclass

 from django.conf import settings

@@ -17,14 +19,27 @@ class DuplicationDetector:
     A class to detect and process duplicate faces in images.
     """

-    def __init__(self, filename: str) -> None:
+    @dataclass(frozen=True, slots=True)
+    class BlobFromImageConfig:
+        shape: dict[str, int]
+        scale_factor: float
+        mean_values: tuple[float, float, float]
+
+    @dataclass(frozen=True, slots=True)
+    class FaceEncodingsConfig:
+        num_jitters: int
+        model: str
+
+    logger: logging.Logger = logging.getLogger(__name__)
+
+    def __init__(self, filenames: tuple[str], ignore_pairs: tuple[tuple[str, str]] = tuple()) -> None:
         """
-        Initialize the DuplicationDetector with the given filename.
+        Initialize the DuplicationDetector with the given filenames.

         Args:
-            filename (str): The filename of the image to process.
+            filenames (tuple[str]): The filenames of the images to process.
+            ignore_pairs (tuple[tuple[str, str]]): The pairs of filenames to ignore.
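+
+        Example (illustrative filenames only):
+            DuplicationDetector(("a.jpg", "b.jpg", "c.jpg"), ignore_pairs=(("a.jpg", "b.jpg"),))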
""" - self.logger: logging.Logger = logging.getLogger(__name__) self.storages: dict[str, CV2DNNStorage | HDEAzureStorage | HOPEAzureStorage] = { "images": HOPEAzureStorage(), "cv2dnn": CV2DNNStorage(settings.CV2DNN_PATH), @@ -35,23 +50,28 @@ def __init__(self, filename: str) -> None: if not self.storages.get("cv2dnn").exists(file): raise FileNotFoundError(f"File {file} does not exist in storage.") - self.shape: dict[str, int] = self._get_shape() self.net: cv2.dnn_Net = self._set_net(self.storages.get("cv2dnn")) - self.filename: str = filename - self.encodings_filename: str = f"{self.filename}.npy" - self.scale_factor: float = config.BLOB_FROM_IMAGE_SCALE_FACTOR - self.mean_values: tuple[float, float, float] = tuple(map(float, config.BLOB_FROM_IMAGE_MEAN_VALUES.split(", "))) - # self.mean_values: config.BLOB_FROM_IMAGE_MEAN_VALUES + self.filenames: tuple[str] = filenames + self.ignore_set: set[tuple[str, str]] = self._get_pairs_to_ignore(ignore_pairs) + + self.blob_from_image_cfg = self.BlobFromImageConfig( + shape=self._get_shape(), + scale_factor=config.BLOB_FROM_IMAGE_SCALE_FACTOR, + mean_values=( + tuple(map(float, config.BLOB_FROM_IMAGE_MEAN_VALUES.split(", "))) + if isinstance(config.BLOB_FROM_IMAGE_MEAN_VALUES, str) + else config.BLOB_FROM_IMAGE_MEAN_VALUES + ), + ) self.face_detection_confidence: float = config.FACE_DETECTION_CONFIDENCE - self.face_encodings_model: str = config.FACE_ENCODINGS_MODEL - self.face_encodings_num_jitters: int = config.FACE_ENCODINGS_NUM_JITTERS self.distance_threshold: float = config.FACE_DISTANCE_THRESHOLD - self.nms_threshold: float = config.NMS_THRESHOLD + self.face_encodings_cfg = self.FaceEncodingsConfig( + num_jitters=config.FACE_ENCODINGS_NUM_JITTERS, + model=config.FACE_ENCODINGS_MODEL, + ) - @property - def has_encodings(self) -> bool: - return self.storages["encoded"].exists(self.encodings_filename) + self.nms_threshold: float = config.NMS_THRESHOLD def _set_net(self, storage: CV2DNNStorage) -> cv2.dnn_Net: net = cv2.dnn.readNetFromCaffe( @@ -75,20 +95,44 @@ def _get_shape(self) -> dict[str, int]: else: raise ValueError("Could not find input_shape in prototxt file.") - def _get_face_detections_dnn(self) -> list[tuple[int, int, int, int]]: + def _get_pairs_to_ignore(self, ignore: tuple[tuple[str, str]]) -> set[tuple[str, str]]: + ignore = tuple(tuple(pair) for pair in ignore) + if not ignore: + return set() + if all( + isinstance(pair, tuple) and len(pair) == 2 and all(isinstance(item, str) and item for item in pair) + for pair in ignore + ): + return {(item1, item2) for item1, item2 in ignore} | {(item2, item1) for item1, item2 in ignore} + elif len(ignore) == 2 and all(isinstance(item, str) for item in ignore): + return {(ignore[0], ignore[1]), (ignore[1], ignore[0])} + else: + raise ValueError( + "Invalid format for 'ignore'. Expected tuple of tuples each containing exactly two strings." 
+            )
+
+    def _encodings_filename(self, filename: str) -> str:
+        return f"{filename}.npy"
+
+    def _has_encodings(self, filename: str) -> bool:
+        return self.storages["encoded"].exists(self._encodings_filename(filename))
+
+    def _get_face_detections_dnn(self, filename: str) -> list[tuple[int, int, int, int]]:
         face_regions: list[tuple[int, int, int, int]] = []
         try:
-            with self.storages["images"].open(self.filename, "rb") as img_file:
+            with self.storages["images"].open(filename, "rb") as img_file:
                 img_array = np.frombuffer(img_file.read(), dtype=np.uint8)
                 # Decode image from binary buffer to a 3D numpy array (height, width, channels of the BGR color space)
                 image = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
             (h, w) = image.shape[:2]
             # Create a blob (4D tensor) from the image
             blob = cv2.dnn.blobFromImage(
-                image=cv2.resize(image, dsize=(self.shape["height"], self.shape["width"])),
-                size=(self.shape["height"], self.shape["width"]),
-                scalefactor=self.scale_factor,
-                mean=self.mean_values,
+                image=cv2.resize(
+                    image, dsize=(self.blob_from_image_cfg.shape["height"], self.blob_from_image_cfg.shape["width"])
+                ),
+                size=(self.blob_from_image_cfg.shape["height"], self.blob_from_image_cfg.shape["width"]),
+                scalefactor=self.blob_from_image_cfg.scale_factor,
+                mean=self.blob_from_image_cfg.mean_values,
             )
             self.net.setInput(blob)
             # Forward pass to get output with shape (1, 1, N, 7),
@@ -111,7 +155,7 @@ def _get_face_detections_dnn(self) -> list[tuple[int, int, int, int]]:
                 for i in indices:
                     face_regions.append(tuple(boxes[i]))
         except Exception as e:
-            self.logger.exception("Error processing face detection for image %s", self.filename)
+            self.logger.exception("Error processing face detection for image %s", filename)
             raise e
         return face_regions

@@ -120,74 +164,92 @@ def _load_encodings_all(self) -> dict[str, list[np.ndarray]]:
         try:
             _, files = self.storages["encoded"].listdir("")
             for file in files:
-                if file.endswith(".npy"):
+                if self._has_encodings(filename := os.path.splitext(file)[0]):
                     with self.storages["encoded"].open(file, "rb") as f:
-                        data[os.path.splitext(file)[0]] = np.load(f, allow_pickle=False)
+                        data[filename] = np.load(f, allow_pickle=False)
         except Exception as e:
             self.logger.exception("Error loading encodings.")
             raise e
         return data

-    def _encode_face(self) -> None:
+    def _encode_face(self, filename: str) -> None:
         try:
-            with self.storages["images"].open(self.filename, "rb") as img_file:
+            with self.storages["images"].open(filename, "rb") as img_file:
                 image = face_recognition.load_image_file(img_file)
             encodings: list = []
-            face_regions = self._get_face_detections_dnn()
+            face_regions = self._get_face_detections_dnn(filename)
             if not face_regions:
-                self.logger.error("No face regions detected in image %s", self.filename)
+                self.logger.error("No face regions detected in image %s", filename)
             else:
                 for region in face_regions:
                     if isinstance(region, (list, tuple)) and len(region) == 4:
                         top, right, bottom, left = region
-                        # Compute the face encodings for the face regions in the image
                         face_encodings = face_recognition.face_encodings(
                             image,
                             [(top, right, bottom, left)],
-                            num_jitters=self.face_encodings_num_jitters,
-                            model=self.face_encodings_model,
+                            num_jitters=self.face_encodings_cfg.num_jitters,
+                            model=self.face_encodings_cfg.model,
                         )
                         encodings.extend(face_encodings)
                     else:
-                        self.logger.error("Invalid face region.")
-                with self.storages["encoded"].open(self.encodings_filename, "wb") as f:
+                        self.logger.error("Invalid face region %s", region)
+                with 
self.storages["encoded"].open(self._encodings_filename(filename), "wb") as f: np.save(f, encodings) except Exception as e: - self.logger.exception("Error processing face encodings for image %s", self.filename) + self.logger.exception("Error processing face encodings for image %s", filename) raise e - def find_duplicates(self) -> tuple[str]: + def _get_duplicated_groups(self, checked: set[tuple[str, str, float]]) -> tuple[tuple[str]]: + # Dictionary to store connections between paths where distances are less than the threshold + groups = [] + connections = defaultdict(set) + for path1, path2, dist in checked: + if dist < self.distance_threshold: + connections[path1].add(path2) + connections[path2].add(path1) + # Iterate over each path and form groups + for path, neighbors in connections.items(): + # Check if the path has already been included in any group + if not any(path in group for group in groups): + new_group = {path} + queue = list(neighbors) + # Try to expand the group ensuring each new path is duplicated to all in the group + while queue: + neighbor = queue.pop(0) + if neighbor not in new_group and all(neighbor in connections[member] for member in new_group): + new_group.add(neighbor) + # Add neighbors of the current neighbor, excluding those already in the group + queue.extend([n for n in connections[neighbor] if n not in new_group]) + # Add the newly formed group to the list of groups + groups.append(new_group) + return tuple(map(tuple, groups)) + + def find_duplicates(self) -> tuple[tuple[str]]: """ Find and return a list of duplicate images based on face encodings. Returns: - tuple[str]: A tuple of filenames of duplicate images. + tuple[tuple[str]]: A tuple of filenames of duplicate images. """ - duplicated_images: set[str] = set() - path1 = self.filename try: - if not self.has_encodings: - self._encode_face() + for filename in self.filenames: + if not self._has_encodings(filename): + self._encode_face(filename) encodings_all = self._load_encodings_all() - encodings1 = encodings_all[path1] - - checked_pairs = set() - for path2, encodings2 in encodings_all.items(): - if path1 != path2: - for encoding1 in encodings1: - for encoding2 in encodings2: - if (path1, path2, tuple(encoding1), tuple(encoding2)) in checked_pairs: - continue - - distance = face_recognition.face_distance([encoding1], encoding2) - if distance < self.distance_threshold: - duplicated_images.update([path1, path2]) - break - - checked_pairs.add((path1, path2, tuple(encoding1), tuple(encoding2))) - if path2 in duplicated_images: - break - return tuple(duplicated_images) + + checked = set() + for path1, encodings1 in encodings_all.items(): + for path2, encodings2 in encodings_all.items(): + if path1 < path2 and (path1, path2) not in self.ignore_set: + min_distance = float("inf") + for encoding1 in encodings1: + if ( + current_min := min(face_recognition.face_distance(encodings2, encoding1)) + ) < min_distance: + min_distance = current_min + checked.add((path1, path2, min_distance)) + + return self._get_duplicated_groups(checked) except Exception as e: - self.logger.exception("Error finding duplicates for image %s", path1) + self.logger.exception("Error finding duplicates for images %s", self.filenames) raise e diff --git a/src/hope_dedup_engine/apps/faces/validators.py b/src/hope_dedup_engine/apps/faces/validators.py index 893275ba..1b8288f4 100644 --- a/src/hope_dedup_engine/apps/faces/validators.py +++ b/src/hope_dedup_engine/apps/faces/validators.py @@ -21,4 +21,4 @@ def to_python(self, value): def 
prepare_value(self, value): if isinstance(value, tuple): return ", ".join(map(str, value)) - return value + return super().prepare_value(value) diff --git a/tests/faces/faces_const.py b/tests/faces/faces_const.py index 7a506a4e..64b2c543 100644 --- a/tests/faces/faces_const.py +++ b/tests/faces/faces_const.py @@ -1,7 +1,21 @@ from typing import Final FILENAME: Final[str] = "test_file.jpg" -FILENAMES: Final[list[str]] = ["test_file.jpg", "test_file2.jpg"] +FILENAME_ENCODED_FORMAT: Final[str] = "{}.npy" +FILENAMES: Final[list[str]] = ["test_file.jpg", "test_file2.jpg", "test_file3.jpg"] +IGNORE_PAIRS: Final[list[tuple[str, str]]] = [ + ("ignore_file.jpg", "ignore_file2.jpg"), + ("ignore_file4.jpg", "ignore_file3.jpg"), +] + +CELERY_TASK_NAME: Final[str] = "Deduplicate" +CELERY_TASK_TTL: Final[int] = 1 * 60 * 60 +CELERY_TASK_DELAYS: Final[dict[str, int]] = { + "SoftTimeLimitExceeded": 5 * 60 * 60, + "TimeLimitExceeded": 10 * 60 * 60, + "CustomException": 0, +} + DEPLOY_PROTO_CONTENT: Final[str] = "input_shape { dim: 1 dim: 3 dim: 300 dim: 300 }" DEPLOY_PROTO_SHAPE: Final[dict[str, int]] = {"batch_size": 1, "channels": 3, "height": 300, "width": 300} FACE_REGIONS_INVALID: Final[list[list[tuple[int, int, int, int]]]] = [[], [(0, 0, 10)]] diff --git a/tests/faces/fixtures/celery_tasks.py b/tests/faces/fixtures/celery_tasks.py index 1bcd89c0..7bf0602c 100644 --- a/tests/faces/fixtures/celery_tasks.py +++ b/tests/faces/fixtures/celery_tasks.py @@ -1,13 +1,16 @@ from unittest.mock import patch import pytest +from freezegun import freeze_time -import docker +from docker import from_env + +from ..faces_const import FILENAMES @pytest.fixture(scope="session") def docker_client(): - client = docker.from_env() + client = from_env() yield client client.close() @@ -19,8 +22,15 @@ def mock_redis_client(): @pytest.fixture -def mock_duplication_detector(): +def mock_dd_find(): with patch( "hope_dedup_engine.apps.faces.utils.duplication_detector.DuplicationDetector.find_duplicates" ) as mock_find: + mock_find.return_value = (FILENAMES[:2],) # Assuming the first two are duplicates based on mock data yield mock_find + + +@pytest.fixture +def time_control(): + with freeze_time("2024-01-01") as frozen_time: + yield frozen_time diff --git a/tests/faces/fixtures/duplication_detector.py b/tests/faces/fixtures/duplication_detector.py index af547724..748c498b 100644 --- a/tests/faces/fixtures/duplication_detector.py +++ b/tests/faces/fixtures/duplication_detector.py @@ -14,7 +14,8 @@ DEPLOY_PROTO_CONTENT, FACE_DETECTIONS, FACE_REGIONS_VALID, - FILENAME, + FILENAMES, + IGNORE_PAIRS, IMAGE_SIZE, RESIZED_IMAGE_SIZE, ) @@ -28,7 +29,7 @@ def dd(mock_hope_azure_storage, mock_cv2dnn_storage, mock_hde_azure_storage, moc patch("hope_dedup_engine.apps.faces.utils.duplication_detector.HDEAzureStorage", mock_hde_azure_storage), patch("builtins.open", mock_prototxt_file), ): - return DuplicationDetector(FILENAME) + return DuplicationDetector(FILENAMES, IGNORE_PAIRS) @pytest.fixture diff --git a/tests/faces/test_celery_tasks.py b/tests/faces/test_celery_tasks.py index a468ef11..bf6fe492 100644 --- a/tests/faces/test_celery_tasks.py +++ b/tests/faces/test_celery_tasks.py @@ -1,53 +1,63 @@ +from datetime import timedelta from unittest.mock import patch +import pytest from celery import states -from faces_const import FILENAME, FILENAMES +from celery.exceptions import SoftTimeLimitExceeded, TimeLimitExceeded +from faces_const import CELERY_TASK_DELAYS, CELERY_TASK_NAME, CELERY_TASK_TTL, FILENAMES, IGNORE_PAIRS from 
diff --git a/tests/faces/test_celery_tasks.py b/tests/faces/test_celery_tasks.py
index a468ef11..bf6fe492 100644
--- a/tests/faces/test_celery_tasks.py
+++ b/tests/faces/test_celery_tasks.py
@@ -1,53 +1,63 @@
+from datetime import timedelta
 from unittest.mock import patch
 
+import pytest
 from celery import states
-from faces_const import FILENAME, FILENAMES
+from celery.exceptions import SoftTimeLimitExceeded, TimeLimitExceeded
+from faces_const import CELERY_TASK_DELAYS, CELERY_TASK_NAME, CELERY_TASK_TTL, FILENAMES, IGNORE_PAIRS
 
 from hope_dedup_engine.apps.faces.celery_tasks import deduplicate
+from hope_dedup_engine.apps.faces.utils.celery_utils import _get_hash
 
 
-def test_deduplicate_task_already_running(mock_redis_client, mock_duplication_detector, celery_app, celery_worker):
+@pytest.mark.parametrize("lock_is_acquired", [True, False])
+def test_deduplicate_task_locking(mock_redis_client, mock_dd_find, dd, lock_is_acquired):
     mock_set, mock_delete = mock_redis_client
-
-    mock_set.return_value = False  # Lock is not acquired
-    task = deduplicate.apply(args=[FILENAME])
-
-    assert task.result is None  # Task is not executed
-    mock_duplication_detector.assert_not_called()  # DeduplicationDetector is not called
-    mock_set.assert_called_once_with(f"Deduplicate_{FILENAME}", "true", nx=True, ex=3600)
-    mock_delete.assert_not_called()  # Lock is not released
-
-
-def test_deduplicate_task_success(dd, mock_redis_client, mock_duplication_detector, celery_app, celery_worker):
-    mock_set, mock_delete = mock_redis_client
-    mock_find = mock_duplication_detector
-    mock_set.return_value = True  # Lock is acquired
-    mock_find.return_value = set(FILENAMES[:2])  # Assuming the first two are duplicates based on mock data
+    mock_set.return_value = lock_is_acquired
+    mock_find = mock_dd_find
 
     with patch("hope_dedup_engine.apps.faces.celery_tasks.DuplicationDetector", return_value=dd):
-        task_result = deduplicate.apply(args=[FILENAME]).get()
-
-    assert task_result == set(FILENAMES[:2])  # Assuming the first two are duplicates based on mock data
-    mock_set.assert_called_once_with(f"Deduplicate_{FILENAME}", "true", nx=True, ex=3600)
-    mock_delete.assert_called_once_with(f"Deduplicate_{FILENAME}")  # Lock is released
-
-
-def test_deduplicate_task_exception_handling(
-    dd, mock_redis_client, mock_duplication_detector, celery_app, celery_worker
-):
+        task_result = deduplicate.apply(args=(FILENAMES, IGNORE_PAIRS)).get()
+    hash_value = _get_hash(FILENAMES, IGNORE_PAIRS)
+
+    mock_set.assert_called_once_with(f"{CELERY_TASK_NAME}_{hash_value}", "true", nx=True, ex=CELERY_TASK_TTL)
+    if lock_is_acquired:
+        assert task_result == mock_find.return_value
+        mock_find.assert_called_once()
+        mock_delete.assert_called_once_with(f"{CELERY_TASK_NAME}_{hash_value}")
+    else:
+        assert task_result is None
+        mock_find.assert_not_called()
+        mock_delete.assert_not_called()
+
+
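Note: the locking contract exercised above (SET NX with a TTL, delete on completion or failure) is not shown in this diff. A minimal sketch of the pattern the test implies, with illustrative names, is:

# Hypothetical sketch of the lock handling the tests assert on.
def run_with_lock(redis_client, key, ttl, work):
    # SET with nx=True fails when the key exists, i.e. an identical task is running.
    if not redis_client.set(key, "true", nx=True, ex=ttl):
        return None
    try:
        return work()
    finally:
        redis_client.delete(key)  # release the lock even when work() raises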
+@pytest.mark.parametrize(
+    "delay, exception",
+    [
+        (CELERY_TASK_DELAYS["SoftTimeLimitExceeded"], SoftTimeLimitExceeded()),
+        (CELERY_TASK_DELAYS["TimeLimitExceeded"], TimeLimitExceeded()),
+        (CELERY_TASK_DELAYS["CustomException"], Exception("Simulated custom task failure")),
+    ],
+)
+def test_deduplicate_task_exception_handling(mock_redis_client, mock_dd_find, time_control, dd, delay, exception):
     mock_set, mock_delete = mock_redis_client
-    mock_find = mock_duplication_detector
-    mock_find.side_effect = Exception("Simulated task failure")
-
-    with patch("hope_dedup_engine.apps.faces.celery_tasks.DuplicationDetector", return_value=dd):
-        task = deduplicate.apply(args=[FILENAME])
-
-    assert task.state == states.FAILURE
-    assert isinstance(task.result, Exception)
-    assert str(task.result) == "Simulated task failure"
-    assert task.traceback is not None
-
+    mock_find = mock_dd_find
+    mock_find.side_effect = exception
+
+    time_control.tick(delta=timedelta(seconds=delay))
+
+    with (
+        pytest.raises(type(exception)) as exc_info,
+        patch("hope_dedup_engine.apps.faces.celery_tasks.DuplicationDetector", return_value=dd),
+    ):
+        task = deduplicate.apply(args=(FILENAMES, IGNORE_PAIRS))
+        assert exc_info.value == exception
+        assert isinstance(task.result, type(exception))
+        assert task.state == states.FAILURE
+        assert str(task.result) == str(exception)
+        assert task.traceback is not None
+
+    hash_value = _get_hash(FILENAMES, IGNORE_PAIRS)
+    mock_set.assert_called_once_with(f"{CELERY_TASK_NAME}_{hash_value}", "true", nx=True, ex=CELERY_TASK_TTL)
+    mock_delete.assert_called_once_with(f"{CELERY_TASK_NAME}_{hash_value}")  # Lock is released
     mock_find.assert_called_once()
-    # Check that the Redis lock was acquired and then released
-    mock_set.assert_called_once_with(f"Deduplicate_{FILENAME}", "true", nx=True, ex=3600)
-    mock_delete.assert_called_once_with(f"Deduplicate_{FILENAME}")  # Lock is released
diff --git a/tests/faces/test_duplication_detector.py b/tests/faces/test_duplication_detector.py
index 25b92b1f..d74a818c 100644
--- a/tests/faces/test_duplication_detector.py
+++ b/tests/faces/test_duplication_detector.py
@@ -7,23 +7,29 @@
 import numpy as np
 import pytest
 from constance import config
-from faces_const import DEPLOY_PROTO_SHAPE, FACE_REGIONS_INVALID, FILENAME, FILENAMES
+from faces_const import DEPLOY_PROTO_SHAPE, FACE_REGIONS_INVALID, FILENAME, FILENAME_ENCODED_FORMAT, FILENAMES
 
 from hope_dedup_engine.apps.faces.utils.duplication_detector import DuplicationDetector
 
 
 def test_duplication_detector_initialization(dd):
     assert isinstance(dd.net, cv2.dnn_Net)
+    assert dd.filenames == FILENAMES
     assert dd.face_detection_confidence == config.FACE_DETECTION_CONFIDENCE
     assert dd.distance_threshold == config.FACE_DISTANCE_THRESHOLD
-    assert dd.filename == FILENAME
-    assert dd.encodings_filename == f"{FILENAME}.npy"
-    assert dd.scale_factor == config.BLOB_FROM_IMAGE_SCALE_FACTOR
-    assert dd.mean_values == tuple(map(float, config.BLOB_FROM_IMAGE_MEAN_VALUES.split(", ")))
-    assert dd.face_encodings_model == config.FACE_ENCODINGS_MODEL
-    assert dd.face_encodings_num_jitters == config.FACE_ENCODINGS_NUM_JITTERS
     assert dd.nms_threshold == config.NMS_THRESHOLD
-    assert dd.shape == DEPLOY_PROTO_SHAPE
+
+    assert isinstance(dd.blob_from_image_cfg, DuplicationDetector.BlobFromImageConfig)
+    assert dd.blob_from_image_cfg.scale_factor == config.BLOB_FROM_IMAGE_SCALE_FACTOR
+    if isinstance(config.BLOB_FROM_IMAGE_MEAN_VALUES, str):
+        expected_mean_values = tuple(map(float, config.BLOB_FROM_IMAGE_MEAN_VALUES.split(", ")))
+    else:
+        expected_mean_values = config.BLOB_FROM_IMAGE_MEAN_VALUES
+    assert dd.blob_from_image_cfg.mean_values == expected_mean_values
+
+    assert isinstance(dd.face_encodings_cfg, DuplicationDetector.FaceEncodingsConfig)
+    assert dd.face_encodings_cfg.num_jitters == config.FACE_ENCODINGS_NUM_JITTERS
+    assert dd.face_encodings_cfg.model == config.FACE_ENCODINGS_MODEL
 
 
 def test_get_shape(dd, mock_prototxt_file):
@@ -54,25 +60,77 @@ def test_set_net(dd, mock_cv2dnn_storage, mock_net):
     storage.path.assert_any_call(settings.CAFFEMODEL_FILE)
 
 
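Note: the parametrized expectations below imply that `DuplicationDetector` stores each ignore pair in both orderings and rejects anything that is not a pair of two non-empty strings. A minimal sketch of that construction, with illustrative names (the real method lives inside the class), is:

# Sketch of the ignore-set construction implied by the expected outputs below.
def get_pairs_to_ignore(ignore_pairs):
    ignore_set = set()
    for pair in ignore_pairs:
        if len(pair) != 2 or not all(isinstance(p, str) and p for p in pair):
            raise ValueError("ignore_pairs must contain pairs of non-empty strings")
        first, second = pair
        ignore_set.add((first, second))
        ignore_set.add((second, first))  # both orderings, so lookup is symmetric
    return ignore_set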
patch("hope_dedup_engine.apps.faces.utils.duplication_detector.CV2DNNStorage", mock_cv2dnn_storage), + patch("builtins.open", mock_prototxt_file), + ): + dd = DuplicationDetector(FILENAMES, ignore_input) + assert dd.ignore_set == expected_output + + +@pytest.mark.parametrize( + "ignore_input", + [ + (("file1.jpg",),), + (("file1.jpg", "file2.jpg", "file3.jpg"),), + ( + "file1.jpg", + "file2.jpg", + ), + ((1, "file2.jpg"),), + (("", "file2.jpg"),), + ], +) +def test_get_pairs_to_ignore_exception_handling(mock_cv2dnn_storage, mock_prototxt_file, ignore_input): + with ( + pytest.raises(ValueError), + patch("hope_dedup_engine.apps.faces.utils.duplication_detector.CV2DNNStorage", mock_cv2dnn_storage), + patch("builtins.open", mock_prototxt_file), + ): + DuplicationDetector(filenames=FILENAMES, ignore_pairs=ignore_input) + + @pytest.mark.parametrize("missing_file", [settings.PROTOTXT_FILE, settings.CAFFEMODEL_FILE]) def test_initialization_missing_files_in_cv2dnn_storage(mock_cv2dnn_storage, missing_file): - with patch( - "hope_dedup_engine.apps.faces.utils.duplication_detector.CV2DNNStorage", return_value=mock_cv2dnn_storage + with ( + pytest.raises(FileNotFoundError), + patch("hope_dedup_engine.apps.faces.utils.duplication_detector.CV2DNNStorage", mock_cv2dnn_storage), ): mock_cv2dnn_storage.exists.side_effect = lambda filename: filename != missing_file - with pytest.raises(FileNotFoundError): - DuplicationDetector(FILENAME) + DuplicationDetector(FILENAME) mock_cv2dnn_storage.exists.assert_any_call(missing_file) -def test_has_encodings_false(dd): - dd.storages["encoded"].exists.return_value = False - assert not dd.has_encodings +def test_encodings_filename(dd): + assert dd._encodings_filename(FILENAME) == FILENAME_ENCODED_FORMAT.format(FILENAME) -def test_has_encodings_true(dd): - dd.storages["encoded"].exists.return_value = True - assert dd.has_encodings +@pytest.mark.parametrize("file_exists", [True, False]) +def test_has_encodings(dd, file_exists): + dd.storages["encoded"].exists.return_value = file_exists + assert dd._has_encodings(FILENAME) == file_exists + dd.storages["encoded"].exists.assert_called_with(FILENAME_ENCODED_FORMAT.format(FILENAME)) def test_get_face_detections_dnn_no_detections(dd, mock_open_context_manager): @@ -92,7 +150,7 @@ def test_get_face_detections_dnn_with_detections(dd, mock_net, mock_open_context patch("cv2.resize", resize), patch.object(dd, "net", net), ): - face_regions = dd._get_face_detections_dnn() + face_regions = dd._get_face_detections_dnn(FILENAME) assert face_regions == expected_regions for region in face_regions: @@ -102,67 +160,76 @@ def test_get_face_detections_dnn_with_detections(dd, mock_net, mock_open_context def test_get_face_detections_dnn_exception_handling(dd): with ( + pytest.raises(Exception, match="Test exception"), patch.object(dd.storages["images"], "open", side_effect=Exception("Test exception")) as mock_storage_open, patch.object(dd.logger, "exception") as mock_logger_exception, ): - with pytest.raises(Exception, match="Test exception"): - dd._get_face_detections_dnn() + dd._get_face_detections_dnn(FILENAME) - mock_storage_open.assert_called_once_with(dd.filename, "rb") + mock_storage_open.assert_called_once_with(FILENAME, "rb") mock_logger_exception.assert_called_once() -def test_load_encodings_all_no_files(dd): - with patch.object(dd.storages["encoded"], "listdir", return_value=(None, [])): - encodings = dd._load_encodings_all() - assert encodings == {} +@pytest.mark.parametrize( + "filenames, expected", [(FILENAMES, {filename: 
-def test_load_encodings_all_no_files(dd):
-    with patch.object(dd.storages["encoded"], "listdir", return_value=(None, [])):
-        encodings = dd._load_encodings_all()
-    assert encodings == {}
+@pytest.mark.parametrize(
+    "filenames, expected", [(FILENAMES, {filename: np.array([1, 2, 3]) for filename in FILENAMES}), ([], {})]
+)
+def test_load_encodings_all_files(dd, filenames, expected):
+    mock_encoded_data = {FILENAME_ENCODED_FORMAT.format(filename): np.array([1, 2, 3]) for filename in filenames}
+    with (
+        patch.object(
+            dd.storages["encoded"],
+            "listdir",
+            return_value=(None, [FILENAME_ENCODED_FORMAT.format(filename) for filename in filenames]),
+        ),
+        patch("builtins.open", mock_open()) as mocked_open,
+        patch("numpy.load") as mock_load,
+    ):
-def test_load_encodings_all_with_files(dd):
-    mock_encoded_data = {f"{filename}.npy": np.array([1, 2, 3]) for filename in FILENAMES}
-    encoded_data = {os.path.splitext(key)[0]: value for key, value in mock_encoded_data.items()}
+        mocked_files_read = {
+            filename: mock_open(read_data=data.tobytes()).return_value for filename, data in mock_encoded_data.items()
+        }
+        mocked_open.side_effect = lambda f, mode="rb": mocked_files_read[os.path.basename(f)]
 
-    with patch.object(
-        dd.storages["encoded"], "listdir", return_value=(None, [f"{filename}.npy" for filename in FILENAMES])
-    ):
-        with patch("builtins.open", mock_open()) as mocked_open:
-            for filename, data in mock_encoded_data.items():
-                mocked_file = mock_open(read_data=data.tobytes()).return_value
-                mocked_open.side_effect = lambda f, mode="rb", mocked_file=mocked_file, filename=filename: (
-                    mocked_file if f.endswith(filename) else MagicMock()
-                )
-                with patch("numpy.load", return_value=data):
-                    result = dd._load_encodings_all()
-
-    for key, value in encoded_data.items():
+        for filename, data in mock_encoded_data.items():
+            mock_load.side_effect = lambda f, data=data, filename=filename, allow_pickle=False: (
+                data if f.name.endswith(filename) else MagicMock()
+            )
+
+        result = dd._load_encodings_all()
+
+    if filenames:
+        for key, value in expected.items():
             assert np.array_equal(result[key], value)
+    else:
+        assert result == expected
 
 
 def test_load_encodings_all_exception_handling_listdir(dd):
     with (
+        pytest.raises(Exception, match="Test exception"),
         patch.object(dd.storages["encoded"], "listdir", side_effect=Exception("Test exception")) as mock_listdir,
         patch.object(dd.logger, "exception") as mock_logger_exception,
     ):
-        with pytest.raises(Exception, match="Test exception"):
-            dd._load_encodings_all()
+        dd._load_encodings_all()
 
     mock_listdir.assert_called_once_with("")
-
     mock_logger_exception.assert_called_once()
 
 
 def test_load_encodings_all_exception_handling_open(dd):
     with (
-        patch.object(dd.storages["encoded"], "listdir", return_value=(None, [f"{FILENAME}.npy"])) as mock_listdir,
+        pytest.raises(Exception, match="Test exception"),
+        patch.object(
+            dd.storages["encoded"], "listdir", return_value=(None, [FILENAME_ENCODED_FORMAT.format(FILENAME)])
+        ) as mock_listdir,
        patch.object(dd.storages["encoded"], "open", side_effect=Exception("Test exception")) as mock_open,
         patch.object(dd.logger, "exception") as mock_logger_exception,
     ):
-        with pytest.raises(Exception, match="Test exception"):
-            dd._load_encodings_all()
+        dd._load_encodings_all()
 
     mock_listdir.assert_called_once_with("")
-    mock_open.assert_called_once_with(f"{FILENAME}.npy", "rb")
-
+    mock_open.assert_called_once_with(FILENAME_ENCODED_FORMAT.format(FILENAME), "rb")
     mock_logger_exception.assert_called_once()
 
 
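Note: the mocking above stubs out listdir, open, and numpy.load around `_load_encodings_all`. A hypothetical sketch of the behavior those mocks stand in for (a free function for illustration; the real method and its storage API may differ) is:

# Hypothetical sketch of what _load_encodings_all is mocked to do.
import os
import numpy as np

def load_encodings_all(storage):
    encodings = {}
    _, files = storage.listdir("")
    for name in files:
        with storage.open(name, "rb") as f:
            # Key by the original image name, i.e. "img.jpg" for "img.jpg.npy".
            encodings[os.path.splitext(name)[0]] = np.load(f, allow_pickle=False)
    return encodings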
@@ -172,9 +239,9 @@ def test_encode_face_successful(dd, image_bytes_io, mock_net):
         patch.object(dd.storages["images"], "open", side_effect=image_bytes_io.fake_open) as mocked_image_open,
         patch.object(dd, "net", mock_net),
     ):
-        dd._encode_face()
+        dd._encode_face(FILENAME)
 
-    mocked_image_open.assert_called_with(dd.filename, "rb")
+    mocked_image_open.assert_called_with(FILENAME, "rb")
     assert mocked_image_open.side_effect == image_bytes_io.fake_open
     assert mocked_image_open.called
 
@@ -186,9 +253,9 @@ def test_encode_face_error(dd, image_bytes_io, face_regions):
         patch.object(dd, "_get_face_detections_dnn", return_value=face_regions) as mock_get_face_detections_dnn,
         patch.object(dd.logger, "error") as mock_error_logger,
     ):
-        dd._encode_face()
+        dd._encode_face(FILENAME)
 
-    mock_storage_open.assert_called_with(dd.filename, "rb")
+    mock_storage_open.assert_called_with(FILENAME, "rb")
     mock_get_face_detections_dnn.assert_called_once()
     mock_error_logger.assert_called_once()
 
@@ -196,13 +263,13 @@ def test_encode_face_exception_handling(dd):
     with (
+        pytest.raises(Exception, match="Test exception"),
         patch.object(dd.storages["images"], "open", side_effect=Exception("Test exception")) as mock_storage_open,
         patch.object(dd.logger, "exception") as mock_logger_exception,
     ):
-        with pytest.raises(Exception, match="Test exception"):
-            dd._encode_face()
+        dd._encode_face(FILENAME)
 
-    mock_storage_open.assert_called_with(dd.filename, "rb")
+    mock_storage_open.assert_called_with(FILENAME, "rb")
     mock_logger_exception.assert_called_once()
 
@@ -221,30 +288,33 @@ def test_find_duplicates_successful_when_encoded(dd, mock_hde_azure_storage):
         duplicates = dd.find_duplicates()
 
     # Check that the correct list of duplicates is returned
-    expected_duplicates = set(FILENAMES[:2])  # Assuming the first two are duplicates based on mock data
-    assert all(name in duplicates for name in expected_duplicates)
+    expected_duplicates = (tuple(FILENAMES),)
+    assert {frozenset(t) for t in duplicates} == {frozenset(t) for t in expected_duplicates}
     dd._encode_face.assert_not_called()
     dd._load_encodings_all.assert_called_once()
-    mock_hde_azure_storage.exists.assert_called_once_with(f"{FILENAME}.npy")
+    mock_hde_azure_storage.exists.assert_called_with(FILENAME_ENCODED_FORMAT.format(FILENAMES[-1]))
 
 
-def test_find_duplicates_calls_encode_face_when_no_encodings(dd):
+def test_find_duplicates_no_encodings(dd):
     with (
+        patch.object(dd, "_has_encodings", return_value=False),
         patch.object(dd, "_encode_face") as mock_encode_face,
-        patch.object(dd, "_load_encodings_all", return_value={FILENAME: [MagicMock()]}),
+        patch.object(dd, "_load_encodings_all", return_value={}) as mock_load_encodings,
     ):
-        dd.storages["encoded"].exists.return_value = False
+
         dd.find_duplicates()
-        mock_encode_face.assert_called_once()
+
+    mock_encode_face.assert_called_with(FILENAMES[-1])
+    mock_load_encodings.assert_called_once()
 
 
 def test_find_duplicates_exception_handling(dd):
     with (
+        pytest.raises(Exception, match="Test exception"),
         patch.object(dd, "_load_encodings_all", side_effect=Exception("Test exception")),
         patch.object(dd.logger, "exception") as mock_logger_exception,
     ):
-        with pytest.raises(Exception, match="Test exception"):
-            dd.find_duplicates()
+        dd.find_duplicates()
 
     mock_logger_exception.assert_called_once()
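Note: the grouping tested above is driven by the pairwise scoring in find_duplicates: for each pair of images, the minimum face_distance between any two of their encodings is compared against the threshold. A short runnable sketch of that scoring (the random 128-d vectors are stand-ins for real encodings):

# Sketch of the per-pair scoring used by find_duplicates.
import numpy as np
import face_recognition

encodings1 = [np.random.rand(128)]                        # faces found in image 1
encodings2 = [np.random.rand(128), np.random.rand(128)]   # faces found in image 2

# face_distance(list_of_encodings, single_encoding) returns one distance per entry;
# the smallest distance across all face combinations scores the image pair.
min_distance = min(
    min(face_recognition.face_distance(encodings2, encoding)) for encoding in encodings1
)
print(min_distance)  # compared against the configured FACE_DISTANCE_THRESHOLD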