diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a8ae7b4a..55377c160 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,11 @@ and this project adheres to ## Added -- github actions to managed Crowdin workflow +- github actions to manage Crowdin workflow + +## Changed + +- 🚸(backend) match emails by Levenstein distance when searching users #575 ## [2.0.1] - 2025-01-17 diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 0f83e1e59..7cc48f9aa 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -20,6 +20,7 @@ Subquery, Value, ) +from django.db.models.expressions import RawSQL from django.http import Http404 import rest_framework as drf @@ -150,29 +151,35 @@ def get_queryset(self): """ queryset = self.queryset - if self.action == "list": - # Exclude all users already in the given document - if document_id := self.request.GET.get("document_id", ""): - queryset = queryset.exclude(documentaccess__document_id=document_id) - - # Filter users by email similarity - if query := self.request.GET.get("q", ""): - # For performance reasons we filter first by similarity, which relies on an index, - # then only calculate precise similarity scores for sorting purposes - queryset = queryset.filter(email__trigram_word_similar=query) - - queryset = queryset.annotate( - similarity=TrigramSimilarity("email", query) - ) - # When the query only is on the name part, we should try to make many proposals - # But when the query looks like an email we should only propose serious matches - threshold = 0.6 if "@" in query else 0.1 + if self.action != "list": + return queryset - queryset = queryset.filter(similarity__gt=threshold).order_by( - "-similarity", "email" + # Exclude all users already in the given document + if document_id := self.request.GET.get("document_id", ""): + queryset = queryset.exclude(documentaccess__document_id=document_id) + + if not (query := self.request.GET.get("q", "")): + return queryset + + # For emails, match emails by Levenstein distance to prevent typing errors + if "@" in query: + return ( + queryset.annotate( + distance=RawSQL("levenshtein(email::text, %s::text)", (query,)) ) + .filter(distance__lte=3) + .order_by("distance", "email") + ) - return queryset + # Use trigram similarity for non-email-like queries + # For performance reasons we filter first by similarity, which relies on an + # index, then only calculate precise similarity scores for sorting purposes + return ( + queryset.filter(email__trigram_word_similar=query) + .annotate(similarity=TrigramSimilarity("email", query)) + .filter(similarity__gt=0.3) + .order_by("-similarity", "email") + ) @drf.decorators.action( detail=False, diff --git a/src/backend/core/migrations/0013_activate_fuzzystrmatch_extension.py b/src/backend/core/migrations/0013_activate_fuzzystrmatch_extension.py new file mode 100644 index 000000000..db7fbb3aa --- /dev/null +++ b/src/backend/core/migrations/0013_activate_fuzzystrmatch_extension.py @@ -0,0 +1,16 @@ +# Generated by Django 5.1.4 on 2025-01-25 08:38 + +from django.db import migrations + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0012_make_document_creator_and_invitation_issuer_optional'), + ] + + operations = [ + migrations.RunSQL( + "CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;", + reverse_sql="DROP EXTENSION IF EXISTS fuzzystrmatch;", + ), + ] diff --git a/src/backend/core/tests/test_api_users.py b/src/backend/core/tests/test_api_users.py index e739d4d1f..cb3022d38 100644 --- a/src/backend/core/tests/test_api_users.py +++ b/src/backend/core/tests/test_api_users.py @@ -42,8 +42,9 @@ def test_api_users_list_authenticated(): def test_api_users_list_query_email(): """ - Authenticated users should be able to list users - and filter by email. + Authenticated users should be able to list users and filter by email. + Only results with a Levenstein distance less than 3 with the query should be returned. + We want to match by Levenstein distance because we want to prevent typing errors. """ user = factories.UserFactory() @@ -51,9 +52,7 @@ def test_api_users_list_query_email(): client.force_login(user) dave = factories.UserFactory(email="david.bowman@work.com") - nicole = factories.UserFactory(email="nicole_foole@work.com") - frank = factories.UserFactory(email="frank_poole@work.com") - factories.UserFactory(email="heywood_floyd@work.com") + factories.UserFactory(email="nicole.bowman@work.com") response = client.get( "/api/v1.0/users/?q=david.bowman@work.com", @@ -62,59 +61,53 @@ def test_api_users_list_query_email(): user_ids = [user["id"] for user in response.json()["results"]] assert user_ids == [str(dave.id)] - response = client.get("/api/v1.0/users/?q=oole") + response = client.get( + "/api/v1.0/users/?q=davig.bovman@worm.com", + ) + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()["results"]] + assert user_ids == [str(dave.id)] + response = client.get( + "/api/v1.0/users/?q=davig.bovman@worm.cop", + ) assert response.status_code == 200 user_ids = [user["id"] for user in response.json()["results"]] - assert user_ids == [str(nicole.id), str(frank.id)] + assert user_ids == [] def test_api_users_list_query_email_matching(): - """While filtering by email, results should be filtered and sorted by similarity""" + """While filtering by email, results should be filtered and sorted by Levenstein distance.""" user = factories.UserFactory() client = APIClient() client.force_login(user) - alice = factories.UserFactory(email="alice.johnson@example.gouv.fr") - factories.UserFactory(email="jane.smith@example.gouv.fr") - michael_wilson = factories.UserFactory(email="michael.wilson@example.gouv.fr") - factories.UserFactory(email="david.jones@example.gouv.fr") - michael_brown = factories.UserFactory(email="michael.brown@example.gouv.fr") - factories.UserFactory(email="sophia.taylor@example.gouv.fr") + user1 = factories.UserFactory(email="alice.johnson@example.gouv.fr") + user2 = factories.UserFactory(email="alice.johnnson@example.gouv.fr") + user3 = factories.UserFactory(email="alice.kohlson@example.gouv.fr") + user4 = factories.UserFactory(email="alicia.johnnson@example.gouv.fr") + user5 = factories.UserFactory(email="alicia.johnnson@example.gov.uk") + factories.UserFactory(email="alice.thomson@example.gouv.fr") response = client.get( - "/api/v1.0/users/?q=michael.johnson@example.gouv.f", + "/api/v1.0/users/?q=alice.johnson@example.gouv.fr", ) assert response.status_code == 200 user_ids = [user["id"] for user in response.json()["results"]] - assert user_ids == [str(michael_wilson.id)] - - response = client.get("/api/v1.0/users/?q=michael.johnson@example.gouv.fr") - - assert response.status_code == 200 - user_ids = [user["id"] for user in response.json()["results"]] - assert user_ids == [str(michael_wilson.id), str(alice.id), str(michael_brown.id)] + assert user_ids == [str(user1.id), str(user2.id), str(user3.id), str(user4.id)] - response = client.get( - "/api/v1.0/users/?q=ajohnson@example.gouv.f", - ) - assert response.status_code == 200 - user_ids = [user["id"] for user in response.json()["results"]] - assert user_ids == [str(alice.id)] + response = client.get("/api/v1.0/users/?q=alicia.johnnson@example.gouv.fr") - response = client.get( - "/api/v1.0/users/?q=michael.wilson@example.gouv.f", - ) assert response.status_code == 200 user_ids = [user["id"] for user in response.json()["results"]] - assert user_ids == [str(michael_wilson.id)] + assert user_ids == [str(user4.id), str(user2.id), str(user1.id), str(user5.id)] def test_api_users_list_query_email_exclude_doc_user(): """ - Authenticated users should be able to list users - and filter by email and exclude users who have access to a document. + Authenticated users should be able to list users while filtering by email + and excluding users who have access to a document. """ user = factories.UserFactory() document = factories.DocumentFactory() @@ -122,17 +115,19 @@ def test_api_users_list_query_email_exclude_doc_user(): client = APIClient() client.force_login(user) - nicole = factories.UserFactory(email="nicole_foole@work.com") - frank = factories.UserFactory(email="frank_poole@work.com") + nicole_fool = factories.UserFactory(email="nicole_fool@work.com") + nicole_pool = factories.UserFactory(email="nicole_pool@work.com") factories.UserFactory(email="heywood_floyd@work.com") - factories.UserDocumentAccessFactory(document=document, user=frank) + factories.UserDocumentAccessFactory(document=document, user=nicole_pool) - response = client.get("/api/v1.0/users/?q=oole&document_id=" + str(document.id)) + response = client.get( + "/api/v1.0/users/?q=nicole_fool@work.com&document_id=" + str(document.id) + ) assert response.status_code == 200 user_ids = [user["id"] for user in response.json()["results"]] - assert user_ids == [str(nicole.id)] + assert user_ids == [str(nicole_fool.id)] def test_api_users_retrieve_me_anonymous():