Skip to content

Commit

Permalink
🚸(backend) on user search match emails by Levenshtein distance
Browse files Browse the repository at this point in the history
When the query looks like an email (includes @) we search by
Levenshtein distance because, at that point, we are only trying to
catch typing errors, not performing a search anymore.

It is important to still propose results with a short Levenshtein
distance because it is frequent to forget a double letter in
someone's name for example "Pacoud" or even "pacou" instead of
"Paccoud" and we want to prevent duplicates or failing on
invitation.

We consider the query string to be an email as soon as it contains
a "@" character. Trying harder to identify a string that is really
an email would lead to weird behaviors like [email protected] looking
like an email but if we continue typing [email protected] not
looking like an email... before [email protected] finally looking
like an email. The result would be jumping from one type of search
to the other. As soon as there is a "@" in the query, we can be
sure that the user is not looking for a name anymore and we can
switch to matching by Levenshtein distance.
  • Loading branch information
sampaccoud committed Jan 25, 2025
1 parent 5be2bc7 commit b077411
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 60 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ and this project adheres to

## Added

- github actions to managed Crowdin workflow
- github actions to manage Crowdin workflow

## Changed

- 🚸(backend) match emails by Levenshtein distance when searching users #575

## [2.0.1] - 2025-01-17

Expand Down
47 changes: 27 additions & 20 deletions src/backend/core/api/viewsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
Subquery,
Value,
)
from django.db.models.expressions import RawSQL
from django.http import Http404

import rest_framework as drf
Expand Down Expand Up @@ -150,29 +151,35 @@ def get_queryset(self):
"""
queryset = self.queryset

if self.action == "list":
# Exclude all users already in the given document
if document_id := self.request.GET.get("document_id", ""):
queryset = queryset.exclude(documentaccess__document_id=document_id)

# Filter users by email similarity
if query := self.request.GET.get("q", ""):
# For performance reasons we filter first by similarity, which relies on an index,
# then only calculate precise similarity scores for sorting purposes
queryset = queryset.filter(email__trigram_word_similar=query)

queryset = queryset.annotate(
similarity=TrigramSimilarity("email", query)
)
# When the query only is on the name part, we should try to make many proposals
# But when the query looks like an email we should only propose serious matches
threshold = 0.6 if "@" in query else 0.1
if self.action != "list":
return queryset

queryset = queryset.filter(similarity__gt=threshold).order_by(
"-similarity", "email"
# Exclude all users already in the given document
if document_id := self.request.GET.get("document_id", ""):
queryset = queryset.exclude(documentaccess__document_id=document_id)

if not (query := self.request.GET.get("q", "")):
return queryset

# For emails, match emails by Levenstein distance to prevent typing errors
if "@" in query:
return (
queryset.annotate(
distance=RawSQL("levenshtein(email::text, %s::text)", (query,))
)
.filter(distance__lte=3)
.order_by("distance", "email")
)

return queryset
# Use trigram similarity for non-email-like queries
# For performance reasons we filter first by similarity, which relies on an
# index, then only calculate precise similarity scores for sorting purposes
return (
queryset.filter(email__trigram_word_similar=query)
.annotate(similarity=TrigramSimilarity("email", query))
.filter(similarity__gt=0.3)
.order_by("-similarity", "email")
)

@drf.decorators.action(
detail=False,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generated by Django 5.1.4 on 2025-01-25 08:38

from django.db import migrations

class Migration(migrations.Migration):
    """Install the PostgreSQL ``fuzzystrmatch`` extension.

    Forward: creates the extension if it is not already present.
    Reverse: drops the extension if it exists.

    NOTE(review): the extension is presumably needed for the
    ``levenshtein()`` SQL function used when searching users by email;
    confirm against the querysets that rely on it.
    """

    # Must run after the previous migration of the "core" app.
    dependencies = [
        ("core", "0012_make_document_creator_and_invitation_issuer_optional"),
    ]

    operations = [
        migrations.RunSQL(
            # "IF NOT EXISTS" / "IF EXISTS" keep both directions safe to
            # re-run on a database where the extension state already matches.
            sql="CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;",
            reverse_sql="DROP EXTENSION IF EXISTS fuzzystrmatch;",
        ),
    ]
73 changes: 34 additions & 39 deletions src/backend/core/tests/test_api_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,17 @@ def test_api_users_list_authenticated():

def test_api_users_list_query_email():
"""
Authenticated users should be able to list users
and filter by email.
Authenticated users should be able to list users and filter by email.
Only results with a Levenstein distance less than 3 with the query should be returned.
We want to match by Levenstein distance because we want to prevent typing errors.
"""
user = factories.UserFactory()

client = APIClient()
client.force_login(user)

dave = factories.UserFactory(email="[email protected]")
nicole = factories.UserFactory(email="[email protected]")
frank = factories.UserFactory(email="[email protected]")
factories.UserFactory(email="[email protected]")
factories.UserFactory(email="[email protected]")

response = client.get(
"/api/v1.0/users/[email protected]",
Expand All @@ -62,77 +61,73 @@ def test_api_users_list_query_email():
user_ids = [user["id"] for user in response.json()["results"]]
assert user_ids == [str(dave.id)]

response = client.get("/api/v1.0/users/?q=oole")
response = client.get(
"/api/v1.0/users/[email protected]",
)
assert response.status_code == 200
user_ids = [user["id"] for user in response.json()["results"]]
assert user_ids == [str(dave.id)]

response = client.get(
"/api/v1.0/users/[email protected]",
)
assert response.status_code == 200
user_ids = [user["id"] for user in response.json()["results"]]
assert user_ids == [str(nicole.id), str(frank.id)]
assert user_ids == []


def test_api_users_list_query_email_matching():
"""While filtering by email, results should be filtered and sorted by similarity"""
"""While filtering by email, results should be filtered and sorted by Levenstein distance."""
user = factories.UserFactory()

client = APIClient()
client.force_login(user)

alice = factories.UserFactory(email="[email protected]")
factories.UserFactory(email="jane.smith@example.gouv.fr")
michael_wilson = factories.UserFactory(email="michael.wilson@example.gouv.fr")
factories.UserFactory(email="david.jones@example.gouv.fr")
michael_brown = factories.UserFactory(email="michael.brown@example.gouv.fr")
factories.UserFactory(email="sophia.taylor@example.gouv.fr")
user1 = factories.UserFactory(email="[email protected]")
user2 = factories.UserFactory(email="alice.johnnson@example.gouv.fr")
user3 = factories.UserFactory(email="alice.kohlson@example.gouv.fr")
user4 = factories.UserFactory(email="alicia.johnnson@example.gouv.fr")
user5 = factories.UserFactory(email="alicia.johnnson@example.gov.uk")
factories.UserFactory(email="alice.thomson@example.gouv.fr")

response = client.get(
"/api/v1.0/users/?q=michael[email protected].f",
"/api/v1.0/users/?q=alice[email protected].fr",
)
assert response.status_code == 200
user_ids = [user["id"] for user in response.json()["results"]]
assert user_ids == [str(michael_wilson.id)]

response = client.get("/api/v1.0/users/[email protected]")

assert response.status_code == 200
user_ids = [user["id"] for user in response.json()["results"]]
assert user_ids == [str(michael_wilson.id), str(alice.id), str(michael_brown.id)]
assert user_ids == [str(user1.id), str(user2.id), str(user3.id), str(user4.id)]

response = client.get(
"/api/v1.0/users/[email protected]",
)
assert response.status_code == 200
user_ids = [user["id"] for user in response.json()["results"]]
assert user_ids == [str(alice.id)]
response = client.get("/api/v1.0/users/[email protected]")

response = client.get(
"/api/v1.0/users/[email protected]",
)
assert response.status_code == 200
user_ids = [user["id"] for user in response.json()["results"]]
assert user_ids == [str(michael_wilson.id)]
assert user_ids == [str(user4.id), str(user2.id), str(user1.id), str(user5.id)]


def test_api_users_list_query_email_exclude_doc_user():
"""
Authenticated users should be able to list users
and filter by email and exclude users who have access to a document.
Authenticated users should be able to list users while filtering by email
and excluding users who have access to a document.
"""
user = factories.UserFactory()
document = factories.DocumentFactory()

client = APIClient()
client.force_login(user)

nicole = factories.UserFactory(email="nicole_foole@work.com")
frank = factories.UserFactory(email="frank_poole@work.com")
nicole_fool = factories.UserFactory(email="nicole_fool@work.com")
nicole_pool = factories.UserFactory(email="nicole_pool@work.com")
factories.UserFactory(email="[email protected]")

factories.UserDocumentAccessFactory(document=document, user=frank)
factories.UserDocumentAccessFactory(document=document, user=nicole_pool)

response = client.get("/api/v1.0/users/?q=oole&document_id=" + str(document.id))
response = client.get(
"/api/v1.0/users/[email protected]&document_id=" + str(document.id)
)

assert response.status_code == 200
user_ids = [user["id"] for user in response.json()["results"]]
assert user_ids == [str(nicole.id)]
assert user_ids == [str(nicole_fool.id)]


def test_api_users_retrieve_me_anonymous():
Expand Down

0 comments on commit b077411

Please sign in to comment.