-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
🚸(backend) on user search match emails by Levenshtein distance
When the query looks like an email (includes @), we match by Levenshtein distance because at that point we are only trying to catch typing errors, not searching anymore. It is important to still propose results with a short Levenshtein distance because it is frequent to forget a double letter in someone's name, for example "Pacoud" or even "pacou" instead of "Paccoud", and we want to prevent duplicates or failing on invitation. We consider the query string to be an email as soon as it contains a "@" character. Trying harder to identify a string that is really an email would lead to weird behaviors like [email protected] looking like an email, but if we continue typing, [email protected] not looking like an email... before [email protected] finally looking like an email. The result would be jumping from one type of search to the other. As soon as there is a "@" in the query, we can be sure that the user is not looking for a name anymore and we can switch to matching by Levenshtein distance.
- Loading branch information
1 parent
5be2bc7
commit b077411
Showing
4 changed files
with
82 additions
and
60 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
16 changes: 16 additions & 0 deletions
16
src/backend/core/migrations/0013_activate_fuzzystrmatch_extension.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Generated by Django 5.1.4 on 2025-01-25 08:38 | ||
|
||
from django.db import migrations | ||
|
||
class Migration(migrations.Migration): | ||
|
||
dependencies = [ | ||
('core', '0012_make_document_creator_and_invitation_issuer_optional'), | ||
] | ||
|
||
operations = [ | ||
migrations.RunSQL( | ||
"CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;", | ||
reverse_sql="DROP EXTENSION IF EXISTS fuzzystrmatch;", | ||
), | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,18 +42,17 @@ def test_api_users_list_authenticated(): | |
|
||
def test_api_users_list_query_email(): | ||
""" | ||
Authenticated users should be able to list users | ||
and filter by email. | ||
Authenticated users should be able to list users and filter by email. | ||
Only results with a Levenstein distance less than 3 with the query should be returned. | ||
We want to match by Levenstein distance because we want to prevent typing errors. | ||
""" | ||
user = factories.UserFactory() | ||
|
||
client = APIClient() | ||
client.force_login(user) | ||
|
||
dave = factories.UserFactory(email="[email protected]") | ||
nicole = factories.UserFactory(email="[email protected]") | ||
frank = factories.UserFactory(email="[email protected]") | ||
factories.UserFactory(email="[email protected]") | ||
factories.UserFactory(email="[email protected]") | ||
|
||
response = client.get( | ||
"/api/v1.0/users/[email protected]", | ||
|
@@ -62,77 +61,73 @@ def test_api_users_list_query_email(): | |
user_ids = [user["id"] for user in response.json()["results"]] | ||
assert user_ids == [str(dave.id)] | ||
|
||
response = client.get("/api/v1.0/users/?q=oole") | ||
response = client.get( | ||
"/api/v1.0/users/[email protected]", | ||
) | ||
assert response.status_code == 200 | ||
user_ids = [user["id"] for user in response.json()["results"]] | ||
assert user_ids == [str(dave.id)] | ||
|
||
response = client.get( | ||
"/api/v1.0/users/[email protected]", | ||
) | ||
assert response.status_code == 200 | ||
user_ids = [user["id"] for user in response.json()["results"]] | ||
assert user_ids == [str(nicole.id), str(frank.id)] | ||
assert user_ids == [] | ||
|
||
|
||
def test_api_users_list_query_email_matching(): | ||
"""While filtering by email, results should be filtered and sorted by similarity""" | ||
"""While filtering by email, results should be filtered and sorted by Levenstein distance.""" | ||
user = factories.UserFactory() | ||
|
||
client = APIClient() | ||
client.force_login(user) | ||
|
||
alice = factories.UserFactory(email="[email protected]") | ||
factories.UserFactory(email="jane.smith@example.gouv.fr") | ||
michael_wilson = factories.UserFactory(email="michael.wilson@example.gouv.fr") | ||
factories.UserFactory(email="david.jones@example.gouv.fr") | ||
michael_brown = factories.UserFactory(email="michael.brown@example.gouv.fr") | ||
factories.UserFactory(email="sophia.taylor@example.gouv.fr") | ||
user1 = factories.UserFactory(email="[email protected]") | ||
user2 = factories.UserFactory(email="alice.johnnson@example.gouv.fr") | ||
user3 = factories.UserFactory(email="alice.kohlson@example.gouv.fr") | ||
user4 = factories.UserFactory(email="alicia.johnnson@example.gouv.fr") | ||
user5 = factories.UserFactory(email="alicia.johnnson@example.gov.uk") | ||
factories.UserFactory(email="alice.thomson@example.gouv.fr") | ||
|
||
response = client.get( | ||
"/api/v1.0/users/?q=michael[email protected].f", | ||
"/api/v1.0/users/?q=alice[email protected].fr", | ||
) | ||
assert response.status_code == 200 | ||
user_ids = [user["id"] for user in response.json()["results"]] | ||
assert user_ids == [str(michael_wilson.id)] | ||
|
||
response = client.get("/api/v1.0/users/[email protected]") | ||
|
||
assert response.status_code == 200 | ||
user_ids = [user["id"] for user in response.json()["results"]] | ||
assert user_ids == [str(michael_wilson.id), str(alice.id), str(michael_brown.id)] | ||
assert user_ids == [str(user1.id), str(user2.id), str(user3.id), str(user4.id)] | ||
|
||
response = client.get( | ||
"/api/v1.0/users/[email protected]", | ||
) | ||
assert response.status_code == 200 | ||
user_ids = [user["id"] for user in response.json()["results"]] | ||
assert user_ids == [str(alice.id)] | ||
response = client.get("/api/v1.0/users/[email protected]") | ||
|
||
response = client.get( | ||
"/api/v1.0/users/[email protected]", | ||
) | ||
assert response.status_code == 200 | ||
user_ids = [user["id"] for user in response.json()["results"]] | ||
assert user_ids == [str(michael_wilson.id)] | ||
assert user_ids == [str(user4.id), str(user2.id), str(user1.id), str(user5.id)] | ||
|
||
|
||
def test_api_users_list_query_email_exclude_doc_user(): | ||
""" | ||
Authenticated users should be able to list users | ||
and filter by email and exclude users who have access to a document. | ||
Authenticated users should be able to list users while filtering by email | ||
and excluding users who have access to a document. | ||
""" | ||
user = factories.UserFactory() | ||
document = factories.DocumentFactory() | ||
|
||
client = APIClient() | ||
client.force_login(user) | ||
|
||
nicole = factories.UserFactory(email="nicole_foole@work.com") | ||
frank = factories.UserFactory(email="frank_poole@work.com") | ||
nicole_fool = factories.UserFactory(email="nicole_fool@work.com") | ||
nicole_pool = factories.UserFactory(email="nicole_pool@work.com") | ||
factories.UserFactory(email="[email protected]") | ||
|
||
factories.UserDocumentAccessFactory(document=document, user=frank) | ||
factories.UserDocumentAccessFactory(document=document, user=nicole_pool) | ||
|
||
response = client.get("/api/v1.0/users/?q=oole&document_id=" + str(document.id)) | ||
response = client.get( | ||
"/api/v1.0/users/[email protected]&document_id=" + str(document.id) | ||
) | ||
|
||
assert response.status_code == 200 | ||
user_ids = [user["id"] for user in response.json()["results"]] | ||
assert user_ids == [str(nicole.id)] | ||
assert user_ids == [str(nicole_fool.id)] | ||
|
||
|
||
def test_api_users_retrieve_me_anonymous(): | ||
|