Skip to content

Commit 7e5e69b

Browse files
committed
🚸(backend) use unaccented full name for user search
We have the user's full name through OIDC in the database, but the search only used the email field. This change allows searching for a user by their first and/or last name (fix #929). Given that user names are more likely than emails to include diacritics, it unaccents both the query and the database entry for search (fix #1091). It also unaccents emails so that internationalized domain names are matched whether or not the accent is included in the search.
1 parent 0d0e17c commit 7e5e69b

File tree

3 files changed

+135
-6
lines changed

3 files changed

+135
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ and this project adheres to
1818
- ♻️(frontend) preserve @ character when esc is pressed after typing it #1512
1919
- ♻️(frontend) make summary button fixed to remain visible during scroll #1581
2020
- ♻️(frontend) pdf embed use full width #1526
21+
- 🚸(backend) use unaccented full name for user search #1637
2122

2223
### Fixed
2324

src/backend/core/api/viewsets.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""API endpoints"""
2+
23
# pylint: disable=too-many-lines
34

45
import base64
@@ -10,6 +11,7 @@
1011

1112
from django.conf import settings
1213
from django.contrib.postgres.aggregates import ArrayAgg
14+
from django.contrib.postgres.lookups import Unaccent
1315
from django.contrib.postgres.search import TrigramSimilarity
1416
from django.core.cache import cache
1517
from django.core.exceptions import ValidationError
@@ -18,7 +20,7 @@
1820
from django.db import connection, transaction
1921
from django.db import models as db
2022
from django.db.models.expressions import RawSQL
21-
from django.db.models.functions import Left, Length
23+
from django.db.models.functions import Greatest, Left, Length
2224
from django.http import Http404, StreamingHttpResponse
2325
from django.urls import reverse
2426
from django.utils.functional import cached_property
@@ -36,6 +38,7 @@
3638
from rest_framework.permissions import AllowAny
3739

3840
from core import authentication, choices, enums, models
41+
from core.api.filters import remove_accents
3942
from core.services.ai_services import AIService
4043
from core.services.collaboration_services import CollaborationService
4144
from core.services.converter_services import (
@@ -187,13 +190,15 @@ def get_queryset(self):
187190
queryset = queryset.exclude(documentaccess__document_id=document_id)
188191

189192
filter_data = filterset.form.cleaned_data
190-
query = filter_data["q"]
193+
query = remove_accents(filter_data["q"])
191194

192195
# For emails, match emails by Levenshtein distance to tolerate typing errors
193196
if "@" in query:
194197
return (
195198
queryset.annotate(
196-
distance=RawSQL("levenshtein(email::text, %s::text)", (query,))
199+
distance=RawSQL(
200+
"levenshtein(unaccent(email::text), %s::text)", (query,)
201+
)
197202
)
198203
.filter(distance__lte=3)
199204
.order_by("distance", "email")[: settings.API_USERS_LIST_LIMIT]
@@ -202,11 +207,15 @@ def get_queryset(self):
202207
# Use trigram similarity for non-email-like queries
203208
# For performance reasons we filter first by similarity, which relies on an
204209
# index, then only calculate precise similarity scores for sorting purposes
210+
205211
return (
206-
queryset.filter(email__trigram_word_similar=query)
207-
.annotate(similarity=TrigramSimilarity("email", query))
212+
queryset.annotate(
213+
sim_email=TrigramSimilarity(Unaccent("email"), query),
214+
sim_name=TrigramSimilarity(Unaccent("full_name"), query),
215+
)
216+
.annotate(similarity=Greatest("sim_email", "sim_name"))
208217
.filter(similarity__gt=0.2)
209-
.order_by("-similarity", "email")[: settings.API_USERS_LIST_LIMIT]
218+
.order_by("-similarity")[: settings.API_USERS_LIST_LIMIT]
210219
)
211220

212221
@drf.decorators.action(

src/backend/core/tests/test_api_users.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,125 @@ def test_api_users_list_query_email():
7676
assert user_ids == []
7777

7878

79+
def test_api_users_list_query_email_with_internationalized_domain_names():
80+
"""
81+
Authenticated users should be able to list users and filter by email.
82+
It should work even if the email address contains an internationalized domain name.
83+
"""
84+
user = factories.UserFactory()
85+
86+
client = APIClient()
87+
client.force_login(user)
88+
89+
jean = factories.UserFactory(email="jean.martin@éducation.fr")
90+
kurokawa = factories.UserFactory(email="contact@黒川.日本")
91+
92+
response = client.get(
93+
"/api/v1.0/users/?q=jean.martin@education.fr",
94+
)
95+
assert response.status_code == 200
96+
user_ids = [user["id"] for user in response.json()]
97+
assert user_ids == [str(jean.id)]
98+
99+
response = client.get(
100+
"/api/v1.0/users/?q=jean.martin@éducation.fr",
101+
)
102+
assert response.status_code == 200
103+
user_ids = [user["id"] for user in response.json()]
104+
assert user_ids == [str(jean.id)]
105+
106+
response = client.get(
107+
"/api/v1.0/users/?q=contact@黒川.日本",
108+
)
109+
assert response.status_code == 200
110+
user_ids = [user["id"] for user in response.json()]
111+
assert user_ids == [str(kurokawa.id)]
112+
113+
114+
def test_api_users_list_query_full_name():
115+
"""
116+
Authenticated users should be able to list users and filter by full name.
117+
Only results with a Trigram similarity greater than 0.2 with the query should be returned.
118+
"""
119+
user = factories.UserFactory()
120+
121+
client = APIClient()
122+
client.force_login(user)
123+
124+
dave = factories.UserFactory(email="[email protected]", full_name="David Bowman")
125+
126+
response = client.get(
127+
"/api/v1.0/users/?q=David",
128+
)
129+
assert response.status_code == 200
130+
user_ids = [user["id"] for user in response.json()]
131+
assert user_ids == [str(dave.id)]
132+
133+
response = client.get(
134+
"/api/v1.0/users/?q=Bowman",
135+
)
136+
assert response.status_code == 200
137+
user_ids = [user["id"] for user in response.json()]
138+
assert user_ids == [str(dave.id)]
139+
140+
response = client.get(
141+
"/api/v1.0/users/?q=Bovin",
142+
)
143+
assert response.status_code == 200
144+
user_ids = [user["id"] for user in response.json()]
145+
assert user_ids == []
146+
147+
148+
def test_api_users_list_query_accented_full_name():
149+
"""
150+
Authenticated users should be able to list users and filter by full name with accents.
151+
Only results with a Trigram similarity greater than 0.2 with the query should be returned.
152+
"""
153+
user = factories.UserFactory()
154+
155+
client = APIClient()
156+
client.force_login(user)
157+
158+
fred = factories.UserFactory(
159+
email="[email protected]", full_name="Frédérique Lefèvre"
160+
)
161+
162+
response = client.get(
163+
"/api/v1.0/users/?q=Frédérique",
164+
)
165+
assert response.status_code == 200
166+
user_ids = [user["id"] for user in response.json()]
167+
assert user_ids == [str(fred.id)]
168+
169+
response = client.get(
170+
"/api/v1.0/users/?q=Frederique",
171+
)
172+
assert response.status_code == 200
173+
user_ids = [user["id"] for user in response.json()]
174+
assert user_ids == [str(fred.id)]
175+
176+
response = client.get(
177+
"/api/v1.0/users/?q=Lefèvre",
178+
)
179+
assert response.status_code == 200
180+
user_ids = [user["id"] for user in response.json()]
181+
assert user_ids == [str(fred.id)]
182+
183+
response = client.get(
184+
"/api/v1.0/users/?q=Lefevre",
185+
)
186+
assert response.status_code == 200
187+
user_ids = [user["id"] for user in response.json()]
188+
assert user_ids == [str(fred.id)]
189+
190+
response = client.get(
191+
"/api/v1.0/users/?q=François Lorfebvre",
192+
)
193+
assert response.status_code == 200
194+
users = [user["full_name"] for user in response.json()]
195+
assert users == []
196+
197+
79198
def test_api_users_list_limit(settings):
80199
"""
81200
Authenticated users should be able to list users and the number of results

0 commit comments

Comments
 (0)