Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ and this project adheres to

- ✅(e2e) fix e2e test for other browsers #1799

### Changed

- 🚸(backend) sort user search results by proximity with the active user #1802
Copy link
Member

@sampaccoud sampaccoud Jan 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe create an issue now in django-lasuite so we don't forget to upstream it as it will have to be shared with all our apps once it has proven its efficiency?


## [4.4.0] - 2026-01-13

### Added
Expand Down
70 changes: 66 additions & 4 deletions src/backend/core/api/viewsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,12 @@
get_visited_document_ids_of,
)
from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants
from core.utils import (
extract_attachments,
extract_email_domain_parts,
filter_descendants,
users_sharing_documents_with,
)

from . import permissions, serializers, utils
from .filters import DocumentFilter, ListDocumentFilter, UserSearchFilter
Expand Down Expand Up @@ -218,18 +223,75 @@ def get_queryset(self):

# Use trigram similarity for non-email-like queries
# For performance reasons we filter first by similarity, which relies on an
# index, then only calculate precise similarity scores for sorting purposes
# index, then only calculate precise similarity scores for sorting purposes.
#
# Additionally results are reordered to prefer users "closer" to the current
# user: users they recently shared documents with, same full domain, same
# partial domain (e.g. both end with "gouv.fr"). To achieve that without
# complex SQL, we build a proximity score in Python and return the
# top N results.
current_user = self.request.user
shared_map = users_sharing_documents_with(current_user)

user_full_domain, user_partial_domain = extract_email_domain_parts(
current_user.email or ""
)

return (
candidates = list(
queryset.annotate(
sim_email=TrigramSimilarity("email", query),
sim_name=TrigramSimilarity("full_name", query),
)
.annotate(similarity=Greatest("sim_email", "sim_name"))
.filter(similarity__gt=0.2)
.order_by("-similarity")[: settings.API_USERS_LIST_LIMIT]
.order_by("-similarity")
)

# Build ordering key for each candidate
def _sort_key(u):
    """
    Build the proximity ordering key of a candidate user.

    The key is a tuple compared element by element, in this order:
    whether the candidate shares documents with the current user, the
    timestamp (whole seconds) of the most recent share, same full email
    domain, same partial email domain and, last, the trigram similarity
    annotated on the candidate. It is meant to be sorted in descending order.
    """
    # Shared documents come first, the most recent share winning ties.
    last_shared = shared_map.get(u.id)
    shared_rank = (1, int(last_shared.timestamp())) if last_shared else (0, 0)

    # Domain proximity: an empty domain (invalid email) never matches.
    full_domain, partial_domain = extract_email_domain_parts(u.email or "")
    full_match = int(bool(full_domain) and full_domain == user_full_domain)
    partial_match = int(
        bool(partial_domain) and partial_domain == user_partial_domain
    )

    # Text similarity is only the last tie-breaker.
    similarity = getattr(u, "similarity", 0) or 0

    return (*shared_rank, full_match, partial_match, similarity)

candidates.sort(key=_sort_key, reverse=True)

return candidates[: settings.API_USERS_LIST_LIMIT]

@drf.decorators.action(
detail=False,
methods=["get"],
Expand Down
79 changes: 78 additions & 1 deletion src/backend/core/tests/test_api_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Test users API endpoints in the impress core app.
"""

from django.utils import timezone

import pytest
from rest_framework.test import APIClient

Expand Down Expand Up @@ -201,10 +203,85 @@ def test_api_users_list_query_accented_full_name():
assert users == []


def test_api_users_list_sorted_by_closest_match():
    """
    Authenticated users should be able to list users and the results should be
    sorted by proximity with the logged-in user.

    Sorting criteria are, by decreasing priority:
    - Shared documents with the user (most recent first)
    - Same full email domain (e.g. anct.gouv.fr)
    - Same partial email domain (e.g. gouv.fr)

    Case in point: the logged-in user (martin.bernard@anct.gouv.fr) has recently
    shared documents with pierre.dupont@beta.gouv.fr and less recently with
    pierre.durand@impots.gouv.fr.

    Other users named Pierre also exist:
    - pierre.thomas@example.com
    - pierre.petit@anct.gouv.fr
    - pierre.robert@culture.gouv.fr

    The search results should be ordered as follows:

    # Shared with first
    - pierre.dupont@beta.gouv.fr  # Most recent first
    - pierre.durand@impots.gouv.fr
    # Same full domain second
    - pierre.petit@anct.gouv.fr
    # Same partial domain third
    - pierre.robert@culture.gouv.fr
    # Others last
    - pierre.thomas@example.com
    """

    user = factories.UserFactory(
        email="martin.bernard@anct.gouv.fr", full_name="Martin Bernard"
    )

    client = APIClient()
    client.force_login(user)

    pierre_1 = factories.UserFactory(email="pierre.dupont@beta.gouv.fr")
    pierre_2 = factories.UserFactory(email="pierre.durand@impots.gouv.fr")
    pierre_3 = factories.UserFactory(email="pierre.thomas@example.com")
    pierre_4 = factories.UserFactory(email="pierre.petit@anct.gouv.fr")
    pierre_5 = factories.UserFactory(email="pierre.robert@culture.gouv.fr")

    document_1 = factories.DocumentFactory(creator=user)
    document_2 = factories.DocumentFactory(creator=user)
    factories.UserDocumentAccessFactory(user=user, document=document_1)
    factories.UserDocumentAccessFactory(user=user, document=document_2)

    now = timezone.now()
    last_week = now - timezone.timedelta(days=7)
    last_month = now - timezone.timedelta(days=30)

    # The factory cannot set the created_at directly, so we force it after creation
    p1_d1 = factories.UserDocumentAccessFactory(user=pierre_1, document=document_1)
    p1_d1.created_at = last_week
    p1_d1.save()

    p2_d2 = factories.UserDocumentAccessFactory(user=pierre_2, document=document_2)
    p2_d2.created_at = last_month
    p2_d2.save()

    response = client.get("/api/v1.0/users/?q=Pierre")
    assert response.status_code == 200

    # Compare on emails: they are what makes the expected order readable.
    returned_emails = [result["email"] for result in response.json()]

    assert returned_emails == [
        str(pierre_1.email),  # shared last week
        str(pierre_2.email),  # shared last month
        str(pierre_4.email),  # same full domain
        str(pierre_5.email),  # same partial domain
        str(pierre_3.email),  # no proximity
    ]


def test_api_users_list_limit(settings):
"""
Authenticated users should be able to list users and the number of results
should be limited to 10.
should be limited to API_USERS_LIST_LIMIT (by default 5).
"""
user = factories.UserFactory()

Expand Down
16 changes: 16 additions & 0 deletions src/backend/core/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,19 @@ def test_utils_get_ancestor_to_descendants_map_multiple_paths():
"000100020005": {"000100020005"},
"00010003": {"00010003"},
}


def test_utils_extract_email_domain_parts_when_email_is_valid():
    """A valid email yields its full domain and its last two domain segments."""
    full, partial = utils.extract_email_domain_parts(
        "firstname.lastname@numerique.gouv.fr"
    )

    assert (full, partial) == ("numerique.gouv.fr", "gouv.fr")


def test_utils_extract_email_domain_parts_when_email_is_empty():
    """An empty email yields two empty strings."""
    full, partial = utils.extract_email_domain_parts("")

    assert (full, partial) == ("", "")
64 changes: 64 additions & 0 deletions src/backend/core/tests/test_utils_users_sharing_documents_with.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
Unit tests for the users_sharing_documents_with utility function.
"""

from django.utils import timezone

import pytest

from core import factories, utils

pytestmark = pytest.mark.django_db


def test_utils_users_sharing_documents_with():
    """
    users_sharing_documents_with should map the id of each user having access to
    one of the given user's documents to the date of their most recent access,
    the given user being excluded from the map.
    """

    user = factories.UserFactory(
        email="martin.bernard@anct.gouv.fr", full_name="Martin Bernard"
    )

    pierre_1 = factories.UserFactory(
        email="pierre.dupont@beta.gouv.fr", full_name="Pierre Dupont"
    )
    pierre_2 = factories.UserFactory(
        email="pierre.durand@impots.gouv.fr", full_name="Pierre Durand"
    )

    now = timezone.now()
    yesterday = now - timezone.timedelta(days=1)
    last_week = now - timezone.timedelta(days=7)
    last_month = now - timezone.timedelta(days=30)

    document_1 = factories.DocumentFactory(creator=user)
    document_2 = factories.DocumentFactory(creator=user)
    document_3 = factories.DocumentFactory(creator=user)

    for document in (document_1, document_2, document_3):
        factories.UserDocumentAccessFactory(user=user, document=document)

    # The factory cannot set the created_at directly, so we force it after
    # creation, the same way for every access.
    shares = (
        (pierre_1, document_1, last_week),
        (pierre_2, document_2, last_month),
        # pierre_2 has two accesses: only the most recent one should be kept
        (pierre_2, document_3, yesterday),
    )
    for shared_with, document, shared_at in shares:
        access = factories.UserDocumentAccessFactory(
            user=shared_with, document=document
        )
        access.created_at = shared_at
        access.save()

    shared_map = utils.users_sharing_documents_with(user)

    assert shared_map == {
        pierre_1.id: last_week,
        pierre_2.id: yesterday,
    }
42 changes: 41 additions & 1 deletion src/backend/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@
import re
from collections import defaultdict

from django.core.exceptions import ValidationError
from django.core.validators import validate_email
from django.db import models as db

import pycrdt
from bs4 import BeautifulSoup

from core import enums
from core import enums, models


def get_ancestor_to_descendants_map(paths, steplen):
Expand Down Expand Up @@ -96,3 +100,39 @@ def extract_attachments(content):

xml_content = base64_yjs_to_xml(content)
return re.findall(enums.MEDIA_STORAGE_URL_EXTRACT, xml_content)


def users_sharing_documents_with(user):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit afraid of this because, on each search query, you will load all user accesses. I don't have a better proposal, and maybe we should leave it like that, but we should at least add a "timer" log to be able to see whether this takes too much time. Maybe a cache could also help?

"""
Returns a dict mapping the id of each other user having access to one of the
given user's documents to the most recent creation date of those accesses.
"""

user_docs_qs = models.DocumentAccess.objects.filter(user=user).values_list(
"document_id", flat=True
)
shared_qs = (
models.DocumentAccess.objects.filter(document_id__in=user_docs_qs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would using a Subquery be better here? (I mean, to prevent data from being passed to Django for nothing)

.exclude(user=user)
.values("user")
.annotate(last_shared=db.Max("created_at"))
)
return {item["user"]: item["last_shared"] for item in shared_qs}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return {item["user"]: item["last_shared"] for item in shared_qs}
return {item["user"]: item["last_shared"] for item in shared_qs.iterator()}

I would suggest using an iterator here.



def extract_email_domain_parts(email):
"""
Extracts the full domain and partial domain from an email address as a tuple.
The partial domain consists of the last two segments of the domain, eg. "gouv.fr".

If the email is invalid (eg, is an empty string), returns empty strings.
"""
try:
validate_email(email)
except ValidationError:
return "", ""

domain = email.split("@", 1)[1].lower()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Such a split is not "safe": the safe way is to use email.headerregistry.Address, but for performance's sake I guess we should keep it like this. Maybe adding a comment and renaming the function to unsafe_extract_email_domain_parts, to say you did it this way on purpose, could help, and might prevent the use of this method in another context :)

Note: django-lasuite already provides a get_domain_from_email

parts = domain.split(".")
partial_domain = ".".join(parts[-2:]) if len(parts) >= 2 else domain
return domain, partial_domain
Loading