Skip to content

Commit 000da55

Browse files
committed
✨(backend) Fix trashbin & mimetype issues
In Drive the search should find files in the trashbin for a limited amount of time. So the soft-delete cannot disable a indexed entry Be more strict on mimetype patterns. Signed-off-by: Fabre Florian <[email protected]>
1 parent 8ff700b commit 000da55

File tree

3 files changed

+94
-22
lines changed

3 files changed

+94
-22
lines changed

src/backend/core/services/search_indexers.py

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,32 @@ def get_visited_items_ids_of(queryset, user):
115115
return [str(id) for id in docs.values_list("pk", flat=True)]
116116

117117

118+
def match_mimetype_glob(mimetype, pattern):
119+
"""
120+
Returns true if the mimetype match with the pattern.
121+
If a pattern ends with / all subtypes are valid, if not it have to
122+
perfectly match. e.g :
123+
- application/pdf only match "application/pdf" and not "application/pdf+bin"
124+
- application/ match any "application/*"
125+
"""
126+
if len(mimetype) < 1:
127+
return False
128+
129+
if pattern.endswith("/"):
130+
return mimetype.startswith(pattern)
131+
132+
return mimetype == pattern
133+
134+
135+
def is_allowed_mimetype(mimetype, patterns):
136+
"""
137+
Returns true if the mimetype is not empty and matches any of the allowed patterns.
138+
"""
139+
return len(mimetype) > 0 and any(
140+
match_mimetype_glob(mimetype, pattern) for pattern in patterns
141+
)
142+
143+
118144
class BaseItemIndexer(ABC):
119145
"""
120146
Base class for item indexers.
@@ -166,17 +192,17 @@ def index(self, queryset=None, batch_size=None):
166192
167193
Args:
168194
queryset (optional): Document queryset
169-
Defaults to all documents without filter.
195+
Defaults to all documents without the main workspaces.
170196
batch_size (int, optional): Number of documents per batch.
171197
Defaults to settings.SEARCH_INDEXER_BATCH_SIZE.
172198
"""
173199
last_id = None
174200
count = 0
175201
batch_size = batch_size or self.batch_size
176-
queryset = queryset or models.Item.objects.all()
177-
queryset = queryset.filter(
202+
queryset = queryset or models.Item.objects.filter(
178203
main_workspace=False,
179-
).order_by("id")
204+
)
205+
queryset = queryset.order_by("id")
180206

181207
while True:
182208
if last_id is not None:
@@ -280,8 +306,7 @@ def has_text(self, item):
280306
return (
281307
item.upload_state == models.ItemUploadStateChoices.READY
282308
and item.type == models.ItemTypeChoices.FILE
283-
and len(mimetype) > 0
284-
and any(mimetype.startswith(allowed) for allowed in self.allowed_mimetypes)
309+
and is_allowed_mimetype(mimetype, self.allowed_mimetypes)
285310
)
286311

287312
def serialize_item(self, item, accesses):
@@ -296,12 +321,15 @@ def serialize_item(self, item, accesses):
296321
dict: A JSON-serializable dictionary.
297322
"""
298323
doc_path = str(item.path)
299-
is_deleted = bool(item.ancestors_deleted_at or item.deleted_at)
300324
content = ""
301325

302-
# There is no endpoint in Find API for deleted items so we index it
326+
# The deleted items are still accessible in Drive (not in Docs !)
327+
# See in V2 for handling hard deleted ones
328+
is_active = True
329+
330+
# There is no endpoint in Find API for inactive items so we index it
303331
# again with an empty content.
304-
if not is_deleted and self.has_text(item):
332+
if is_active and self.has_text(item):
305333
content = self.to_text(item)
306334

307335
return {
@@ -319,7 +347,7 @@ def serialize_item(self, item, accesses):
319347
"groups": list(accesses.get(doc_path, {}).get("teams", set())),
320348
"reach": str(item.link_reach),
321349
"size": item.size or 0,
322-
"is_active": not is_deleted,
350+
"is_active": is_active,
323351
}
324352

325353
def search_query(self, data, token) -> requests.Response:

src/backend/core/tasks/search.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,10 @@ def batch_file_indexer_task(timestamp):
5555
indexer = get_file_indexer()
5656

5757
if indexer:
58-
queryset = models.Item.objects.filter(updated_at__gte=timestamp)
58+
queryset = models.Item.objects.filter(
59+
updated_at__gte=timestamp,
60+
main_workspace=False,
61+
)
5962

6063
count = indexer.index(queryset)
6164
logger.info("Indexed %d files", count)

src/backend/core/tests/test_services_search_indexers.py

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
get_ancestor_to_descendants_map,
2121
get_file_indexer,
2222
get_visited_items_ids_of,
23+
is_allowed_mimetype,
2324
)
2425

2526
pytestmark = pytest.mark.django_db
@@ -251,15 +252,43 @@ def test_services_search_allowed_mimetypes_is_invalid(
251252
assert expected in str(exc_info.value)
252253

253254

255+
def test_services_is_allowed_mimetype():
256+
"""
257+
Check if the given mimetype matches with the allowed ones.
258+
"""
259+
assert is_allowed_mimetype("", ["text/"]) is False
260+
assert is_allowed_mimetype("text/plain", []) is False
261+
assert is_allowed_mimetype("text/plain", ["text/html"]) is False
262+
assert is_allowed_mimetype("text/plain", ["text/"]) is True
263+
assert is_allowed_mimetype("application/pdf", ["text/", "application/pdf"]) is True
264+
assert (
265+
is_allowed_mimetype("application/pdf+bin", ["text/", "application/pdf"])
266+
is False
267+
)
268+
assert is_allowed_mimetype("application/pdf+bin", ["text/", "application/"]) is True
269+
270+
254271
@pytest.mark.parametrize(
255272
"kwargs, expected",
256273
[
257274
({"mimetype": "text/plain"}, True),
258-
({"mimetype": "text/html"}, True),
259-
({"mimetype": "application/html"}, False),
260275
({"mimetype": "application/pdf"}, True),
261-
({"mimetype": "application/"}, False),
262-
({"mimetype": ""}, False),
276+
({"mimetype": "application/pdf+bin"}, False),
277+
(
278+
{
279+
"mimetype": "text/plain",
280+
"type": models.ItemTypeChoices.FOLDER,
281+
"upload_bytes": None,
282+
},
283+
False,
284+
),
285+
(
286+
{
287+
"mimetype": "text/plain",
288+
"update_upload_state": models.ItemUploadStateChoices.ANALYZING,
289+
},
290+
False,
291+
),
263292
],
264293
)
265294
def test_services_search_has_text(indexer_settings, kwargs, expected):
@@ -272,11 +301,16 @@ def test_services_search_has_text(indexer_settings, kwargs, expected):
272301
"application/pdf",
273302
]
274303

304+
params = {
305+
"upload_bytes": b"This is a text file content",
306+
"type": models.ItemTypeChoices.FILE,
307+
"update_upload_state": models.ItemUploadStateChoices.READY,
308+
}
309+
310+
params.update(kwargs)
311+
275312
item = factories.ItemFactory(
276-
upload_bytes=b"This is a text file content",
277-
type=models.ItemTypeChoices.FILE,
278-
update_upload_state=models.ItemUploadStateChoices.READY,
279-
**kwargs,
313+
**params,
280314
)
281315

282316
assert expected == SearchIndexer().has_text(item)
@@ -422,13 +456,18 @@ def test_services_search_indexers_serialize_item_textfile():
422456

423457

424458
@pytest.mark.usefixtures("indexer_settings")
425-
def test_services_search_indexers_serialize_document_deleted():
426-
"""Deleted documents are marked as just in the serialized json."""
459+
def test_services_search_indexers_serialize_document_soft_deleted():
460+
"""
461+
Deleted documents are NOT marked as inactive in the serialized json because
462+
they can be accessed through the trashbin for several days.
463+
"""
427464
folder = factories.ItemFactory(type=models.ItemTypeChoices.FOLDER)
428465
item = factories.ItemFactory(
429466
parent=folder,
430467
mimetype="text/plain",
431468
type=models.ItemTypeChoices.FILE,
469+
upload_bytes=b"This is a text file content",
470+
update_upload_state=models.ItemUploadStateChoices.READY,
432471
)
433472

434473
folder.soft_delete()
@@ -437,7 +476,9 @@ def test_services_search_indexers_serialize_document_deleted():
437476
indexer = SearchIndexer()
438477
result = indexer.serialize_item(item, {})
439478

440-
assert result["is_active"] is False
479+
# Still accessible through the thrashbin
480+
assert result["is_active"] is True
481+
assert result["content"] == "This is a text file content"
441482

442483

443484
@responses.activate

0 commit comments

Comments
 (0)