Skip to content

Commit eac0ffb

Browse files
committed
✨(backend) enable limit for the indexed content
Use SEARCH_INDEXER_CONTENT_MAX_SIZE as limit (in bytes) for the file content. Fix default configuration of OIDC_STORE_ACCESS_TOKEN Signed-off-by: Fabre Florian <[email protected]>
1 parent 0953ce8 commit eac0ffb

File tree

7 files changed

+56
-9
lines changed

7 files changed

+56
-9
lines changed

docs/env.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ This document lists all configurable environment variables for the Drive applica
9191
| `SEARCH_INDEXER_ALLOWED_MIMETYPES` | Mimetypes (or mimetype prefixes) of the files whose content can be indexed | `None` |
9292
| `SEARCH_INDEXER_QUERY_URL` | Find application endpoint for search | `None` |
9393
| `SEARCH_INDEXER_SECRET` | Token for indexation queries | `None` |
94-
| `SEARCH_INDEXER_UPLOAD_MAX_SIZE` | Maximum size for an indexable file | `2097152` |
94+
| `SEARCH_INDEXER_CONTENT_MAX_SIZE` | Maximum size (in bytes) of a file whose content can be indexed | `2097152` |
9595
| `SEARCH_INDEXER_URL` | Find application endpoint for indexation | `None` |
9696
| `SEARCH_INDEXER_QUERY_LIMIT` | Maximum number of results expected from search endpoint | `50` |
9797
| `SENTRY_DSN` | Sentry DSN for error tracking | `None` |

docs/setup-find.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ SEARCH_INDEXER_QUERY_LIMIT=50
3232

3333
# Limit the mimetypes and size of indexable files
3434
SEARCH_INDEXER_ALLOWED_MIMETYPES=["text/"]
35-
SEARCH_INDEXER_UPLOAD_MAX_SIZE=2 * 2**20 # 2Mb
35+
SEARCH_INDEXER_CONTENT_MAX_SIZE=2097152 # 2 MiB (2 * 2**20 bytes)
3636
```
3737

3838
We also need to enable the **OIDC Token** refresh or the authentication will fail quickly.

env.d/development/common

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,7 @@ WOPI_SRC_BASE_URL=http://app-dev:8000
7979
SEARCH_INDEXER_SECRET=find-api-key-for-driv-with-exactly-50-chars-length # Key generated by create_demo in Find app.
8080
SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/"
8181
SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/"
82+
83+
# Store OIDC tokens in the session
84+
OIDC_STORE_ACCESS_TOKEN=True
85+
OIDC_STORE_REFRESH_TOKEN=True # Store the encrypted refresh token in the session.

src/backend/core/api/viewsets.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,10 @@ def search(self, request, *args, **kwargs):
10091009
- A fulltext search through the opensearch indexation app "find" if the backend is
10101010
enabled (see SEARCH_INDEXER_CLASS)
10111011
- A filtering by the model fields 'title' & 'type'.
1012+
1013+
Note: Even if the indexer is disabled, this view will do OIDC refresh calls
1014+
anyway. Think about using a decorator with args to prevent this when the
1015+
SEARCH_INDEXER_CLASS setting is not configured.
10121016
"""
10131017
queryset = self.queryset
10141018
indexer = get_file_indexer()

src/backend/core/services/search_indexers.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def __init__(self):
154154
Initialize the indexer.
155155
"""
156156
self.batch_size = settings.SEARCH_INDEXER_BATCH_SIZE
157-
self.max_upload_size = settings.SEARCH_INDEXER_UPLOAD_MAX_SIZE
157+
self.max_content_size = settings.SEARCH_INDEXER_CONTENT_MAX_SIZE
158158
self.indexer_url = settings.SEARCH_INDEXER_URL
159159
self.indexer_secret = settings.SEARCH_INDEXER_SECRET
160160
self.search_url = settings.SEARCH_INDEXER_QUERY_URL
@@ -297,15 +297,17 @@ def to_text(self, item):
297297

298298
raise SuspiciousFileOperation(f"Unrecognized mimetype {mimetype}")
299299

300-
def has_text(self, item):
300+
def can_serialize_content(self, item):
301301
"""
302302
Return True if the item is a ready file under the size limit with an indexable mimetype
303303
"""
304304
mimetype = item.mimetype or ""
305+
filesize = item.size or 0
305306

306307
return (
307308
item.upload_state == models.ItemUploadStateChoices.READY
308309
and item.type == models.ItemTypeChoices.FILE
310+
and filesize < self.max_content_size
309311
and is_allowed_mimetype(mimetype, self.allowed_mimetypes)
310312
)
311313

@@ -329,7 +331,7 @@ def serialize_item(self, item, accesses):
329331

330332
# There is no endpoint in Find API for inactive items so we index it
331333
# again with an empty content.
332-
if is_active and self.has_text(item):
334+
if is_active and self.can_serialize_content(item):
333335
content = self.to_text(item)
334336

335337
return {

src/backend/core/tests/test_services_search_indexers.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ def test_services_is_allowed_mimetype():
291291
),
292292
],
293293
)
294-
def test_services_search_has_text(indexer_settings, kwargs, expected):
294+
def test_services_search_can_serialize_content(indexer_settings, kwargs, expected):
295295
"""
296296
Only allowed mimetypes of uploaded files in ready state can have an indexable
297297
content.
@@ -313,7 +313,7 @@ def test_services_search_has_text(indexer_settings, kwargs, expected):
313313
**params,
314314
)
315315

316-
assert expected == SearchIndexer().has_text(item)
316+
assert expected == SearchIndexer().can_serialize_content(item)
317317

318318

319319
@pytest.mark.usefixtures("indexer_settings")
@@ -646,6 +646,43 @@ def test_services_search_indexers_ignore_content_if_not_ready(mock_push):
646646
}
647647

648648

649+
@patch.object(SearchIndexer, "push")
650+
@pytest.mark.usefixtures("indexer_settings")
651+
def test_services_search_indexers_ignore_content_if_too_big(
652+
mock_push, indexer_settings
653+
):
654+
"""
655+
Should not fill the content data when the file size reaches or exceeds the limit
656+
setting SEARCH_INDEXER_CONTENT_MAX_SIZE
657+
"""
658+
indexer_settings.SEARCH_INDEXER_CONTENT_MAX_SIZE = 50
659+
660+
item = factories.ItemFactory(
661+
mimetype="text/plain",
662+
type=models.ItemTypeChoices.FILE,
663+
update_upload_state=models.ItemUploadStateChoices.READY,
664+
upload_bytes="a" * 49,
665+
)
666+
667+
# too big
668+
too_big_item = factories.ItemFactory(
669+
mimetype="text/plain",
670+
type=models.ItemTypeChoices.FILE,
671+
update_upload_state=models.ItemUploadStateChoices.READY,
672+
upload_bytes="a" * 50,
673+
)
674+
675+
assert SearchIndexer().index() == 2
676+
677+
assert mock_push.call_count == 1
678+
679+
results = {item["id"]: item["content"] for item in mock_push.call_args[0][0]}
680+
assert results == {
681+
str(item.id): "a" * 49,
682+
str(too_big_item.id): "",
683+
}
684+
685+
649686
@patch.object(SearchIndexer, "push")
650687
@pytest.mark.usefixtures("indexer_settings")
651688
def test_services_search_indexers_ancestors_link_reach(mock_push):

src/backend/drive/settings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,9 @@ class Base(Configuration):
124124
SEARCH_INDEXER_QUERY_LIMIT = values.PositiveIntegerValue(
125125
default=50, environ_name="SEARCH_INDEXER_QUERY_LIMIT", environ_prefix=None
126126
)
127-
SEARCH_INDEXER_UPLOAD_MAX_SIZE = values.PositiveIntegerValue(
127+
SEARCH_INDEXER_CONTENT_MAX_SIZE = values.PositiveIntegerValue(
128128
2 * (2**20), # 2MB
129-
environ_name="SEARCH_INDEXER_UPLOAD_MAX_SIZE",
129+
environ_name="SEARCH_INDEXER_CONTENT_MAX_SIZE",
130130
environ_prefix=None,
131131
)
132132
SEARCH_INDEXER_ALLOWED_MIMETYPES = values.ListValue(

0 commit comments

Comments
 (0)