Skip to content

Commit ae9468b

Browse files
lcianclaude
andcommitted
ref(difs): Clean up Objectstore write path after review
Simplify parallel multipart upload: use offset-based reads for correct part ordering, return just storage_path instead of a redundant tuple, and inline the retry helper. Restructure ProGuard clone to handle file vs storage_path explicitly. Simplify maybe_renew_debug_files TTI bump to warn-and-continue instead of skipping renewal. Remove tests superseded by integration coverage. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b207ce9 commit ae9468b

6 files changed

Lines changed: 50 additions & 388 deletions

File tree

src/sentry/api/endpoints/debug_files.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -836,17 +836,18 @@ def _clone_proguard_debug_file_for_reupload(
836836
meta = build_proguard_reupload_dif_meta(debug_file, requested_debug_id)
837837
if debug_file.file is not None:
838838
dif, created = create_dif_from_id(project, meta, file=debug_file.file)
839-
else:
840-
# Objectstore streams are not seekable, but create_dif_from_id needs to
841-
# read the file twice (checksum + upload). Buffer into a temp file.
839+
if debug_file.storage_path is not None:
842840
source_fileobj = debug_file.getfile()
843841
try:
842+
# Spool into a temporary file to get a seekable stream.
844843
with tempfile.SpooledTemporaryFile(max_size=5 * 1024 * 1024) as tmp:
845844
shutil.copyfileobj(source_fileobj, tmp)
846845
tmp.seek(0)
847846
dif, created = create_dif_from_id(project, meta, fileobj=tmp)
848847
finally:
849848
source_fileobj.close()
849+
else:
850+
raise ValueError("ProjectDebugFile has neither file nor storage_path")
850851

851852
if created:
852853
record_last_upload(project)

src/sentry/debug_files/debug_files.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,30 +23,31 @@ def maybe_renew_debug_files(debug_files: Sequence[ProjectDebugFile]) -> None:
2323
days=options.get("system.debug-files-renewal-age-threshold-days")
2424
)
2525

26+
# We first check if any file needs renewal, before going to the database.
27+
needs_bump = any(dif.date_accessed <= threshold_date for dif in debug_files)
28+
if not needs_bump:
29+
return
30+
2631
ids_to_renew = []
2732
for dif in debug_files:
2833
if dif.date_accessed > threshold_date:
2934
continue
35+
ids_to_renew.append(dif.id)
3036

31-
# For Objectstore-backed files, issue a HEAD request to bump the TTI.
32-
if dif.storage_path is not None:
37+
# For Objectstore-backed files, issue a HEAD request to bump TTI.
38+
if dif.storage_path is not None and dif.date_accessed <= threshold_date:
3339
try:
3440
dif._get_objectstore_session().head(dif.storage_path)
3541
except Exception:
36-
logger.exception(
37-
"debugfile.objectstore_tti_renewal_failed",
42+
logger.warning(
43+
"Failed to bump TTI for debug file",
3844
extra={
39-
"project_debug_file_id": dif.id,
45+
"dif_id": dif.id,
4046
"project_id": dif.project_id,
4147
"storage_path": dif.storage_path,
4248
},
49+
exc_info=True,
4350
)
44-
continue
45-
46-
ids_to_renew.append(dif.id)
47-
48-
if not ids_to_renew:
49-
return
5051

5152
# Update `date_accessed` in the db.
5253
with metrics.timer("debug_files_renewal"):

src/sentry/demo_mode/tasks.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -270,8 +270,6 @@ def _sync_project_debug_file(
270270
date_accessed=source_project_debug_file.date_accessed,
271271
)
272272
except IntegrityError as e:
273-
if target_project is not None and target_storage_path is not None:
274-
get_debug_files_session(target_org.id, target_project.id).delete(target_storage_path)
275273
sentry_sdk.capture_exception(e)
276274
return None
277275

src/sentry/models/debugfile.py

Lines changed: 31 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
from sentry.models.files.utils import MAX_FILE_SIZE, clear_cached_files
4545
from sentry.objectstore import get_debug_files_session
4646
from sentry.utils import json, metrics
47+
from sentry.utils.concurrent import ContextPropagatingThreadPoolExecutor
4748
from sentry.utils.zip import safe_extract_zip
4849

4950
if TYPE_CHECKING:
@@ -54,12 +55,11 @@
5455
logger = logging.getLogger(__name__)
5556

5657
DIF_MIMETYPES = {v: k for k, v in KNOWN_DIF_FORMATS.items()}
57-
OBJECTSTORE_MULTIPART_PART_SIZE = 32 * 1024 * 1024
58-
OBJECTSTORE_MULTIPART_MAX_RETRIES = 3
59-
OBJECTSTORE_MULTIPART_MAX_WORKERS = 4
6058

6159
_proguard_file_re = re.compile(r"/proguard/(?:mapping-)?(.*?)\.txt$")
6260

61+
OBJECTSTORE_MULTIPART_PART_SIZE = 32 * 1024 * 1024 # 32 MiB
62+
6363

6464
class BadDif(Exception):
6565
pass
@@ -168,7 +168,6 @@ class Meta:
168168
__repr__ = sane_repr("object_name", "cpu_name", "debug_id")
169169

170170
def get_checksum(self) -> str:
171-
"""Returns the SHA-1 checksum, reading from the denormalized column or the linked File."""
172171
if self.storage_path is not None:
173172
assert self.checksum is not None
174173
return self.checksum
@@ -178,7 +177,6 @@ def get_checksum(self) -> str:
178177
raise ValueError("ProjectDebugFile has neither file nor storage_path")
179178

180179
def get_content_type(self) -> str:
181-
"""Returns the MIME content type, reading from the denormalized column or the linked File."""
182180
if self.storage_path is not None:
183181
assert self.content_type is not None
184182
return str(self.content_type)
@@ -187,7 +185,6 @@ def get_content_type(self) -> str:
187185
raise ValueError("ProjectDebugFile has neither file nor storage_path")
188186

189187
def get_file_size(self) -> int:
190-
"""Returns the file size in bytes, reading from the denormalized column or the linked File."""
191188
if self.storage_path is not None:
192189
assert self.file_size is not None
193190
return int(self.file_size)
@@ -196,7 +193,6 @@ def get_file_size(self) -> int:
196193
raise ValueError("ProjectDebugFile has neither file nor storage_path")
197194

198195
def get_date_created(self) -> datetime:
199-
"""Returns the creation timestamp, reading from the denormalized column or the linked File."""
200196
if self.storage_path is not None:
201197
assert self.date_created is not None
202198
return self.date_created
@@ -205,7 +201,6 @@ def get_date_created(self) -> datetime:
205201
raise ValueError("ProjectDebugFile has neither file nor storage_path")
206202

207203
def get_headers(self) -> dict[str, str]:
208-
"""Returns file headers (currently just Content-Type), from the denormalized column or the linked File."""
209204
if self.storage_path is not None:
210205
assert self.content_type is not None
211206
return {"Content-Type": self.content_type}
@@ -264,14 +259,12 @@ def features(self) -> frozenset[str]:
264259
return frozenset((self.data or {}).get("features", []))
265260

266261
def _get_objectstore_session(self) -> Session:
267-
"""Returns an Objectstore session scoped to this debug file's project."""
268262
from sentry.models.project import Project
269263

270264
org_id = Project.objects.get_from_cache(id=self.project_id).organization_id
271265
return get_debug_files_session(org=org_id, project=self.project_id)
272266

273267
def getfile(self) -> IO[bytes]:
274-
"""Returns a file-like object with the debug file contents, from Objectstore or the linked File."""
275268
if self.storage_path is not None:
276269
try:
277270
response = self._get_objectstore_session().get(self.storage_path)
@@ -284,7 +277,6 @@ def getfile(self) -> IO[bytes]:
284277
raise ValueError("ProjectDebugFile has neither file nor storage_path")
285278

286279
def save_to(self, path: str) -> None:
287-
"""Downloads the debug file contents to a local path, atomically via a temp file."""
288280
if self.storage_path is not None:
289281
path = os.path.abspath(path)
290282
base = os.path.dirname(path)
@@ -404,57 +396,48 @@ def create_dif_from_file(
404396
return create_dif_from_id(project, result[0], file=file)
405397

406398

407-
def _put_objectstore_part_with_retry(
408-
upload: MultipartUpload, chunk: bytes, part_number: int
409-
) -> CompletePart:
410-
"""Uploads a single multipart part, retrying with exponential backoff on transient errors."""
411-
for attempt in range(OBJECTSTORE_MULTIPART_MAX_RETRIES):
412-
try:
413-
return upload.put_part(chunk, part_number=part_number, content_length=len(chunk))
414-
except (RequestError, HTTPError):
415-
if attempt == OBJECTSTORE_MULTIPART_MAX_RETRIES - 1:
416-
raise
417-
time.sleep(2**attempt)
418-
419-
raise AssertionError("unreachable")
420-
421-
422399
def _upload_dif_to_objectstore(
423400
session: Session,
424401
fileobj: IO[bytes],
425402
content_type: str,
426403
file_size: int,
427-
) -> tuple[str, int]:
428-
"""Uploads a debug file to Objectstore via parallel multipart upload, aborting on failure.
429-
430-
Parts are uploaded concurrently with up to OBJECTSTORE_MULTIPART_MAX_WORKERS threads.
431-
Each worker reads its chunk from fileobj under a lock, so memory usage is bounded to
432-
roughly max_workers * OBJECTSTORE_MULTIPART_PART_SIZE.
433-
434-
Returns (storage_path, uploaded_size).
435-
"""
436-
from sentry.utils.concurrent import ContextPropagatingThreadPoolExecutor
437-
404+
) -> str:
405+
"""Uploads a debug file to Objectstore via parallel multipart upload, returning the key under which the file was uploaded."""
438406
upload = session.initiate_multipart_upload(content_type=content_type)
439-
read_lock = threading.Lock()
407+
408+
lock = threading.Lock()
440409
num_parts = max(1, math.ceil(file_size / OBJECTSTORE_MULTIPART_PART_SIZE))
441410

442-
def _read_and_upload_part(part_number: int) -> CompletePart | None:
443-
with read_lock:
411+
def put_part_with_retry(
412+
upload: MultipartUpload, chunk: bytes, part_number: int
413+
) -> CompletePart:
414+
for attempt in range(3):
415+
try:
416+
return upload.put_part(chunk, part_number=part_number, content_length=len(chunk))
417+
except (RequestError, HTTPError):
418+
if attempt == 2:
419+
raise
420+
time.sleep(2**attempt)
421+
raise AssertionError("unreachable")
422+
423+
def read_and_put_part(part_number: int) -> CompletePart | None:
424+
offset = (part_number - 1) * OBJECTSTORE_MULTIPART_PART_SIZE
425+
with lock:
426+
fileobj.seek(offset)
444427
chunk = fileobj.read(OBJECTSTORE_MULTIPART_PART_SIZE)
445428
if not chunk:
446429
return None
447-
return _put_objectstore_part_with_retry(upload, chunk, part_number)
430+
return put_part_with_retry(upload, chunk, part_number)
448431

449432
try:
450433
with ContextPropagatingThreadPoolExecutor(
451-
max_workers=OBJECTSTORE_MULTIPART_MAX_WORKERS
434+
max_workers=4,
452435
) as executor:
453-
futures = [executor.submit(_read_and_upload_part, i + 1) for i in range(num_parts)]
436+
futures = [executor.submit(read_and_put_part, i + 1) for i in range(num_parts)]
454437
parts = [part for f in futures if (part := f.result()) is not None]
455438

456439
storage_path = upload.complete(parts)
457-
return storage_path, file_size
440+
return storage_path
458441
except Exception:
459442
try:
460443
upload.abort()
@@ -554,19 +537,17 @@ def create_dif_from_id(
554537
session = get_debug_files_session(project.organization_id, project.id)
555538
if file is not None:
556539
with file.getfile() as source_fileobj:
557-
storage_path, uploaded_file_size = _upload_dif_to_objectstore(
540+
storage_path = _upload_dif_to_objectstore(
558541
session, source_fileobj, content_type, file_size
559542
)
560543
elif fileobj is not None:
561-
storage_path, uploaded_file_size = _upload_dif_to_objectstore(
562-
session, fileobj, content_type, file_size
563-
)
544+
storage_path = _upload_dif_to_objectstore(session, fileobj, content_type, file_size)
564545
else:
565546
raise RuntimeError("missing file object")
566547

567548
metrics.distribution(
568549
"storage.put.size",
569-
uploaded_file_size,
550+
file_size,
570551
tags={"usecase": "debug-files", "compression": "none"},
571552
unit="byte",
572553
)
@@ -575,7 +556,7 @@ def create_dif_from_id(
575556
file=None,
576557
storage_path=storage_path,
577558
content_type=content_type,
578-
file_size=uploaded_file_size,
559+
file_size=file_size,
579560
date_created=timezone.now(),
580561
checksum=checksum,
581562
debug_id=meta.debug_id,

tests/sentry/api/endpoints/test_debug_files.py

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
import zipfile
22
from io import BytesIO
3-
from unittest.mock import patch
43
from uuid import uuid4
54

65
from django.core.files.uploadedfile import SimpleUploadedFile
76
from django.urls import reverse
8-
from django.utils import timezone
97

108
from sentry.models.debugfile import ProjectDebugFile
119
from sentry.models.files.file import File
@@ -157,36 +155,6 @@ def test_access_control(self) -> None:
157155
response = self.client.get(f"{url}?id={download_id}")
158156
assert response.status_code == 404
159157

160-
def test_download_closes_streaming_file(self) -> None:
161-
class CloseTrackingBytesIO(BytesIO):
162-
was_closed = False
163-
164-
def close(self) -> None:
165-
self.was_closed = True
166-
super().close()
167-
168-
debug_file = ProjectDebugFile.objects.create(
169-
project_id=self.project.id,
170-
file=None,
171-
storage_path="debug-files/key",
172-
content_type="text/x-proguard+plain",
173-
file_size=len(PROGUARD_SOURCE),
174-
date_created=timezone.now(),
175-
checksum="e6d3c5185dac63eddfdc1a5edfffa32d46103b44",
176-
object_name="proguard-mapping",
177-
cpu_name="any",
178-
debug_id=PROGUARD_UUID,
179-
data={"features": ["mapping"]},
180-
)
181-
payload = CloseTrackingBytesIO(PROGUARD_SOURCE)
182-
183-
with patch.object(ProjectDebugFile, "getfile", return_value=payload):
184-
response = self.client.get(f"{self.url}?id={debug_file.id}")
185-
assert response.status_code == 200, response.content
186-
assert close_streaming_response(response) == PROGUARD_SOURCE
187-
188-
assert payload.was_closed
189-
190158
def test_dsyms_requests(self) -> None:
191159
response = self._upload_proguard(self.url, PROGUARD_UUID)
192160
assert response.status_code == 201, response.content

0 commit comments

Comments
 (0)