Skip to content

Commit eddb717

Browse files
committed
Implement memray build failure detector
1 parent 9baf533 commit eddb717

File tree

15 files changed

+1132
-75
lines changed

15 files changed

+1132
-75
lines changed

backend/app/models.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,3 +198,30 @@ class AdminSession(Base):
198198
Index("idx_admin_sessions_active", "is_active"),
199199
Index("idx_admin_sessions_expires", "expires_at"),
200200
)
201+
202+
203+
class MemrayBuildFailure(Base):
    """Memray build failure recorded for a binary/environment pair.

    The unique constraint on ``(binary_id, environment_id)`` permits at most
    one row per pair.
    """

    __tablename__ = "memray_build_failures"

    # Surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)

    # Foreign keys to the failing commit and to the binary/environment pair.
    commit_sha = Column(String(40), ForeignKey("commits.sha"), nullable=False)
    binary_id = Column(String(50), ForeignKey("binaries.id"), nullable=False)
    environment_id = Column(
        String(50),
        ForeignKey("environments.id"),
        nullable=False,
    )

    error_message = Column(Text, nullable=False)

    # Defaults to the current UTC time with the tzinfo stripped (naive UTC).
    failure_timestamp = Column(
        DateTime,
        nullable=False,
        default=lambda: datetime.now(UTC).replace(tzinfo=None),
    )
    # Timestamp of the failing commit.
    commit_timestamp = Column(DateTime, nullable=False)

    commit = relationship("Commit")
    binary = relationship("Binary")
    environment = relationship("Environment")

    __table_args__ = (
        UniqueConstraint(
            "binary_id",
            "environment_id",
            name="unique_binary_env_failure",
        ),
        Index("idx_memray_failures_timestamp", "failure_timestamp"),
        Index("idx_memray_failures_commit_timestamp", "commit_timestamp"),
        # NOTE(review): covers the same columns as the unique constraint
        # above; possibly redundant depending on the database backend --
        # confirm before removing.
        Index("idx_memray_failures_binary_env", "binary_id", "environment_id"),
    )

backend/app/routers/admin.py

Lines changed: 103 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,17 @@
2222
Binary,
2323
Environment,
2424
Run,
25-
AdminUser,
2625
AuthToken,
2726
BenchmarkResult,
27+
MemrayBuildFailure,
2828
Commit,
2929
)
3030
from ..schemas import (
3131
BinaryCreate,
3232
Binary as BinarySchema,
3333
EnvironmentCreate,
3434
Environment as EnvironmentSchema,
35+
MemrayFailurePublic,
3536
)
3637
from .. import crud
3738
from pydantic import BaseModel
@@ -124,6 +125,7 @@ class BenchmarkResultResponse(BaseModel):
124125
has_flamegraph: bool
125126

126127

128+
127129
router = APIRouter(prefix="/api/admin", tags=["admin"])
128130

129131

@@ -620,7 +622,7 @@ async def get_token_analytics(
620622
total_tokens = total_result.scalar()
621623

622624
active_result = await db.execute(
623-
select(func.count(AuthToken.id)).where(AuthToken.is_active == True)
625+
select(func.count(AuthToken.id)).where(AuthToken.is_active)
624626
)
625627
active_tokens = active_result.scalar()
626628

@@ -1217,3 +1219,102 @@ async def get_table_schema(
12171219
status_code=status.HTTP_404_NOT_FOUND,
12181220
detail=f"Table not found or error accessing schema: {str(e)}",
12191221
)
1222+
1223+
1224+
1225+
1226+
@router.get("/memray-failures", response_model=List[MemrayFailurePublic])
async def get_memray_failures(
    current_user: AdminSession = Depends(require_admin_auth),
    db: AsyncSession = Depends(get_database),
):
    """Return every memray build failure, most recently recorded first.

    Each entry carries the failure's own columns plus the names of the
    joined binary and environment.
    """
    # Keys of each response item; they match the column names/labels
    # selected below.
    exposed_fields = (
        "id",
        "commit_sha",
        "binary_id",
        "environment_id",
        "binary_name",
        "environment_name",
        "error_message",
        "failure_timestamp",
        "commit_timestamp",
    )

    stmt = (
        select(
            MemrayBuildFailure.id,
            MemrayBuildFailure.commit_sha,
            MemrayBuildFailure.binary_id,
            MemrayBuildFailure.environment_id,
            Binary.name.label("binary_name"),
            Environment.name.label("environment_name"),
            MemrayBuildFailure.error_message,
            MemrayBuildFailure.failure_timestamp,
            MemrayBuildFailure.commit_timestamp,
        )
        .join(Binary)
        .join(Environment)
        .order_by(desc(MemrayBuildFailure.failure_timestamp))
    )
    query_result = await db.execute(stmt)
    rows = query_result.fetchall()

    return [
        {field: getattr(row, field) for field in exposed_fields}
        for row in rows
    ]
1264+
1265+
1266+
@router.delete("/memray-failures/{failure_id}")
async def delete_memray_failure(
    failure_id: int,
    current_user: AdminSession = Depends(require_admin_auth),
    db: AsyncSession = Depends(get_database),
):
    """Remove one memray build failure record by its id.

    Raises:
        HTTPException: 404 when no record with ``failure_id`` exists.
    """
    lookup = select(MemrayBuildFailure).where(
        MemrayBuildFailure.id == failure_id
    )
    record = (await db.execute(lookup)).scalars().first()

    # Guard clause: nothing to delete.
    if record is None:
        raise HTTPException(status_code=404, detail="Memray failure not found")

    await db.delete(record)
    await db.commit()

    return {"message": "Memray failure deleted successfully"}
1285+
1286+
1287+
@router.get("/memray-failures/summary")
async def get_memray_failures_summary(
    current_user: AdminSession = Depends(require_admin_auth),
    db: AsyncSession = Depends(get_database),
):
    """Return a condensed list of current memray failures.

    One entry per failure row (no error message), ordered by the time the
    failure was recorded, newest first.
    """
    stmt = (
        select(
            MemrayBuildFailure.binary_id,
            MemrayBuildFailure.environment_id,
            Binary.name.label("binary_name"),
            Environment.name.label("environment_name"),
            MemrayBuildFailure.commit_sha,
            MemrayBuildFailure.failure_timestamp,
            MemrayBuildFailure.commit_timestamp,
        )
        .join(Binary)
        .join(Environment)
        .order_by(desc(MemrayBuildFailure.failure_timestamp))
    )
    query_result = await db.execute(stmt)
    rows = query_result.fetchall()

    # Key order of each response item.
    summary_fields = (
        "binary_id",
        "binary_name",
        "environment_id",
        "environment_name",
        "commit_sha",
        "failure_timestamp",
        "commit_timestamp",
    )
    return [
        {field: getattr(row, field) for field in summary_fields}
        for row in rows
    ]

backend/app/routers/public.py

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from sqlalchemy.ext.asyncio import AsyncSession
1010

1111
from ..database import get_database
12-
from ..models import AdminUser
12+
from ..models import AdminUser, MemrayBuildFailure, Binary, Environment
1313
from ..schemas import AdminUserPublic
1414

1515
logger = logging.getLogger(__name__)
@@ -25,9 +25,56 @@ async def get_maintainers(
2525
# Only return active admin users
2626
result = await db.execute(
2727
select(AdminUser)
28-
.where(AdminUser.is_active == True)
28+
.where(AdminUser.is_active)
2929
.order_by(AdminUser.added_at)
3030
)
3131
admin_users = result.scalars().all()
3232

3333
return admin_users
34+
35+
36+
@router.get("/memray-status")
async def get_memray_status(
    db: AsyncSession = Depends(get_database),
):
    """Report the current memray build status (no authentication dependency).

    Returns a dict with a boolean flag, the number of failures, the list of
    affected binary/environment pairs and a human-readable message.
    """
    # The unique constraint on the table allows at most one failure row per
    # binary+environment pair, so every row is a distinct affected pair.
    stmt = (
        select(
            MemrayBuildFailure.binary_id,
            MemrayBuildFailure.environment_id,
            Binary.name.label("binary_name"),
            Environment.name.label("environment_name"),
            MemrayBuildFailure.commit_timestamp,
            MemrayBuildFailure.commit_sha,
            MemrayBuildFailure.error_message,
            MemrayBuildFailure.failure_timestamp,
        )
        .join(Binary)
        .join(Environment)
        .order_by(MemrayBuildFailure.commit_timestamp.desc())
    )
    rows = (await db.execute(stmt)).fetchall()

    affected_environments = [
        {
            "binary_id": row.binary_id,
            "environment_id": row.environment_id,
            "binary_name": row.binary_name,
            "environment_name": row.environment_name,
            # Exposed under "latest_failure": the failing commit's timestamp.
            "latest_failure": row.commit_timestamp,
            "commit_sha": row.commit_sha,
            "error_message": row.error_message,
            "failure_timestamp": row.failure_timestamp,
        }
        for row in rows
    ]

    failure_count = len(affected_environments)
    has_failures = failure_count > 0
    if has_failures:
        status_message = "Memray build issues detected"
    else:
        status_message = "All environments healthy"

    return {
        "has_failures": has_failures,
        "failure_count": failure_count,
        "affected_environments": affected_environments,
        "message": status_message,
    }

backend/app/routers/upload.py

Lines changed: 123 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22

33
from fastapi import APIRouter, Depends, HTTPException
44
from sqlalchemy.ext.asyncio import AsyncSession
5-
from sqlalchemy import select, delete, desc, func, update, and_
5+
from sqlalchemy import select, desc, func, update, and_
66
from sqlalchemy.exc import IntegrityError
77
from datetime import datetime
88
import logging
99

1010
from .. import schemas, crud, models
11-
from ..database import get_database, transaction_scope
11+
from ..database import get_database
1212
from ..auth import get_current_token
1313

1414
router = APIRouter(prefix="/api", tags=["upload"])
@@ -75,7 +75,7 @@ async def cleanup_old_flamegraphs_if_needed(
7575
)
7676

7777
try:
78-
result = await db.execute(cleanup_query)
78+
await db.execute(cleanup_query)
7979
await db.commit()
8080

8181
verify_result = await db.execute(count_query)
@@ -261,7 +261,7 @@ def clean_flag(flag):
261261
)
262262

263263
try:
264-
new_run = await crud.create_run(db, run_data)
264+
await crud.create_run(db, run_data)
265265
logger.info(f"Successfully created run record: {run_id}")
266266

267267
# Create benchmark results
@@ -355,3 +355,122 @@ def clean_flag(flag):
355355
raise HTTPException(
356356
status_code=500, detail=f"Failed to upload worker run: {str(e)}"
357357
)
358+
359+
360+
@router.post("/report-memray-failure", response_model=dict)
async def report_memray_failure(
    failure_data: schemas.MemrayFailureReport,
    db: AsyncSession = Depends(get_database),
    current_token: models.AuthToken = Depends(get_current_token),
):
    """Report a memray build failure for a specific commit and environment.

    Flow:
      1. Validate that the referenced binary and environment exist (400 if not).
      2. Ensure a commit row exists for ``commit_sha``, creating a minimal
         placeholder when it does not.
      3. Upsert the single failure row for the binary+environment pair: a
         report is ignored unless its commit timestamp is strictly newer than
         the stored one.

    Returns:
        A dict with a ``message`` plus the ``commit_sha``, ``binary_id`` and
        ``environment_id`` of the report.

    Raises:
        HTTPException: 400 for an unknown binary or environment; 500 when the
            commit or failure record cannot be persisted.
    """
    # Local import: only ``datetime`` is known to be imported at file level.
    from datetime import timezone

    logger = logging.getLogger(__name__)

    logger.info(
        f"Authenticated memray failure report from token '{current_token.name}' for commit {failure_data.commit_sha[:8]}, "
        f"binary_id='{failure_data.binary_id}', environment_id='{failure_data.environment_id}'"
    )

    # Identifying fields echoed back in every response.
    report_identity = {
        "commit_sha": failure_data.commit_sha,
        "binary_id": failure_data.binary_id,
        "environment_id": failure_data.environment_id,
    }

    # Validate binary exists
    binary = await crud.get_binary_by_id(db, binary_id=failure_data.binary_id)
    if not binary:
        logger.error(f"Memray failure report failed: Binary '{failure_data.binary_id}' not found")
        raise HTTPException(
            status_code=400,
            detail=f"Binary '{failure_data.binary_id}' not found."
        )

    # Validate environment exists
    environment = await crud.get_environment_by_id(db, environment_id=failure_data.environment_id)
    if not environment:
        logger.error(f"Memray failure report failed: Environment '{failure_data.environment_id}' not found")
        raise HTTPException(
            status_code=400,
            detail=f"Environment '{failure_data.environment_id}' not found."
        )

    # Create or get commit
    commit = await crud.get_commit_by_sha(db, sha=failure_data.commit_sha)
    if not commit:
        logger.info(f"Commit {failure_data.commit_sha[:8]} not found, creating new commit record")
        # Minimal placeholder commit; message/author/version are defaults
        # because the report carries only the sha and timestamp.
        commit_data = schemas.CommitCreate(
            sha=failure_data.commit_sha,
            timestamp=failure_data.commit_timestamp,
            message="Commit with memray build failure",
            author="Unknown",
            python_version=schemas.PythonVersion(major=3, minor=12, patch=0)  # Default values
        )
        try:
            commit = await crud.create_commit(db, commit_data)
            logger.info(f"Successfully created minimal commit record for {failure_data.commit_sha[:8]}")
        except Exception as e:
            logger.error(f"Failed to create commit record for {failure_data.commit_sha[:8]}: {e}")
            raise HTTPException(
                status_code=500, detail=f"Failed to create commit record: {str(e)}"
            ) from e

    # Store the report time as naive UTC, matching the model's column default
    # (``datetime.now(UTC).replace(tzinfo=None)``); a bare ``datetime.now()``
    # would record server-local time instead.
    reported_at = datetime.now(timezone.utc).replace(tzinfo=None)

    # Look up the (at most one) existing failure for this binary+environment
    lookup = await db.execute(
        select(models.MemrayBuildFailure)
        .where(
            models.MemrayBuildFailure.binary_id == failure_data.binary_id,
            models.MemrayBuildFailure.environment_id == failure_data.environment_id
        )
    )
    existing_failure = lookup.scalars().first()

    if existing_failure:
        # NOTE(review): this comparison assumes both timestamps are naive or
        # both aware; a mixed pair raises TypeError -- confirm what the
        # MemrayFailureReport schema produces.
        if failure_data.commit_timestamp <= existing_failure.commit_timestamp:
            logger.info(
                f"Ignoring memray failure for {failure_data.commit_sha[:8]} as it's not newer than existing failure"
            )
            return {
                "message": "Memray failure ignored (not newer than existing failure)",
                **report_identity,
            }

        # Update existing failure with newer information
        existing_failure.commit_sha = failure_data.commit_sha
        existing_failure.error_message = failure_data.error_message
        existing_failure.failure_timestamp = reported_at
        existing_failure.commit_timestamp = failure_data.commit_timestamp

        try:
            await db.commit()
            logger.info(f"Updated existing memray failure record for binary '{failure_data.binary_id}', environment '{failure_data.environment_id}'")
        except Exception as e:
            # Leave the session usable after the failed flush/commit.
            await db.rollback()
            logger.error(f"Failed to update memray failure record: {e}")
            raise HTTPException(
                status_code=500, detail=f"Failed to update memray failure record: {str(e)}"
            ) from e
    else:
        # Create new failure record
        failure_record = models.MemrayBuildFailure(
            commit_sha=failure_data.commit_sha,
            binary_id=failure_data.binary_id,
            environment_id=failure_data.environment_id,
            error_message=failure_data.error_message,
            failure_timestamp=reported_at,
            commit_timestamp=failure_data.commit_timestamp
        )

        try:
            db.add(failure_record)
            await db.commit()
            logger.info(f"Created new memray failure record for commit {failure_data.commit_sha[:8]}")
        except Exception as e:
            # Leave the session usable after the failed flush/commit (e.g. a
            # concurrent insert hitting the unique constraint).
            await db.rollback()
            logger.error(f"Failed to create memray failure record: {e}")
            raise HTTPException(
                status_code=500, detail=f"Failed to create memray failure record: {str(e)}"
            ) from e

    return {
        "message": "Memray failure reported successfully",
        **report_identity,
    }

0 commit comments

Comments
 (0)