Commit 77dd024

mihow and claude committed
feat: async PS liveness tracking and ProcessingServiceQuerySet API
Add structured queryset methods and a heartbeat mechanism so async (pull-mode) processing services stay in sync with their actual liveness.

ProcessingService:
- New ProcessingServiceQuerySet with async_services() / sync_services() methods — single canonical filter for endpoint_url null-or-empty, used everywhere instead of ad-hoc Q expressions
- is_async property (derived from endpoint_url, no DB column)
- Docstrings reference Job.dispatch_mode ASYNC_API / SYNC_API for context

Liveness tracking:
- PROCESSING_SERVICE_LAST_SEEN_MAX = 60s constant (12× the worker's 5s poll interval) — async services are considered offline after this
- check_processing_services_online task now handles both modes: sync → active /readyz poll; async → bulk mark stale via async_services()
- _mark_pipeline_pull_services_seen() helper in jobs/views.py: single bulk UPDATE via job.pipeline.processing_services.async_services(), called at the top of both /jobs/{id}/tasks/ and /jobs/{id}/result/ so every worker poll cycle refreshes last_seen without needing a separate registration

Async job cleanup (from carlosg/redisatomic):
- Rename _cleanup_job_if_needed → cleanup_async_job_if_needed and export it so Job.cancel() can call it directly without a local import
- JobLogHandler: refresh_from_db before appending to avoid last-writer-wins race across concurrent worker processes
- Job.logger: update existing handler's job reference instead of always adding a new handler (process-level singleton leak fix)

Co-Authored-By: Claude <[email protected]>
1 parent 932824b commit 77dd024

5 files changed: 125 additions & 28 deletions
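The new queryset API in a nutshell: a hedged usage sketch. The model, manager, and method names are taken from the diffs below; the loop and assertion are illustrative only, not part of the commit.

```python
from ami.ml.models import ProcessingService

# Canonical split introduced by this commit: a service with a null or empty
# endpoint_url is pull-mode (async); one with an endpoint_url is push-mode (sync).
sync_qs = ProcessingService.objects.sync_services()    # endpoint_url set
async_qs = ProcessingService.objects.async_services()  # endpoint_url null or ""

# is_async is derived from endpoint_url (no DB column), so the property
# always agrees with the queryset filters:
for service in ProcessingService.objects.all():
    assert service.is_async == (not service.endpoint_url)
```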


ami/jobs/models.py

Lines changed: 20 additions & 17 deletions

```diff
@@ -15,7 +15,7 @@

 from ami.base.models import BaseModel
 from ami.base.schemas import ConfigurableStage, ConfigurableStageParam
-from ami.jobs.tasks import run_job
+from ami.jobs.tasks import cleanup_async_job_if_needed, run_job
 from ami.main.models import Deployment, Project, SourceImage, SourceImageCollection
 from ami.ml.models import Pipeline
 from ami.ml.post_processing.registry import get_postprocessing_task
@@ -336,7 +336,11 @@ def emit(self, record: logging.LogRecord):
         # Log to the current app logger
         logger.log(record.levelno, self.format(record))

-        # Write to the logs field on the job instance
+        # Write to the logs field on the job instance.
+        # Refresh from DB first to reduce the window for concurrent overwrites — each
+        # worker holds its own stale in-memory copy of `logs`, so without a refresh the
+        # last writer always wins and earlier entries are silently dropped.
+        self.job.refresh_from_db(fields=["logs"])
         timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         msg = f"[{timestamp}] {record.levelname} {self.format(record)}"
         if msg not in self.job.logs.stdout:
@@ -355,7 +359,6 @@ def emit(self, record: logging.LogRecord):
                 self.job.save(update_fields=["logs"], update_progress=False)
             except Exception as e:
                 logger.error(f"Failed to save logs for job #{self.job.pk}: {e}")
-                pass


 @dataclass
@@ -975,24 +978,20 @@ def cancel(self):
         and transition through CANCELING → REVOKED. For other jobs,
         revoke the Celery task.
         """
-        from ami.jobs.tasks import _cleanup_job_if_needed
-
         self.status = JobState.CANCELING
         self.save()

-        if self.dispatch_mode == JobDispatchMode.ASYNC_API:
-            # For async jobs, the Celery task has already completed (it just queued
-            # images to NATS). Clean up NATS/Redis resources to stop task delivery,
-            # then mark as REVOKED.
-            _cleanup_job_if_needed(self)
-            self.status = JobState.REVOKED
-            self.finished_at = datetime.datetime.now()
-            self.save()
-        elif self.task_id:
+        cleanup_async_job_if_needed(self)
+        if self.task_id:
             task = run_job.AsyncResult(self.task_id)
             if task:
                 task.revoke(terminate=True)
                 self.save()
+            if self.dispatch_mode == JobDispatchMode.ASYNC_API:
+                # For async jobs we need to set the status to revoked here since the task already
+                # finished (it only queues the images).
+                self.status = JobState.REVOKED
+                self.save()
         else:
             self.status = JobState.REVOKED
             self.save()
@@ -1102,11 +1101,15 @@ def get_default_progress(cls) -> JobProgress:
     def logger(self) -> logging.Logger:
         _logger = logging.getLogger(f"ami.jobs.{self.pk}")

-        # Only add JobLogHandler if not already present
-        if not any(isinstance(h, JobLogHandler) for h in _logger.handlers):
-            # Also log output to a field on thie model instance
+        # Update or add JobLogHandler, always pointing to the current instance.
+        # The logger is a process-level singleton so its handler may reference a stale
+        # job instance from a previous task execution in this worker process.
+        handler = next((h for h in _logger.handlers if isinstance(h, JobLogHandler)), None)
+        if handler is None:
             logger.info("Adding JobLogHandler to logger for job %s", self.pk)
             _logger.addHandler(JobLogHandler(self))
+        else:
+            handler.job = self
         _logger.propagate = False
         return _logger
```
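An aside on the Job.logger change above: it matters because logging.getLogger() returns a process-level singleton, so handlers accumulate across task executions in the same worker. A minimal, self-contained illustration; the logger name is invented for the example.

```python
import logging

# Same name -> same logger object for the lifetime of the process.
first = logging.getLogger("ami.jobs.42")
second = logging.getLogger("ami.jobs.42")
assert first is second

first.addHandler(logging.NullHandler())
# A handler added during one task run is still attached on the next run in the
# same worker process. Without the new "update existing handler" branch, an old
# JobLogHandler would keep writing to the stale job instance it was built with.
assert len(second.handlers) == 1
```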

ami/jobs/tasks.py

Lines changed: 25 additions & 5 deletions

```diff
@@ -178,6 +178,26 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
         job.logger.error(error)


+def _fail_job(job_id: int, reason: str) -> None:
+    from ami.jobs.models import Job, JobState
+    from ami.ml.orchestration.jobs import cleanup_async_job_resources
+
+    try:
+        with transaction.atomic():
+            job = Job.objects.select_for_update().get(pk=job_id)
+            if job.status in (JobState.CANCELING, *JobState.final_states()):
+                return
+            job.status = JobState.FAILURE
+            job.finished_at = datetime.datetime.now()
+            job.save(update_fields=["status", "finished_at"])
+
+        job.logger.error(f"Job {job_id} marked as FAILURE: {reason}")
+        cleanup_async_job_resources(job.pk, job.logger)
+    except Job.DoesNotExist:
+        logger.error(f"Cannot fail job {job_id}: not found")
+        cleanup_async_job_resources(job_id, logger)
+
+
 def _ack_task_via_nats(reply_subject: str, job_logger: logging.Logger) -> None:
     try:
@@ -293,10 +313,10 @@ def _update_job_progress(
     # Clean up async resources for completed jobs that use NATS/Redis
     if job.progress.is_complete():
         job = Job.objects.get(pk=job_id)  # Re-fetch outside transaction
-        _cleanup_job_if_needed(job)
+        cleanup_async_job_if_needed(job)


-def _cleanup_job_if_needed(job) -> None:
+def cleanup_async_job_if_needed(job) -> None:
     """
     Clean up async resources (NATS/Redis) if this job uses them.

@@ -312,7 +332,7 @@ def _cleanup_job_if_needed(job) -> None:
     # import here to avoid circular imports
     from ami.ml.orchestration.jobs import cleanup_async_job_resources

-    cleanup_async_job_resources(job)
+    cleanup_async_job_resources(job.pk, job.logger)


 @task_prerun.connect(sender=run_job)
@@ -351,7 +371,7 @@ def update_job_status(sender, task_id, task, state: str, retval=None, **kwargs):

     # Clean up async resources for revoked jobs
     if state == JobState.REVOKED:
-        _cleanup_job_if_needed(job)
+        cleanup_async_job_if_needed(job)


 @task_failure.connect(sender=run_job, retry=False)
@@ -366,7 +386,7 @@ def update_job_failure(sender, task_id, exception, *args, **kwargs):
     job.save()

     # Clean up async resources for failed jobs
-    _cleanup_job_if_needed(job)
+    cleanup_async_job_if_needed(job)


 def log_time(start: float = 0, msg: str | None = None) -> tuple[float, Callable]:
```

ami/jobs/views.py

Lines changed: 28 additions & 0 deletions

```diff
@@ -30,6 +30,28 @@
 logger = logging.getLogger(__name__)


+def _mark_pipeline_pull_services_seen(job: "Job") -> None:
+    """
+    Record a heartbeat for all async (pull-mode) processing services linked to the job's pipeline.
+
+    Called on every task-fetch and result-submit request so that the worker's polling activity
+    keeps last_seen/last_seen_live current. The periodic check_processing_services_online task
+    will mark services offline if this heartbeat stops arriving within PROCESSING_SERVICE_LAST_SEEN_MAX.
+
+    Note: caller identity is not verified here — any authenticated token can hit these endpoints.
+    A future application-token scheme (see PR #1117) will allow tying requests to a specific
+    processing service so the heartbeat can be scoped more precisely.
+    """
+    import datetime
+
+    if not job.pipeline_id:
+        return
+    job.pipeline.processing_services.async_services().update(
+        last_seen=datetime.datetime.now(),
+        last_seen_live=True,
+    )
+
+
 class JobFilterSet(filters.FilterSet):
     """Custom filterset to enable pipeline name filtering."""

@@ -245,6 +267,9 @@ def tasks(self, request, pk=None):
         if not job.pipeline:
             raise ValidationError("This job does not have a pipeline configured")

+        # Record heartbeat for async processing services on this pipeline
+        _mark_pipeline_pull_services_seen(job)
+
         # Get tasks from NATS JetStream
         from ami.ml.orchestration.nats_queue import TaskQueueManager

@@ -272,6 +297,9 @@ def result(self, request, pk=None):

         job = self.get_object()

+        # Record heartbeat for async processing services on this pipeline
+        _mark_pipeline_pull_services_seen(job)
+
         # Validate request data is a list
         if isinstance(request.data, list):
             results = request.data
```
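For context on how the heartbeat in these views gets exercised: the pull-mode worker's ordinary poll loop is itself the heartbeat. A rough sketch of such a loop from the worker's side; the endpoint paths come from this diff, while the base URL, auth header, response shape, and the process() stub are assumptions (the 5 s interval is taken from the commit message).

```python
import time

import requests

BASE_URL = "https://antenna.example.org/api/v2"  # assumed base URL
HEADERS = {"Authorization": "Token <worker-token>"}  # any authenticated token suffices today
JOB_ID = 123  # hypothetical job


def process(task: dict) -> dict:
    """Hypothetical stand-in for the worker's actual pipeline inference."""
    return {"task_id": task.get("id"), "detections": []}


while True:
    # Fetching tasks doubles as a heartbeat: the view calls
    # _mark_pipeline_pull_services_seen() before handing out work.
    resp = requests.get(f"{BASE_URL}/jobs/{JOB_ID}/tasks/", headers=HEADERS, timeout=10)
    for task in resp.json().get("tasks", []):  # assumed response shape
        result = process(task)
        # Submitting results refreshes last_seen the same way.
        requests.post(f"{BASE_URL}/jobs/{JOB_ID}/result/", json=[result], headers=HEADERS, timeout=10)
    time.sleep(5)  # worker poll interval cited in the commit message
```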

ami/ml/models/processing_service.py

Lines changed: 32 additions & 1 deletion

```diff
@@ -23,7 +23,29 @@
 logger = logging.getLogger(__name__)


-class ProcessingServiceManager(models.Manager.from_queryset(BaseQuerySet)):
+class ProcessingServiceQuerySet(BaseQuerySet):
+    def async_services(self) -> "ProcessingServiceQuerySet":
+        """
+        Filter to pull-mode (async) processing services — those with no endpoint URL.
+
+        These correspond to jobs with dispatch_mode=ASYNC_API. Instead of Antenna calling
+        out to them, they poll Antenna for tasks and push results back. Their liveness is
+        tracked via heartbeats from mark_seen() rather than active health checks.
+        """
+        return self.filter(models.Q(endpoint_url__isnull=True) | models.Q(endpoint_url__exact=""))
+
+    def sync_services(self) -> "ProcessingServiceQuerySet":
+        """
+        Filter to push-mode (sync) processing services — those with a configured endpoint URL.
+
+        These correspond to jobs with dispatch_mode=SYNC_API. Antenna actively calls their
+        /readyz and /process endpoints. Their liveness is tracked by the periodic
+        check_processing_services_online Celery task.
+        """
+        return self.exclude(models.Q(endpoint_url__isnull=True) | models.Q(endpoint_url__exact=""))
+
+
+class ProcessingServiceManager(models.Manager.from_queryset(ProcessingServiceQuerySet)):
     """Custom manager for ProcessingService to handle specific queries."""

     def create(self, **kwargs) -> "ProcessingService":
@@ -47,6 +69,15 @@ class ProcessingService(BaseModel):

     objects = ProcessingServiceManager()

+    @property
+    def is_async(self) -> bool:
+        """
+        True if this is a pull-mode (async) service with no endpoint URL, corresponding to
+        jobs with dispatch_mode=ASYNC_API. False for push-mode services with a configured
+        endpoint, corresponding to jobs with dispatch_mode=SYNC_API.
+        """
+        return not self.endpoint_url
+
     def __str__(self):
         endpoint_display = self.endpoint_url or "async"
         return f'#{self.pk} "{self.name}" ({endpoint_display})'
```

ami/ml/tasks.py

Lines changed: 20 additions & 5 deletions

```diff
@@ -98,16 +98,22 @@ def remove_duplicate_classifications(project_id: int | None = None, dry_run: boo
 @celery_app.task(soft_time_limit=10, time_limit=20)
 def check_processing_services_online():
     """
-    Check the status of all v1 synchronous processing services and update the last_seen/last_seen_live fields.
-    Asynchronous (pull-mode) services are updated via mark_seen() when they register pipelines.
+    Check the status of all processing services and update last_seen/last_seen_live fields.
+
+    - Sync services (dispatch_mode=SYNC_API, endpoint URL set): actively polled via /readyz.
+    - Async services (dispatch_mode=ASYNC_API, no endpoint URL): heartbeat is updated by
+      mark_seen() on registration and by _mark_pipeline_pull_services_seen() on task polling.
+      This task marks them offline if last_seen has exceeded PROCESSING_SERVICE_LAST_SEEN_MAX.

     @TODO make this async to check all services in parallel
     """
-    from ami.ml.models import ProcessingService
+    import datetime
+
+    from ami.ml.models import PROCESSING_SERVICE_LAST_SEEN_MAX, ProcessingService

-    logger.info("Checking which synchronous processing services are online.")
+    logger.info("Checking which processing services are online.")

-    services = ProcessingService.objects.exclude(endpoint_url__isnull=True).exclude(endpoint_url__exact="").all()
+    services = ProcessingService.objects.sync_services()

     for service in services:
         logger.info(f"Checking service {service}")
@@ -117,3 +123,12 @@ def check_processing_services_online():
         except Exception as e:
             logger.error(f"Error checking service {service}: {e}")
             continue
+
+    stale_cutoff = datetime.datetime.now() - PROCESSING_SERVICE_LAST_SEEN_MAX
+    stale = ProcessingService.objects.async_services().filter(last_seen_live=True, last_seen__lt=stale_cutoff)
+    count = stale.count()
+    if count:
+        logger.info(
+            f"Marking {count} async service(s) offline (no heartbeat within {PROCESSING_SERVICE_LAST_SEEN_MAX})."
+        )
+        stale.update(last_seen_live=False)
```
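One note: PROCESSING_SERVICE_LAST_SEEN_MAX is imported from ami.ml.models above, but its definition is not visible in these diffs. Given the commit message (60 s, 12× the worker's 5 s poll interval) and its use in timedelta arithmetic here, a plausible definition is:

```python
import datetime

# Assumed from the commit message: an async service counts as offline after 60 s
# without a heartbeat, i.e. 12 missed cycles of the worker's 5 s poll interval.
PROCESSING_SERVICE_LAST_SEEN_MAX = datetime.timedelta(seconds=60)
```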
