@@ -84,15 +84,13 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
 
     state_manager = AsyncJobStateManager(job_id)
 
-    progress_info = state_manager.update_state(
-        processed_image_ids, stage="process", request_id=self.request.id, failed_image_ids=failed_image_ids
-    )
+    progress_info = state_manager.update_state(processed_image_ids, stage="process", failed_image_ids=failed_image_ids)
     if not progress_info:
-        logger.warning(
-            f"Another task is already processing results for job {job_id}. "
-            f"Retrying task {self.request.id} in 5 seconds..."
-        )
-        raise self.retry(countdown=5, max_retries=10)
+        logger.error(f"Redis state missing for job {job_id} — job may have been cleaned up prematurely.")
+        # Acknowledge the task to prevent retries, since we don't know the state
+        _ack_task_via_nats(reply_subject, logger)
+        # TODO: cancel the job to fail fast once PR #1144 is merged
+        return
 
     try:
         complete_state = JobState.SUCCESS
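With this change, a falsy return from `update_state` no longer means another task holds the work; it means the job's Redis state has already been cleaned up, so the handler acks and bails rather than retrying. A minimal sketch of that contract, assuming redis-py and a hypothetical `job:{id}:state` hash; the key layout and names are illustrative, not the project's actual `AsyncJobStateManager`:

```python
import redis

def update_state_sketch(r: redis.Redis, job_id: int, processed_ids: list[int]) -> dict | None:
    """Hypothetical illustration only; not the project's AsyncJobStateManager."""
    key = f"job:{job_id}:state"  # assumed key layout
    if not r.exists(key):
        # State already cleaned up (e.g. job cancelled): tell the caller to ack and bail.
        # The real manager presumably does this check-and-update atomically.
        return None
    processed = r.hincrby(key, "processed", len(processed_ids))
    total = int(r.hget(key, "total") or 0)
    return {"processed": processed, "total": total}
```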
@@ -126,6 +124,7 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
         _ack_task_via_nats(reply_subject, logger)
         return
 
+    acked = False
     try:
         # Save to database (this is the slow operation)
         detections_count, classifications_count, captures_count = 0, 0, 0
@@ -145,20 +144,18 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
             captures_count = len(pipeline_result.source_images)
 
         _ack_task_via_nats(reply_subject, job.logger)
+        acked = True
         # Update job stage with calculated progress
 
         progress_info = state_manager.update_state(
             processed_image_ids,
             stage="results",
-            request_id=self.request.id,
         )
 
         if not progress_info:
-            logger.warning(
-                f"Another task is already processing results for job {job_id}. "
-                f"Retrying task {self.request.id} in 5 seconds..."
-            )
-            raise self.retry(countdown=5, max_retries=10)
+            job.logger.error(f"Redis state missing for job {job_id} — job may have been cleaned up prematurely.")
+            # TODO: cancel the job to fail fast once PR #1144 is merged
+            return
 
         # update complete state based on latest progress info after saving results
         complete_state = JobState.SUCCESS
@@ -176,9 +173,11 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
         )
 
     except Exception as e:
-        job.logger.error(
-            f"Failed to process pipeline result for job {job_id}: {e}. NATS will redeliver the task message."
-        )
+        error = f"Error processing pipeline result for job {job_id}: {e}"
+        if not acked:
+            error += ". NATS will re-deliver the task message."
+
+        job.logger.error(error)
 
 
 def _ack_task_via_nats(reply_subject: str, job_logger: logging.Logger) -> None:
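The helper's body isn't shown in this hunk. For a JetStream consumer, acknowledging a delivered message amounts to publishing to its reply subject, where an empty payload acts as a default ack. A minimal sketch assuming nats-py and a throwaway connection; the server URL and sync wrapper are assumptions, and a real implementation would likely reuse a shared connection:

```python
import asyncio
import logging

import nats

def _ack_task_via_nats_sketch(reply_subject: str, job_logger: logging.Logger) -> None:
    async def _ack() -> None:
        # Assumed server URL; swap in the deployment's NATS endpoint.
        nc = await nats.connect("nats://127.0.0.1:4222")
        try:
            # Publishing an empty payload to the reply subject acknowledges
            # the JetStream message and stops redelivery.
            await nc.publish(reply_subject, b"")
            await nc.flush()
        finally:
            await nc.close()

    try:
        asyncio.run(_ack())
    except Exception as e:
        job_logger.error(f"Failed to ack via {reply_subject}: {e}")
```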
@@ -256,9 +255,33 @@ def _update_job_progress(
         state_params["classifications"] = current_classifications + new_classifications
         state_params["captures"] = current_captures + new_captures
 
+    # Don't overwrite a stage with a stale progress value.
+    # This guards against the race where a slower worker calls _update_job_progress
+    # after a faster worker has already marked further progress.
+    try:
+        existing_stage = job.progress.get_stage(stage)
+        progress_percentage = max(existing_stage.progress, progress_percentage)
+        # Explicitly preserve FAILURE: once a stage is marked FAILURE it should
+        # never regress to a non-failure state, regardless of enum ordering.
+        if existing_stage.status == JobState.FAILURE:
+            complete_state = JobState.FAILURE
+    except (ValueError, AttributeError):
+        pass  # Stage doesn't exist yet; proceed normally
+
+    # Determine the status to write:
+    # - Stage complete (100%): use complete_state (SUCCESS or FAILURE)
+    # - Stage incomplete but FAILURE already determined: keep FAILURE visible
+    # - Stage incomplete, no failure: mark as in-progress (STARTED)
+    if progress_percentage >= 1.0:
+        status = complete_state
+    elif complete_state == JobState.FAILURE:
+        status = JobState.FAILURE
+    else:
+        status = JobState.STARTED
+
     job.progress.update_stage(
         stage,
-        status=complete_state if progress_percentage >= 1.0 else JobState.STARTED,
+        status=status,
         progress=progress_percentage,
         **state_params,
     )
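Net effect of the new guard: stage updates become monotonic, so progress only ratchets upward and FAILURE is sticky. A self-contained sketch of the same merge rule, using hypothetical `JobState` and `StageProgress` stand-ins for the project's types:

```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional

class JobState(str, Enum):  # stand-in for the project's JobState
    STARTED = "STARTED"
    SUCCESS = "SUCCESS"
    FAILURE = "FAILURE"

@dataclass
class StageProgress:  # stand-in for whatever job.progress.get_stage() returns
    progress: float  # 0.0 to 1.0
    status: JobState

def resolve_stage_update(
    existing: Optional[StageProgress], new_progress: float, complete_state: JobState
) -> StageProgress:
    """Monotonic merge: progress never decreases and FAILURE never clears."""
    if existing is not None:
        new_progress = max(existing.progress, new_progress)
        if existing.status == JobState.FAILURE:
            complete_state = JobState.FAILURE
    if new_progress >= 1.0:
        status = complete_state
    elif complete_state == JobState.FAILURE:
        status = JobState.FAILURE
    else:
        status = JobState.STARTED
    return StageProgress(new_progress, status)

# A stale worker reporting 40% after the stage already hit 75% is ignored:
assert resolve_stage_update(StageProgress(0.75, JobState.STARTED), 0.4, JobState.SUCCESS).progress == 0.75
```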