@@ -84,15 +84,13 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
 
     state_manager = AsyncJobStateManager(job_id)
 
-    progress_info = state_manager.update_state(
-        processed_image_ids, stage="process", request_id=self.request.id, failed_image_ids=failed_image_ids
-    )
+    progress_info = state_manager.update_state(processed_image_ids, stage="process", failed_image_ids=failed_image_ids)
     if not progress_info:
-        logger.warning(
-            f"Another task is already processing results for job {job_id}. "
-            f"Retrying task {self.request.id} in 5 seconds..."
-        )
-        raise self.retry(countdown=5, max_retries=10)
+        logger.error(f"Redis state missing for job {job_id} — job may have been cleaned up prematurely.")
+        # Acknowledge the task to prevent retries, since we don't know the state
+        _ack_task_via_nats(reply_subject, logger)
+        # TODO: cancel the job to fail fast once PR #1144 is merged
+        return
 
     try:
         complete_state = JobState.SUCCESS
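With this change, a falsy return from `update_state` no longer means another task holds the work; it means the job's Redis state has already been cleaned up, so the handler acks and bails rather than retrying. A minimal sketch of that contract, assuming redis-py and a hypothetical `job:{id}:state` hash; the key layout and names are illustrative, not the project's actual `AsyncJobStateManager`:

```python
import redis

def update_state_sketch(r: redis.Redis, job_id: int, processed_ids: list[int]) -> dict | None:
    """Hypothetical illustration only; not the project's AsyncJobStateManager."""
    key = f"job:{job_id}:state"  # assumed key layout
    if not r.exists(key):
        # State already cleaned up (e.g. job cancelled): tell the caller to ack and bail.
        # The real manager presumably does this check-and-update atomically.
        return None
    processed = r.hincrby(key, "processed", len(processed_ids))
    total = int(r.hget(key, "total") or 0)
    return {"processed": processed, "total": total}
```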
@@ -126,6 +124,7 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
         _ack_task_via_nats(reply_subject, logger)
         return
 
+    acked = False
     try:
         # Save to database (this is the slow operation)
         detections_count, classifications_count, captures_count = 0, 0, 0
@@ -145,20 +144,18 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
             captures_count = len(pipeline_result.source_images)
 
         _ack_task_via_nats(reply_subject, job.logger)
+        acked = True
         # Update job stage with calculated progress
 
         progress_info = state_manager.update_state(
             processed_image_ids,
             stage="results",
-            request_id=self.request.id,
         )
 
         if not progress_info:
-            logger.warning(
-                f"Another task is already processing results for job {job_id}. "
-                f"Retrying task {self.request.id} in 5 seconds..."
-            )
-            raise self.retry(countdown=5, max_retries=10)
+            job.logger.error(f"Redis state missing for job {job_id} — job may have been cleaned up prematurely.")
+            # TODO: cancel the job to fail fast once PR #1144 is merged
+            return
 
         # update complete state based on latest progress info after saving results
         complete_state = JobState.SUCCESS
@@ -176,9 +173,11 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
         )
 
     except Exception as e:
-        job.logger.error(
-            f"Failed to process pipeline result for job {job_id}: {e}. NATS will redeliver the task message."
-        )
+        error = f"Error processing pipeline result for job {job_id}: {e}"
+        if not acked:
+            error += ". NATS will re-deliver the task message."
+
+        job.logger.error(error)
 
 
 def _ack_task_via_nats(reply_subject: str, job_logger: logging.Logger) -> None:
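The helper's body isn't shown in this hunk. For a JetStream consumer, acknowledging a delivered message amounts to publishing to its reply subject, where an empty payload acts as a default ack. A minimal sketch assuming nats-py and a throwaway connection; the server URL and sync wrapper are assumptions, and a real implementation would likely reuse a shared connection:

```python
import asyncio
import logging

import nats

def _ack_task_via_nats_sketch(reply_subject: str, job_logger: logging.Logger) -> None:
    async def _ack() -> None:
        # Assumed server URL; swap in the deployment's NATS endpoint.
        nc = await nats.connect("nats://127.0.0.1:4222")
        try:
            # Publishing an empty payload to the reply subject acknowledges
            # the JetStream message and stops redelivery.
            await nc.publish(reply_subject, b"")
            await nc.flush()
        finally:
            await nc.close()

    try:
        asyncio.run(_ack())
    except Exception as e:
        job_logger.error(f"Failed to ack via {reply_subject}: {e}")
```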
@@ -256,9 +255,33 @@ def _update_job_progress(
         state_params["classifications"] = current_classifications + new_classifications
         state_params["captures"] = current_captures + new_captures
 
+    # Don't overwrite a stage with a stale progress value.
+    # This guards against the race where a slower worker calls _update_job_progress
+    # after a faster worker has already marked further progress.
+    try:
+        existing_stage = job.progress.get_stage(stage)
+        progress_percentage = max(existing_stage.progress, progress_percentage)
+        # Explicitly preserve FAILURE: once a stage is marked FAILURE it should
+        # never regress to a non-failure state, regardless of enum ordering.
+        if existing_stage.status == JobState.FAILURE:
+            complete_state = JobState.FAILURE
+    except (ValueError, AttributeError):
+        pass  # Stage doesn't exist yet; proceed normally
+
+    # Determine the status to write:
+    # - Stage complete (100%): use complete_state (SUCCESS or FAILURE)
+    # - Stage incomplete but FAILURE already determined: keep FAILURE visible
+    # - Stage incomplete, no failure: mark as in-progress (STARTED)
+    if progress_percentage >= 1.0:
+        status = complete_state
+    elif complete_state == JobState.FAILURE:
+        status = JobState.FAILURE
+    else:
+        status = JobState.STARTED
+
     job.progress.update_stage(
         stage,
-        status=complete_state if progress_percentage >= 1.0 else JobState.STARTED,
+        status=status,
         progress=progress_percentage,
         **state_params,
     )
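Net effect of the new guard: stage updates become monotonic, so progress only ratchets upward and FAILURE is sticky. A self-contained sketch of the same merge rule, using hypothetical `JobState` and `StageProgress` stand-ins for the project's types:

```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional

class JobState(str, Enum):  # stand-in for the project's JobState
    STARTED = "STARTED"
    SUCCESS = "SUCCESS"
    FAILURE = "FAILURE"

@dataclass
class StageProgress:  # stand-in for whatever job.progress.get_stage() returns
    progress: float  # 0.0 to 1.0
    status: JobState

def resolve_stage_update(
    existing: Optional[StageProgress], new_progress: float, complete_state: JobState
) -> StageProgress:
    """Monotonic merge: progress never decreases and FAILURE never clears."""
    if existing is not None:
        new_progress = max(existing.progress, new_progress)
        if existing.status == JobState.FAILURE:
            complete_state = JobState.FAILURE
    if new_progress >= 1.0:
        status = complete_state
    elif complete_state == JobState.FAILURE:
        status = JobState.FAILURE
    else:
        status = JobState.STARTED
    return StageProgress(new_progress, status)

# A stale worker reporting 40% after the stage already hit 75% is ignored:
assert resolve_stage_update(StageProgress(0.75, JobState.STARTED), 0.4, JobState.SUCCESS).progress == 0.75
```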