Commits (30)
34d0160
✨ feat: add pod status checking to distinguish pending vs running jobs
fMurugi Apr 28, 2026
50a8377
fix: check pod is up and running before setting the job as in_progress
fMurugi Apr 28, 2026
38eb99f
fix: update the status of tune to Failed if the job was never created
fMurugi Apr 28, 2026
f9f8302
feat: add pod status checking to distinguish pending vs running fine-…
fMurugi Apr 29, 2026
daef4a5
refactor(k8s): centralize job status resolution and prevent Pending→I…
fMurugi Apr 29, 2026
2132ab1
♻️refactor(k8s): remove circular import
fMurugi Apr 29, 2026
51df810
fix: mark celery task as complete if job and pod are not in the clust…
fMurugi Apr 29, 2026
3e2b703
♻️ refactor: decouple job submission and fix event loop blocking
fMurugi Apr 29, 2026
032b1c5
fix(bug):ensure the pod is created before we check for pod status
fMurugi Apr 29, 2026
84b7394
fix:replace unknown status with pending
fMurugi Apr 29, 2026
c929fa0
fix:tune fails if job is not created
fMurugi Apr 29, 2026
0ae5786
fix:ensure when no job created the status is Failed
fMurugi Apr 29, 2026
8fb844d
fix:revert to pending instead of automatic fail
fMurugi Apr 29, 2026
031a4f6
Reverting feature range 3e2b through 8fb84; these changes broke funtio…
fMurugi Apr 29, 2026
ea40330
fix:remove circular import
fMurugi Apr 30, 2026
76151db
refactor: make Celery result.get non-blocking using asyncio.to_thread
fMurugi Apr 30, 2026
f295ca3
refactor:remove comments
fMurugi Apr 30, 2026
e2a2c2b
Merge branch 'main' into feat/check-pod-phase
bglar May 5, 2026
a536f2f
refactor: rename functions
fMurugi May 5, 2026
b38de23
Merge branch 'feat/check-pod-phase' of github.com:terrastackai/geospa…
fMurugi May 5, 2026
fce8ecb
refactor:format with black and ruff
fMurugi May 5, 2026
053e025
refactor:formatting
fMurugi May 5, 2026
bd518d0
refactor:rename functions
fMurugi May 6, 2026
ef8ba3c
Merge branch 'main' into feat/check-pod-phase
fMurugi May 6, 2026
4db5d05
refactor:rename functions
fMurugi May 6, 2026
dfd9de2
fix:use pydantic automatic read of env variable and field validator to…
fMurugi May 6, 2026
3dbe783
fix:remove import inside the class
fMurugi May 6, 2026
8f8aae2
refactor:use strings for env var from config
fMurugi May 6, 2026
b02a45c
fix: combine job success and fail into terminal state
fMurugi May 6, 2026
f5007f9
fix:return the terminal state if found
fMurugi May 6, 2026
47 changes: 32 additions & 15 deletions gfmstudio/celery_worker.py
@@ -18,14 +18,14 @@
from gfmstudio.fine_tuning.utils.webhook_event_handlers import (
handle_dataset_factory_webhooks,
handle_fine_tuning_webhooks,
update_tune_status,
)
from gfmstudio.inference.services import (
invoke_cancel_inference_handler,
invoke_tune_upload_handler,
)
from gfmstudio.inference.v2.services import invoke_inference_v2_pipelines_handler
from gfmstudio.log import logger

INF_SERVICE_NAME = "inference_gateway"
FT_SERVICE_NAME = "geoft"
celery_app = Celery(
@@ -55,14 +55,14 @@
)
def deploy_tuning_job_celery_task(**kwargs):
# Inject the monitoring task into kwargs to avoid circular import
kwargs['_monitor_task'] = monitor_k8_job_completion_task
kwargs["_monitor_task"] = monitor_k8_job_completion_task
return asyncio.run(deploy_tuning_job(**kwargs))


@celery_app.task(
name="monitor_k8_job_completion_task",
queue=FT_SERVICE_NAME,
bind=True, # Bind to get access to self for retry
bind=True,
max_retries=30, # Allow many retries
default_retry_delay=30, # Start with 30 seconds
)
@@ -72,31 +72,48 @@ def monitor_k8_job_completion_task(self, ftune_id: str):
max_wait = settings.KJOB_MAX_WAIT_SECONDS or 7200

try:
k8s_job_status, _ = asyncio.run(check_k8s_job_status(ftune_id))
k8s_job_status, _ = asyncio.run(
check_k8s_job_status(ftune_id)
)
except Exception as exc:
if "not found" in str(exc):
# Job not found, consider it done (likely already completed and deleted)
logger.debug(f"{ftune_id}: Job not found, assuming completed and cleaned up")
logger.debug(
f"{ftune_id}: Job not found, assuming completed and cleaned up"
)
return "Completed"
# Unexpected error, retry with exponential backoff
logger.warning(f"{ftune_id}: Error checking job status, will retry: {exc}")
raise self.retry(exc=exc, countdown=min(2 ** self.request.retries * 30, max_wait))

# Handle None status (job not found after retries)
raise self.retry(exc=exc, countdown=min(2**self.request.retries * 30, max_wait))

if k8s_job_status is None:
# Job doesn't exist - either completed and deleted, or never created
logger.debug(f"{ftune_id}: Job status is None, assuming completed and cleaned up")
logger.debug(
f"{ftune_id}: Job status is None, assuming completed and cleaned up"
)
return "Completed"

if k8s_job_status == "Unknown":
logger.info(
f"{ftune_id}: Job status is Unknown (resources deleted), assuming completed and cleaned up"
)
return "Completed"

if k8s_job_status in ["Complete", "Failed"]:
# Job is done
logger.info(f"{ftune_id}: Job finished with status: {k8s_job_status}")
return k8s_job_status


if k8s_job_status == "Running":
try:
asyncio.run(update_tune_status(ftune_id, "In_progress"))
except Exception as e:
logger.warning(f"{ftune_id}: Failed to update status to In_progress: {e}")

# Job still running, retry with exponential backoff
# countdown: 30s, 60s, 120s, 240s, 480s, 960s (max with default 600s setting)
countdown = min(2 ** self.request.retries * 30, max_wait)
logger.info(f"{ftune_id}: Job status={k8s_job_status}, will check again in {countdown}s")
countdown = min(2**self.request.retries * 30, max_wait)
logger.info(
f"{ftune_id}: Job status={k8s_job_status}, will check again in {countdown}s"
)
raise self.retry(countdown=countdown)


Expand All @@ -105,7 +122,7 @@ def monitor_k8_job_completion_task(self, ftune_id: str):
queue=FT_SERVICE_NAME,
)
def deploy_hpo_tuning_celery_task(**kwargs):
kwargs['_monitor_task'] = monitor_k8_job_completion_task
kwargs["_monitor_task"] = monitor_k8_job_completion_task
return asyncio.run(deploy_hpo_tuning_job(**kwargs))


127 changes: 104 additions & 23 deletions gfmstudio/fine_tuning/core/kubernetes.py
@@ -475,15 +475,108 @@ async def monitor_k8_job_completion(ftune_id: str, monitor_task=None):
logger.info(f"{ftune_id}: Scheduled monitoring task for job completion")


async def get_pod_phase(job_name: str) -> str | None:
"""Check the status of a pod associated with a Kubernetes job.

This function checks if the pod is actually running, not just pending.
Useful for determining if a job is truly in progress or just waiting for resources.

Parameters
----------
job_name : str
The Kubernetes job name

Returns
-------
str or None
The pod phase: 'Running', 'Pending', 'Succeeded', 'Failed', or 'Unknown'; None if no pod is found.
Contributor:
This is a great introduction. For easier future maintenance, creating a reference schema class for these statuses would be ideal, ensuring there is only one place to change or update the values in case of a future change.

Something like ...


from enum import Enum

class PodStatusPhase(str, Enum):
    RUNNING = "Running"
    PENDING = "Pending"
    SUCCEEDED = "Succeeded"
    FAILED = "Failed"
    UNKNOWN = "Unknown"
    NONE = "None"  # adjust to what None is here

Then when returning the outputs from k8s,

    if result:
        status_str = result[0].strip()
        try:
            return PodStatusPhase(status_str)
        except ValueError:
            # Update this to handle what states would error
            return PodStatusPhase.UNKNOWN
    return None

And whenever you use the values in your code, you can reference them directly, e.g.
PodStatusPhase.FAILED
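
For illustration only, a minimal sketch (hypothetical helper, not code in this PR) of how the enum members could replace the raw string comparisons used by the monitor task; it assumes PodStatusPhase as sketched above and that get_pod_phase has been updated to return a PodStatusPhase or None:

    # Hypothetical helper mapping a pod phase to a tune status.
    async def phase_to_tune_status(job_name: str) -> str:
        phase = await get_pod_phase(job_name)
        if phase is None or phase is PodStatusPhase.UNKNOWN:
            return "Completed"  # resources already cleaned up
        if phase in (PodStatusPhase.SUCCEEDED, PodStatusPhase.FAILED):
            return phase.value  # terminal states pass through
        if phase is PodStatusPhase.RUNNING:
            return "In_progress"
        return "Pending"  # still waiting for resources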

"""
try:
await ensure_logged_in(f"kubectl get job --namespace={settings.NAMESPACE}")

# Get pod status using the job-name label
command = [
"kubectl",
"get",
"pods",
"-l",
f"job-name={job_name}",
"-o",
"jsonpath={.items[0].status.phase}",
]

result = await run_subprocess_cmds(command=command)
return result[0].strip() if result and result[0] else None

except Exception as e:
# Handle case where job/pod has been deleted by webhook
logger.debug(f"{job_name}: Error checking pod status (likely deleted): {e}")
return None

async def get_job_conditions(job_name: str) -> str | None:
"""
Get the conditions of a Kubernetes job.

Parameters
----------
job_name : str
The name of the job to check.

Returns
-------
str or None
The type of the job's first condition, or None if the job has no conditions.
"""
try:
cmd = [
"kubectl",
"get",
"job",
job_name,
"-o",
"jsonpath={.status.conditions[0].type}",
]
result = await run_subprocess_cmds(cmd)
return result[0].strip() if result and result[0] else None
except Exception as e:
logger.debug(f"Error checking job conditions: {e}")
return None

async def get_k8s_status(job_name: str) -> str:
fredotieno (Contributor) May 5, 2026:
@fMurugi
should we rename this to something like get_aggregate_job_and_pod_status

"""Get the status of a Kubernetes job.

Parameters
----------
job_name : str
The name of the job to check.

Returns
-------
str
The status of the job.
"""
condition = await get_job_conditions(job_name)
if condition in ["Complete", "Failed"]:
return condition
# Job exists but no terminal condition → check pod
pod_phase = await get_pod_phase(job_name)
if pod_phase:
return pod_phase
return "Unknown"

async def check_k8s_job_status(tune_id: str, retry_label_lookup=True):
fredotieno (Contributor) May 5, 2026:
@fMurugi is there a missing argument here?
check_pod_phase

should we rename the function to check_tuning_task_status

Contributor (Author):
@fredotieno not sure about the argument.
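
For reference, a minimal sketch (an assumption, not part of the current diff) of how the signature could accept the flag that submit_tune_job passes:

    # Hypothetical signature; check_pod_phase does not exist on the function yet.
    async def check_k8s_job_status(
        tune_id: str,
        retry_label_lookup: bool = True,
        check_pod_phase: bool = True,
    ):
        # When check_pod_phase is False, the extra `kubectl get pods` lookup
        # could be skipped and only the job conditions consulted.
        ...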

"""Function to check Kubernetes job status

This function checks both the job status and optionally the pod phase to determine
if a job is truly running or just waiting for resources (pending).

Parameters
----------
tune_id : str
Tune id
retry_label_lookup: bool
Wheather to retry lookup with labels.
Whether to retry lookup with labels.
check_pod_phase: bool
Whether to check the pod phase to distinguish between pending and running states.
bglar marked this conversation as resolved. (Outdated)

Returns
-------
@@ -496,22 +589,10 @@ async def check_k8s_job_status(tune_id: str, retry_label_lookup=True):
# Log in
await ensure_logged_in(f"kubectl get job --namespace={settings.NAMESPACE}")

command = [
"kubectl",
"get",
"job",
kjob_id,
"-o",
"jsonpath={.status.conditions[*].type}",
]

result = await run_subprocess_cmds(command=command)
logger.info(f"kubectl run cmds result: {command} ---> {result}")
# Direct resolution via unified status function
status = await get_k8s_status(kjob_id)

if result and result[0] != "":
# Job has completion status (Complete or Failed)
status = result[0].strip()
logger.info(f"{kjob_id}: Status for job {status}")
if status not in ["Running"]:
return status, kjob_id

else:
@@ -543,7 +624,7 @@ async def check_k8s_job_status(tune_id: str, retry_label_lookup=True):
return "Running", job_name
return result if result else ("Running", job_name)

# Job exists but has no conditions - verify it exists and treat as Running
# Job exists but has no conditions - verify it exists and check pod status
verify_cmd = [
"kubectl",
"get",
@@ -555,13 +636,13 @@ async def check_k8s_job_status(tune_id: str, retry_label_lookup=True):
verify_result = await run_subprocess_cmds(command=verify_cmd)

if verify_result and verify_result[0]:
# Job exists but no status conditions yet - it's running or pending
logger.info(f"{kjob_id}: Job exists but no status conditions yet, treating as Running")
# Job exists but no status conditions yet
# Check if we should verify the pod phase
logger.info(f"{kjob_id}: Job exists but no status yet → Running")
return "Running", kjob_id
else:
# Job doesn't exist at all
logger.warning(f"{kjob_id}: Job not found in cluster")
return None, tune_id
# Job doesn't exist at all
logger.warning(f"{kjob_id}: Job not found in cluster")
return None, tune_id


async def delete_k8s_job_resources(tune_id: str):
47 changes: 38 additions & 9 deletions gfmstudio/fine_tuning/utils/tune_handlers.py
@@ -11,18 +11,19 @@
import string
from typing import Any, Dict, Optional, Tuple

import asyncio
import yaml
from asyncer import asyncify
from fastapi import HTTPException
from jinja2 import BaseLoader, Environment, runtime
from sqlalchemy.orm import Session

from gfmstudio.celery_worker import deploy_tuning_job_celery_task

from gfmstudio.common.api import crud
from gfmstudio.config import settings
from gfmstudio.fine_tuning import schemas
from gfmstudio.fine_tuning.core import object_storage, tunes
from gfmstudio.fine_tuning.core.kubernetes import deploy_tuning_job
from gfmstudio.fine_tuning.core.kubernetes import (deploy_tuning_job, check_k8s_job_status)
from gfmstudio.fine_tuning.core.schema import TuneTemplateParameters
from gfmstudio.fine_tuning.core.tuning_config_utils import (
convert_to_jinja2_compatible_braces,
@@ -39,6 +40,10 @@
)
from gfmstudio.fine_tuning.models import BaseModels, GeoDataset, Tunes, TuneTemplate
from gfmstudio.fine_tuning.utils.geoserver_handlers import convert_to_geoserver_sld
from gfmstudio.celery_worker import deploy_tuning_job_celery_task
from gfmstudio.common.api import crud, utils

tune_crud = crud.ItemCrud(model=Tunes)

logger = logging.getLogger(__name__)

@@ -731,9 +736,8 @@ async def submit_tune_job(
detail = None

try:
if settings.CELERY_TASKS_ENABLED:
# Submit via Celery
deploy_tuning_job_celery_task.apply_async(
if settings.CELERY_TASKS_ENABLED:
result = deploy_tuning_job_celery_task.apply_async(
kwargs={
"ftune_id": tune_id,
"ftune_config_file": config_path,
@@ -742,17 +746,41 @@
},
task_id=tune_id,
)
ftune_job_id = f"kjob-{tune_id}".lower()
status = "In_progress"

try:
job_result = await asyncio.to_thread(result.get, timeout=5)
ftune_job_id, job_status = job_result

if job_status == "Error":
status = "Failed"
else:
ftune_job_id = f"kjob-{tune_id}".lower()
status = "Pending"
except Exception as e:
logger.debug(f"{tune_id}: Job creation in progress: {e}")
ftune_job_id = f"kjob-{tune_id}".lower()
status = "Pending"
else:
# Submit directly
ftune_job_id, updated_status = await deploy_tuning_job(
ftune_id=tune_id,
ftune_config_file=config_path,
ftuning_runtime_image=runtime_image,
tune_type=schemas.TuneOptionEnum.K8_JOB,
)
status = updated_status or "Submitted"

if updated_status == "Error":
status = "Failed"
elif updated_status == "In_progress":
k8s_status, _ = await check_k8s_job_status(tune_id, check_pod_phase=True)

if k8s_status == "Running":
status = "In_progress"
elif k8s_status == "Pending":
status = "Pending"
else:
status = updated_status
else:
status = updated_status or "Submitted"

logger.info(f"Tune job {ftune_job_id} submitted with status: {status}")

@@ -763,3 +791,4 @@
status = "Failed"

return ftune_job_id, status, detail
