-
Notifications
You must be signed in to change notification settings - Fork 4
✨ feat: add pod status checking to distinguish pending vs running jobs #51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 18 commits
34d0160
50a8377
38eb99f
f9f8302
daef4a5
2132ab1
51df810
3e2b703
032b1c5
84b7394
c929fa0
0ae5786
8fb844d
031a4f6
ea40330
76151db
f295ca3
e2a2c2b
a536f2f
b38de23
fce8ecb
053e025
bd518d0
ef8ba3c
4db5d05
dfd9de2
3dbe783
8f8aae2
b02a45c
f5007f9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -475,15 +475,108 @@ async def monitor_k8_job_completion(ftune_id: str, monitor_task=None): | |
| logger.info(f"{ftune_id}: Scheduled monitoring task for job completion") | ||
|
|
||
|
|
||
| async def get_pod_phase(job_name: str) -> str | None: | ||
| """Check the status of a pod associated with a Kubernetes job. | ||
|
|
||
| This function checks if the pod is actually running, not just pending. | ||
| Useful for determining if a job is truly in progress or just waiting for resources. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| job_name : str | ||
| The Kubernetes job name | ||
|
|
||
| Returns | ||
| ------- | ||
| str | ||
| The pod phase: 'Running', 'Pending', 'Succeeded', 'Failed', 'Unknown', or None if no pod found | ||
| """ | ||
| try: | ||
| await ensure_logged_in(f"kubectl get job --namespace={settings.NAMESPACE}") | ||
|
|
||
| # Get pod status using the job-name label | ||
| command = [ | ||
| "kubectl", | ||
| "get", | ||
| "pods", | ||
| "-l", | ||
| f"job-name={job_name}", | ||
| "-o", | ||
| "jsonpath={.items[0].status.phase}", | ||
| ] | ||
|
|
||
| result = await run_subprocess_cmds(command=command) | ||
| return result[0].strip() if result and result[0] else None | ||
|
|
||
| except Exception as e: | ||
| # Handle case where job/pod has been deleted by webhook | ||
| logger.debug(f"{job_name}: Error checking pod status (likely deleted): {e}") | ||
| return None | ||
|
|
||
| async def get_job_conditions(job_name: str) -> str | None: | ||
| """ | ||
| Get the conditions of a Kubernetes job. | ||
| Parameters | ||
| ---------- | ||
| job_name : str | ||
| The name of the job to check. | ||
| Returns | ||
| ------- | ||
| str | ||
| The conditions of the job. | ||
| None | ||
| If the job has no conditions. | ||
| """ | ||
| try: | ||
| cmd =[ | ||
| "kubectl", | ||
| "get", | ||
| "job", | ||
| job_name, | ||
| "-o", | ||
| "jsonpath={.status.conditions[0].type}", | ||
|
|
||
| ] | ||
| result= await run_subprocess_cmds(cmd) | ||
| return result[0].strip() if result and result[0] else None | ||
| except Exception as e: | ||
| logger.debug(f"Error checking job conditions: {e}") | ||
| return None | ||
|
|
||
| async def get_k8s_status(job_name: str) -> str: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @fMurugi |
||
| """Get the status of a Kubernetes job. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| job_name : str | ||
| The name of the job to check. | ||
| Returns | ||
| str | ||
| The status of the job. | ||
| """ | ||
| condition = await get_job_conditions(job_name) | ||
| if condition in ["Complete","Failed"]: | ||
| return condition | ||
| # Job exists but no terminal condition → check pod | ||
| pod_phase = await get_pod_phase(job_name) | ||
| if pod_phase: | ||
| return pod_phase | ||
| return "Unknown" | ||
|
|
||
| async def check_k8s_job_status(tune_id: str, retry_label_lookup=True): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @fMurugi is there a missing argument here? should we rename the function to
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @fredotieno not sure about the argument. |
||
| """Function to check Kubernetes job status | ||
|
|
||
| This function checks both the job status and optionally the pod phase to determine | ||
| if a job is truly running or just waiting for resources (pending). | ||
|
|
||
| Parameters | ||
| ---------- | ||
| tune_id : str | ||
| Tune id | ||
| retry_label_lookup: bool | ||
| Wheather to retry lookup with labels. | ||
| Whether to retry lookup with labels. | ||
| check_pod_phase: bool | ||
| Whether to check the pod phase to distinguish between pending and running states. | ||
|
bglar marked this conversation as resolved.
Outdated
|
||
|
|
||
| Returns | ||
| ------- | ||
|
|
@@ -496,22 +589,10 @@ async def check_k8s_job_status(tune_id: str, retry_label_lookup=True): | |
| # Log in | ||
| await ensure_logged_in(f"kubectl get job --namespace={settings.NAMESPACE}") | ||
|
|
||
| command = [ | ||
| "kubectl", | ||
| "get", | ||
| "job", | ||
| kjob_id, | ||
| "-o", | ||
| "jsonpath={.status.conditions[*].type}", | ||
| ] | ||
|
|
||
| result = await run_subprocess_cmds(command=command) | ||
| logger.info(f"kubectl run cmds result: {command} ---> {result}") | ||
| # Direct resolution via unified status function | ||
| status = await get_k8s_status(kjob_id) | ||
|
|
||
| if result and result[0] != "": | ||
| # Job has completion status (Complete or Failed) | ||
| status = result[0].strip() | ||
| logger.info(f"{kjob_id}: Status for job {status}") | ||
| if status not in ["Running"]: | ||
| return status, kjob_id | ||
|
|
||
| else: | ||
|
|
@@ -543,7 +624,7 @@ async def check_k8s_job_status(tune_id: str, retry_label_lookup=True): | |
| return "Running", job_name | ||
| return result if result else ("Running", job_name) | ||
|
|
||
| # Job exists but has no conditions - verify it exists and treat as Running | ||
| # Job exists but has no conditions - verify it exists and check pod status | ||
| verify_cmd = [ | ||
| "kubectl", | ||
| "get", | ||
|
|
@@ -555,13 +636,13 @@ async def check_k8s_job_status(tune_id: str, retry_label_lookup=True): | |
| verify_result = await run_subprocess_cmds(command=verify_cmd) | ||
|
|
||
| if verify_result and verify_result[0]: | ||
| # Job exists but no status conditions yet - it's running or pending | ||
| logger.info(f"{kjob_id}: Job exists but no status conditions yet, treating as Running") | ||
| # Job exists but no status conditions yet | ||
| # Check if we should verify the pod phase | ||
| logger.info(f"{kjob_id}: Job exists but no status yet → Running") | ||
| return "Running", kjob_id | ||
| else: | ||
| # Job doesn't exist at all | ||
| logger.warning(f"{kjob_id}: Job not found in cluster") | ||
| return None, tune_id | ||
| # Job doesn't exist at all | ||
| logger.warning(f"{kjob_id}: Job not found in cluster") | ||
| return None, tune_id | ||
|
|
||
|
|
||
| async def delete_k8s_job_resources(tune_id: str): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a great introduction. Thinking of easily maintaining this in the future, creating a reference schema class for these statuses would be ideal, ensuring that you would only have one place to change or update the values in case of a future change/update.
Something like ...
Then when returning the outputs from k8s,
And whenever in your code you are using the values you can easily use the values i.e
PodStatusPhase.FAILED