Skip to content

Enabling variables to control job batch limits #103

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ module "metaflow-computation" {
launch_template_http_endpoint = var.launch_template_http_endpoint
launch_template_http_tokens = var.launch_template_http_tokens
launch_template_http_put_response_hop_limit = var.launch_template_http_put_response_hop_limit
job_state_time_limit_timeout = var.job_state_time_limit_timeout
job_state_time_limit_reason = var.job_state_time_limit_reason

standard_tags = var.tags
}
Expand Down
13 changes: 13 additions & 0 deletions modules/computation/batch.tf
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ resource "aws_batch_compute_environment" "this" {
}
}



resource "aws_batch_job_queue" "this" {
name = local.batch_queue_name
state = "ENABLED"
Expand All @@ -83,5 +85,16 @@ resource "aws_batch_job_queue" "this" {
aws_batch_compute_environment.this.arn
]

# Attach a job-state time-limit action to the queue only when
# local.job_state_valid is true; otherwise the nested block is omitted.
dynamic "job_state_time_limit_action" {
  for_each = local.job_state_valid ? [1] : []

  content {
    # Jobs held in this state longer than max_time_seconds get the action below.
    state            = "RUNNABLE"
    action           = "CANCEL"
    max_time_seconds = var.job_state_time_limit_timeout

    # For valid reasons see the AWS Batch user guide:
    # https://docs.aws.amazon.com/batch/latest/userguide/job_stuck_in_runnable.html
    reason = var.job_state_time_limit_reason
  }
}
tags = var.standard_tags
}
2 changes: 2 additions & 0 deletions modules/computation/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,6 @@ locals {
ecs_instance_role_name = "${var.resource_prefix}ecs-iam-role${var.resource_suffix}"

enable_fargate_on_batch = var.batch_type == "fargate"

job_state_valid = ( var.job_state_time_limit_timeout != null && var.job_state_time_limit_reason != null )
}
10 changes: 10 additions & 0 deletions modules/computation/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,13 @@ variable "launch_template_image_id" {
nullable = true
default = null
}

# Timeout for the job queue's job_state_time_limit_action.
# The action is only created when both this variable and
# job_state_time_limit_reason are non-null, so null (the default) disables it.
variable "job_state_time_limit_timeout" {
  type        = number
  description = "Time in seconds a job may remain in the RUNNABLE state before it is cancelled. Leave null to disable the job state time limit action."
  nullable    = true
  default     = null
}

# Reason attached to the job queue's job_state_time_limit_action.
# The action is only created when both this variable and
# job_state_time_limit_timeout are non-null, so null (the default) disables it.
# Valid reasons are listed in the AWS Batch user guide:
# https://docs.aws.amazon.com/batch/latest/userguide/job_stuck_in_runnable.html
variable "job_state_time_limit_reason" {
  type        = string
  description = "The reason to log when a RUNNABLE job is cancelled by the job state time limit action. See https://docs.aws.amazon.com/batch/latest/userguide/job_stuck_in_runnable.html for valid reasons. Leave null to disable the action."
  nullable    = true
  default     = null
}
14 changes: 14 additions & 0 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,17 @@ variable "enable_key_rotation" {
description = "Enable key rotation for KMS keys"
default = false
}

# Timeout forwarded to the computation module's job_state_time_limit_action.
# Null (the default) leaves the action unconfigured; any non-null value must be
# at least 600 seconds.
variable "job_state_time_limit_timeout" {
  type        = number
  description = "Time in seconds a job may remain in the RUNNABLE state before it is cancelled. Must be at least 600. Leave null to disable the job state time limit action."
  nullable    = true
  default     = null

  validation {
    # A conditional is used so the comparison is never evaluated against null.
    condition     = var.job_state_time_limit_timeout == null ? true : var.job_state_time_limit_timeout >= 600
    error_message = "The value for 'job_state_time_limit_timeout' must be at least 600 seconds."
  }
}

# Reason forwarded to the computation module's job_state_time_limit_action.
# Null (the default) leaves the action unconfigured.
# Valid reasons are listed in the AWS Batch user guide:
# https://docs.aws.amazon.com/batch/latest/userguide/job_stuck_in_runnable.html
variable "job_state_time_limit_reason" {
  type        = string
  description = "The reason to log when a RUNNABLE job is cancelled by the job state time limit action. See https://docs.aws.amazon.com/batch/latest/userguide/job_stuck_in_runnable.html for valid reasons. Leave null to disable the action."
  nullable    = true
  default     = null
}