Add an optional AWS Batch job-state time limit action to the job queue.

Jobs that stay in RUNNABLE longer than `job_state_time_limit_timeout` seconds
are cancelled with `job_state_time_limit_reason`. Both inputs default to null,
in which case no action is configured and existing callers are unaffected.

NOTE(review): hunks were edited by hand; the post-image blob hashes on the
`index` lines are stale, and context-line indentation is reconstructed.

diff --git a/main.tf b/main.tf
index 9b2aaee..ee1d495 100644
--- a/main.tf
+++ b/main.tf
@@ -97,6 +97,8 @@ module "metaflow-computation" {
   launch_template_http_endpoint               = var.launch_template_http_endpoint
   launch_template_http_tokens                 = var.launch_template_http_tokens
   launch_template_http_put_response_hop_limit = var.launch_template_http_put_response_hop_limit
+  job_state_time_limit_timeout                = var.job_state_time_limit_timeout
+  job_state_time_limit_reason                 = var.job_state_time_limit_reason
 
   standard_tags = var.tags
 }
diff --git a/modules/computation/batch.tf b/modules/computation/batch.tf
index 655e4a6..d6010d3 100644
--- a/modules/computation/batch.tf
+++ b/modules/computation/batch.tf
@@ -83,5 +83,20 @@ resource "aws_batch_job_queue" "this" {
     aws_batch_compute_environment.this.arn
   ]
 
+  # Cancel jobs that sit in RUNNABLE longer than the configured timeout.
+  # Rendered only when both the timeout and the reason are set (see
+  # local.job_state_valid); otherwise the queue is created without the action.
+  dynamic "job_state_time_limit_action" {
+    for_each = local.job_state_valid ? [1] : []
+
+    content {
+      action           = "CANCEL"
+      max_time_seconds = var.job_state_time_limit_timeout
+      # For valid reasons see
+      # https://docs.aws.amazon.com/batch/latest/userguide/job_stuck_in_runnable.html
+      reason = var.job_state_time_limit_reason
+      state  = "RUNNABLE"
+    }
+  }
   tags = var.standard_tags
 }
diff --git a/modules/computation/locals.tf b/modules/computation/locals.tf
index bea84cb..9c40220 100644
--- a/modules/computation/locals.tf
+++ b/modules/computation/locals.tf
@@ -19,4 +19,7 @@ locals {
   ecs_instance_role_name = "${var.resource_prefix}ecs-iam-role${var.resource_suffix}"
 
   enable_fargate_on_batch = var.batch_type == "fargate"
+
+  # The job-state time limit action is only configured when both inputs are set.
+  job_state_valid = (var.job_state_time_limit_timeout != null && var.job_state_time_limit_reason != null)
 }
diff --git a/modules/computation/variables.tf b/modules/computation/variables.tf
index 92d11ba..bca601f 100644
--- a/modules/computation/variables.tf
+++ b/modules/computation/variables.tf
@@ -102,3 +102,17 @@ variable "launch_template_image_id" {
   nullable = true
   default  = null
 }
+
+variable "job_state_time_limit_timeout" {
+  type        = number
+  description = "Seconds a job may remain in the RUNNABLE state before it is cancelled. Leave null to disable the action."
+  nullable    = true
+  default     = null
+}
+
+variable "job_state_time_limit_reason" {
+  type        = string
+  description = "The reason to log when a job is cancelled by the job state time limit action. Leave null to disable the action."
+  nullable    = true
+  default     = null
+}
diff --git a/variables.tf b/variables.tf
index 1738c0b..9c454a3 100644
--- a/variables.tf
+++ b/variables.tf
@@ -199,3 +199,21 @@ variable "enable_key_rotation" {
   description = "Enable key rotation for KMS keys"
   default     = false
 }
+
+variable "job_state_time_limit_timeout" {
+  type        = number
+  description = "Seconds a job may remain in the RUNNABLE state before it is cancelled. Leave null to disable the action."
+  default     = null
+
+  validation {
+    # Conditional keeps the comparison from being evaluated against null.
+    condition     = var.job_state_time_limit_timeout == null ? true : var.job_state_time_limit_timeout >= 600
+    error_message = "The value for 'job_state_time_limit_timeout' must be at least 600 seconds."
+  }
+}
+
+variable "job_state_time_limit_reason" {
+  type        = string
+  description = "The reason to log for the action being taken. Leave null to disable the action."
+  default     = null
+}