diff --git a/.gitignore b/.gitignore
index 4759a205e3..c78b7d1e2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@ eks-admin-cluster-role-binding.yaml
eks-admin-service-account.yaml
config-map-aws-auth*.yaml
kubeconfig_*
+.idea
#################################################################
# Default .gitignore content for all terraform-aws-modules below
diff --git a/README.md b/README.md
index d1b01cbb9e..6169d70799 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,8 @@ An example of harming update was the removal of several commonly used, but depre
By default, this module manages the `aws-auth` configmap for you (`manage_aws_auth=true`). To work around the following [issue](https://github.com/aws/containers-roadmap/issues/654), where the EKS cluster reports `ACTIVE` before it is actually ready, we implemented "retry" logic using a fork of the http provider, https://github.com/terraform-aws-modules/terraform-provider-http. This fork adds support for a self-signed CA certificate. The original PR can be found at https://github.com/hashicorp/terraform-provider-http/pull/29.
+Setting `instance_refresh_enabled` to `true` recreates your worker nodes without draining them first. It is recommended to install [aws-node-termination-handler](https://github.com/aws/aws-node-termination-handler) so that nodes are drained properly before termination. A minimal sketch follows, and the complete example lives in [instance_refresh](examples/instance_refresh).
+
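+A minimal sketch of a worker group with instance refresh enabled (cluster name, version, and network values below are illustrative; the matching aws-node-termination-handler setup is shown in the complete example):
+
+```hcl
+module "eks" {
+  source          = "terraform-aws-modules/eks/aws"
+  cluster_name    = "my-cluster"                    # illustrative
+  cluster_version = "1.19"
+  subnets         = ["subnet-aaaa", "subnet-bbbb"]  # illustrative
+  vpc_id          = "vpc-123456"                    # illustrative
+
+  worker_groups_launch_template = [
+    {
+      name                 = "refresh"
+      asg_max_size         = 2
+      asg_desired_capacity = 2
+      # A change to the launch template always triggers a refresh;
+      # "tag" additionally triggers a refresh when ASG tags change.
+      instance_refresh_enabled  = true
+      instance_refresh_triggers = ["tag"]
+    },
+  ]
+}
+```
+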
## Usage example
A full example leveraging other community modules is contained in the [examples/basic directory](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/examples/basic).
@@ -155,7 +157,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a
| [http](#provider\_http) | >= 2.3.0 |
| [kubernetes](#provider\_kubernetes) | >= 1.11.1 |
| [local](#provider\_local) | >= 1.4 |
-| [random](#provider\_random) | >= 2.1 |
## Modules
@@ -202,8 +203,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a
| [aws_security_group_rule.workers_ingress_self](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource |
| [kubernetes_config_map.aws_auth](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource |
| [local_file.kubeconfig](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource |
-| [random_pet.workers](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource |
-| [random_pet.workers_launch_template](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource |
| [aws_ami.eks_worker](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
| [aws_ami.eks_worker_windows](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
diff --git a/examples/instance_refresh/main.tf b/examples/instance_refresh/main.tf
new file mode 100644
index 0000000000..137df25b4a
--- /dev/null
+++ b/examples/instance_refresh/main.tf
@@ -0,0 +1,234 @@
+provider "aws" {
+ region = var.region
+}
+
+data "aws_caller_identity" "current" {}
+
+data "aws_eks_cluster" "cluster" {
+ name = module.eks.cluster_id
+}
+
+data "aws_eks_cluster_auth" "cluster" {
+ name = module.eks.cluster_id
+}
+
+provider "kubernetes" {
+ host = data.aws_eks_cluster.cluster.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
+ token = data.aws_eks_cluster_auth.cluster.token
+ load_config_file = false
+}
+
+provider "helm" {
+ kubernetes {
+ host = data.aws_eks_cluster.cluster.endpoint
+ cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
+ token = data.aws_eks_cluster_auth.cluster.token
+ }
+}
+
+data "aws_availability_zones" "available" {
+}
+
+locals {
+ cluster_name = "test-refresh-${random_string.suffix.result}"
+}
+
+resource "random_string" "suffix" {
+ length = 8
+ special = false
+}
+
+module "vpc" {
+ source = "terraform-aws-modules/vpc/aws"
+ version = "~> 3.0.0"
+
+ name = local.cluster_name
+ cidr = "10.0.0.0/16"
+ azs = data.aws_availability_zones.available.names
+ public_subnets = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"]
+ enable_dns_hostnames = true
+}
+
+data "aws_iam_policy_document" "node_term" {
+ statement {
+ effect = "Allow"
+ actions = [
+ "ec2:DescribeInstances",
+ "autoscaling:DescribeAutoScalingInstances",
+ "autoscaling:DescribeTags",
+ ]
+ resources = [
+ "*",
+ ]
+ }
+ statement {
+ effect = "Allow"
+ actions = [
+ "autoscaling:CompleteLifecycleAction",
+ ]
+ resources = module.eks.workers_asg_arns
+ }
+ statement {
+ effect = "Allow"
+ actions = [
+ "sqs:DeleteMessage",
+ "sqs:ReceiveMessage"
+ ]
+ resources = [
+ module.node_term_sqs.sqs_queue_arn
+ ]
+ }
+}
+
+resource "aws_iam_policy" "node_term" {
+ name = "node-term-${local.cluster_name}"
+ policy = data.aws_iam_policy_document.node_term.json
+}
+
+resource "aws_iam_role_policy_attachment" "node_term_policy" {
+ policy_arn = aws_iam_policy.node_term.arn
+ role = module.eks.worker_iam_role_name
+}
+
+data "aws_iam_policy_document" "node_term_events" {
+ statement {
+ effect = "Allow"
+ principals {
+ type = "Service"
+ identifiers = [
+ "events.amazonaws.com",
+ "sqs.amazonaws.com",
+ ]
+ }
+ actions = [
+ "sqs:SendMessage",
+ ]
+ resources = [
+ "arn:aws:sqs:${var.region}:${data.aws_caller_identity.current.account_id}:${local.cluster_name}",
+ ]
+ }
+}
+
+module "node_term_sqs" {
+ source = "terraform-aws-modules/sqs/aws"
+ version = "~> 3.0.0"
+ name = local.cluster_name
+ message_retention_seconds = 300
+ policy = data.aws_iam_policy_document.node_term_events.json
+}
+
+resource "aws_cloudwatch_event_rule" "node_term_event_rule" {
+ name = "${local.cluster_name}-nth-rule"
+ description = "Node termination event rule"
+ event_pattern = jsonencode(
+ {
+ "source" : [
+ "aws.autoscaling"
+ ],
+ "detail-type" : [
+ "EC2 Instance-terminate Lifecycle Action"
+ ]
+ "resources" : module.eks.workers_asg_arns
+ }
+ )
+}
+
+resource "aws_cloudwatch_event_target" "node_term_event_target" {
+ rule = aws_cloudwatch_event_rule.node_term_event_rule.name
+ target_id = "ANTHandler"
+ arn = module.node_term_sqs.sqs_queue_arn
+}
+
+module "node_term_role" {
+ source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc"
+ version = "4.1.0"
+ create_role = true
+ role_description = "IRSA role for ANTH, cluster ${local.cluster_name}"
+ role_name_prefix = local.cluster_name
+ provider_url = replace(module.eks.cluster_oidc_issuer_url, "https://", "")
+ role_policy_arns = [aws_iam_policy.node_term.arn]
+ oidc_fully_qualified_subjects = ["system:serviceaccount:${var.namespace}:${var.serviceaccount}"]
+}
+
+resource "helm_release" "anth" {
+ depends_on = [
+ module.eks
+ ]
+
+ name = "aws-node-termination-handler"
+ namespace = var.namespace
+ repository = "https://aws.github.io/eks-charts"
+ chart = "aws-node-termination-handler"
+ version = var.aws_node_termination_handler_chart_version
+ create_namespace = true
+
+ set {
+ name = "awsRegion"
+ value = var.region
+ }
+ set {
+ name = "serviceAccount.name"
+ value = var.serviceaccount
+ }
+ set {
+ name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
+ value = module.node_term_role.iam_role_arn
+ type = "string"
+ }
+ set {
+ name = "enableSqsTerminationDraining"
+ value = "true"
+ }
+ set {
+ name = "queueURL"
+ value = module.node_term_sqs.sqs_queue_id
+ }
+ set {
+ name = "logLevel"
+ value = "DEBUG"
+ }
+}
+
+# Creating the lifecycle-hook outside of the ASG resource's `initial_lifecycle_hook`
+# ensures that node termination does not require the lifecycle action to be completed,
+# and thus allows the ASG to be destroyed cleanly.
+resource "aws_autoscaling_lifecycle_hook" "node_term" {
+ name = "node_term-${local.cluster_name}"
+ autoscaling_group_name = module.eks.workers_asg_names[0]
+ lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING"
+ heartbeat_timeout = 300
+ default_result = "CONTINUE"
+}
+
+module "eks" {
+ source = "../.."
+ cluster_name = local.cluster_name
+ cluster_version = "1.19"
+ subnets = module.vpc.public_subnets
+ vpc_id = module.vpc.vpc_id
+ enable_irsa = true
+ worker_groups_launch_template = [
+ {
+ name = "refresh"
+ asg_max_size = 2
+ asg_desired_capacity = 2
+ instance_refresh_enabled = true
+ instance_refresh_triggers = ["tag"]
+ public_ip = true
+ metadata_http_put_response_hop_limit = 3
+ tags = [
+ {
+ key = "aws-node-termination-handler/managed"
+ value = ""
+ propagate_at_launch = true
+ },
+ {
+ key = "foo"
+ value = "buzz"
+ propagate_at_launch = true
+ },
+ ]
+ },
+ ]
+}
diff --git a/examples/instance_refresh/outputs.tf b/examples/instance_refresh/outputs.tf
new file mode 100644
index 0000000000..6b767427fb
--- /dev/null
+++ b/examples/instance_refresh/outputs.tf
@@ -0,0 +1,34 @@
+output "cluster_endpoint" {
+ description = "Endpoint for EKS control plane."
+ value = module.eks.cluster_endpoint
+}
+
+output "cluster_security_group_id" {
+ description = "Security group ids attached to the cluster control plane."
+ value = module.eks.cluster_security_group_id
+}
+
+output "kubectl_config" {
+ description = "kubectl config as generated by the module."
+ value = module.eks.kubeconfig
+}
+
+output "config_map_aws_auth" {
+ description = "A kubernetes configuration to authenticate to this EKS cluster."
+ value = module.eks.config_map_aws_auth
+}
+
+output "region" {
+ description = "AWS region."
+ value = var.region
+}
+
+output "sqs_queue_asg_notification_arn" {
+ description = "SQS queue ASG notification ARN"
+ value = module.node_term_sqs.sqs_queue_arn
+}
+
+output "sqs_queue_asg_notification_url" {
+ description = "SQS queue ASG notification URL"
+ value = module.node_term_sqs.sqs_queue_id
+}
diff --git a/examples/instance_refresh/variables.tf b/examples/instance_refresh/variables.tf
new file mode 100644
index 0000000000..96fc26df92
--- /dev/null
+++ b/examples/instance_refresh/variables.tf
@@ -0,0 +1,18 @@
+variable "region" {
+ default = "us-west-2"
+}
+
+variable "aws_node_termination_handler_chart_version" {
+ description = "Version of the aws-node-termination-handler Helm chart to install."
+ default = "0.15.0"
+}
+
+variable "namespace" {
+ description = "Namespace for the aws-node-termination-handler."
+ default = "kube-system"
+}
+
+variable "serviceaccount" {
+  description = "Service account name for the aws-node-termination-handler."
+ default = "aws-node-termination-handler"
+}
diff --git a/examples/instance_refresh/versions.tf b/examples/instance_refresh/versions.tf
new file mode 100644
index 0000000000..67281c8d51
--- /dev/null
+++ b/examples/instance_refresh/versions.tf
@@ -0,0 +1,11 @@
+terraform {
+ required_version = ">= 0.13.1"
+
+ required_providers {
+ aws = ">= 3.22.0"
+ local = ">= 1.4"
+ random = ">= 2.1"
+ kubernetes = "~> 1.11"
+ helm = "~> 2.1.2"
+ }
+}
diff --git a/local.tf b/local.tf
index 44918625fb..3fa7c1f442 100644
--- a/local.tf
+++ b/local.tf
@@ -34,7 +34,7 @@ locals {
asg_max_size = "3" # Maximum worker capacity in the autoscaling group.
asg_min_size = "1" # Minimum worker capacity in the autoscaling group. NOTE: Changing this parameter also changes asg_desired_capacity: raising it to 2 raises asg_desired_capacity to 2, but lowering it back to 1 does not change asg_desired_capacity.
asg_force_delete = false # Enable forced deletion for the autoscaling group.
- asg_initial_lifecycle_hooks = [] # Initital lifecycle hook for the autoscaling group.
+ asg_initial_lifecycle_hooks = [] # Initial lifecycle hook for the autoscaling group.
default_cooldown = null # The amount of time, in seconds, after a scaling activity completes before another scaling activity can start.
health_check_type = null # Controls how health checking is done. Valid values are "EC2" or "ELB".
health_check_grace_period = null # Time in seconds after instance comes into service before checking health.
@@ -95,6 +95,11 @@ locals {
spot_max_price = "" # Maximum price per unit hour that the user is willing to pay for the Spot instances. Default is the on-demand price
max_instance_lifetime = 0 # Maximum number of seconds instances can run in the ASG. 0 is unlimited.
elastic_inference_accelerator = null # Type of elastic inference accelerator to be attached. Example values are eia1.medium, eia2.large, etc.
+ instance_refresh_enabled = false # Enable instance refresh for the worker autoscaling group.
+  instance_refresh_strategy                = "Rolling" # Strategy to use for instance refresh. Default is 'Rolling', which is the only valid value.
+ instance_refresh_min_healthy_percentage = 90 # The amount of capacity in the ASG that must remain healthy during an instance refresh, as a percentage of the ASG's desired capacity.
+ instance_refresh_instance_warmup = null # The number of seconds until a newly launched instance is configured and ready to use. Defaults to the ASG's health check grace period.
+ instance_refresh_triggers = [] # Set of additional property names that will trigger an Instance Refresh. A refresh will always be triggered by a change in any of launch_configuration, launch_template, or mixed_instances_policy.
}
workers_group_defaults = merge(
diff --git a/workers.tf b/workers.tf
index ae5a71113a..897805df98 100644
--- a/workers.tf
+++ b/workers.tf
@@ -162,6 +162,33 @@ resource "aws_autoscaling_group" "workers" {
}
}
+ # logic duplicated in workers_launch_template.tf
+ dynamic "instance_refresh" {
+ for_each = lookup(var.worker_groups[count.index],
+ "instance_refresh_enabled",
+ local.workers_group_defaults["instance_refresh_enabled"]) ? [1] : []
+ content {
+ strategy = lookup(
+ var.worker_groups[count.index], "instance_refresh_strategy",
+ local.workers_group_defaults["instance_refresh_strategy"]
+ )
+ preferences {
+ instance_warmup = lookup(
+ var.worker_groups[count.index], "instance_refresh_instance_warmup",
+ local.workers_group_defaults["instance_refresh_instance_warmup"]
+ )
+ min_healthy_percentage = lookup(
+ var.worker_groups[count.index], "instance_refresh_min_healthy_percentage",
+ local.workers_group_defaults["instance_refresh_min_healthy_percentage"]
+ )
+ }
+ triggers = lookup(
+ var.worker_groups[count.index], "instance_refresh_triggers",
+ local.workers_group_defaults["instance_refresh_triggers"]
+ )
+ }
+ }
+
lifecycle {
create_before_destroy = true
ignore_changes = [desired_capacity]
diff --git a/workers_launch_template.tf b/workers_launch_template.tf
index 14564c1708..f22a48f294 100644
--- a/workers_launch_template.tf
+++ b/workers_launch_template.tf
@@ -156,7 +156,6 @@ resource "aws_autoscaling_group" "workers_launch_template" {
instance_type = override.value
}
}
-
}
}
}
@@ -238,6 +237,33 @@ resource "aws_autoscaling_group" "workers_launch_template" {
}
}
+ # logic duplicated in workers.tf
+ dynamic "instance_refresh" {
+ for_each = lookup(var.worker_groups_launch_template[count.index],
+ "instance_refresh_enabled",
+ local.workers_group_defaults["instance_refresh_enabled"]) ? [1] : []
+ content {
+ strategy = lookup(
+ var.worker_groups_launch_template[count.index], "instance_refresh_strategy",
+ local.workers_group_defaults["instance_refresh_strategy"]
+ )
+ preferences {
+ instance_warmup = lookup(
+ var.worker_groups_launch_template[count.index], "instance_refresh_instance_warmup",
+ local.workers_group_defaults["instance_refresh_instance_warmup"]
+ )
+ min_healthy_percentage = lookup(
+ var.worker_groups_launch_template[count.index], "instance_refresh_min_healthy_percentage",
+ local.workers_group_defaults["instance_refresh_min_healthy_percentage"]
+ )
+ }
+ triggers = lookup(
+ var.worker_groups_launch_template[count.index], "instance_refresh_triggers",
+ local.workers_group_defaults["instance_refresh_triggers"]
+ )
+ }
+ }
+
lifecycle {
create_before_destroy = true
ignore_changes = [desired_capacity]