diff --git a/.gitignore b/.gitignore index 4759a205e3..c78b7d1e2c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ eks-admin-cluster-role-binding.yaml eks-admin-service-account.yaml config-map-aws-auth*.yaml kubeconfig_* +.idea ################################################################# # Default .gitignore content for all terraform-aws-modules below diff --git a/README.md b/README.md index d1b01cbb9e..6169d70799 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,8 @@ An example of harming update was the removal of several commonly used, but depre By default, this module manages the `aws-auth` configmap for you (`manage_aws_auth=true`). To avoid the following [issue](https://github.com/aws/containers-roadmap/issues/654) where the EKS creation is `ACTIVE` but not ready. We implemented a "retry" logic with a fork of the http provider https://github.com/terraform-aws-modules/terraform-provider-http. This fork adds the support of a self-signed CA certificate. The original PR can be found at https://github.com/hashicorp/terraform-provider-http/pull/29. +Setting `instance_refresh_enabled` to true will recreate your worker nodes without draining them first. It is recommended to install [aws-node-termination-handler](https://github.com/aws/aws-node-termination-handler) for proper node draining. Find the complete example here [instance_refresh](examples/instance_refresh). + ## Usage example A full example leveraging other community modules is contained in the [examples/basic directory](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/examples/basic). @@ -155,7 +157,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a | [http](#provider\_http) | >= 2.3.0 | | [kubernetes](#provider\_kubernetes) | >= 1.11.1 | | [local](#provider\_local) | >= 1.4 | -| [random](#provider\_random) | >= 2.1 | ## Modules @@ -202,8 +203,6 @@ MIT Licensed. 
See [LICENSE](https://github.com/terraform-aws-modules/terraform-a | [aws_security_group_rule.workers_ingress_self](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource | | [kubernetes_config_map.aws_auth](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource | | [local_file.kubeconfig](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [random_pet.workers](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource | -| [random_pet.workers_launch_template](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource | | [aws_ami.eks_worker](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | | [aws_ami.eks_worker_windows](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | | [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | diff --git a/examples/instance_refresh/main.tf b/examples/instance_refresh/main.tf new file mode 100644 index 0000000000..137df25b4a --- /dev/null +++ b/examples/instance_refresh/main.tf @@ -0,0 +1,234 @@ +provider "aws" { + region = var.region +} + +data "aws_caller_identity" "current" {} + +data "aws_eks_cluster" "cluster" { + name = module.eks.cluster_id +} + +data "aws_eks_cluster_auth" "cluster" { + name = module.eks.cluster_id +} + +provider "kubernetes" { + host = data.aws_eks_cluster.cluster.endpoint + cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.cluster.token + load_config_file = false +} + +provider "helm" { + kubernetes { + host = data.aws_eks_cluster.cluster.endpoint + cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.cluster.token + } +} + +data "aws_availability_zones" "available" { +} + +locals { + cluster_name = "test-refresh-${random_string.suffix.result}" +} + +resource "random_string" "suffix" { + length = 8 + special = false +} + +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 3.0.0" + + name = local.cluster_name + cidr = "10.0.0.0/16" + azs = data.aws_availability_zones.available.names + public_subnets = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"] + enable_dns_hostnames = true +} + +data "aws_iam_policy_document" "node_term" { + statement { + effect = "Allow" + actions = [ + "ec2:DescribeInstances", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:DescribeTags", + ] + resources = [ + "*", + ] + } + statement { + effect = "Allow" + actions = [ + "autoscaling:CompleteLifecycleAction", + ] + resources = module.eks.workers_asg_arns + } + statement { + effect = "Allow" + actions = [ + "sqs:DeleteMessage", + "sqs:ReceiveMessage" + ] + resources = [ + module.node_term_sqs.sqs_queue_arn + ] + } +} + +resource "aws_iam_policy" "node_term" { + name = "node-term-${local.cluster_name}" + policy = data.aws_iam_policy_document.node_term.json +} + +resource "aws_iam_role_policy_attachment" "node_term_policy" { + policy_arn = aws_iam_policy.node_term.arn + role = module.eks.worker_iam_role_name +} + +data "aws_iam_policy_document" "node_term_events" { + statement { + effect = "Allow" + principals { + type = "Service" + identifiers = [ + 
"events.amazonaws.com", + "sqs.amazonaws.com", + ] + } + actions = [ + "sqs:SendMessage", + ] + resources = [ + "arn:aws:sqs:${var.region}:${data.aws_caller_identity.current.account_id}:${local.cluster_name}", + ] + } +} + +module "node_term_sqs" { + source = "terraform-aws-modules/sqs/aws" + version = "~> 3.0.0" + name = local.cluster_name + message_retention_seconds = 300 + policy = data.aws_iam_policy_document.node_term_events.json +} + +resource "aws_cloudwatch_event_rule" "node_term_event_rule" { + name = "${local.cluster_name}-nth-rule" + description = "Node termination event rule" + event_pattern = jsonencode( + { + "source" : [ + "aws.autoscaling" + ], + "detail-type" : [ + "EC2 Instance-terminate Lifecycle Action" + ] + "resources" : module.eks.workers_asg_arns + } + ) +} + +resource "aws_cloudwatch_event_target" "node_term_event_target" { + rule = aws_cloudwatch_event_rule.node_term_event_rule.name + target_id = "ANTHandler" + arn = module.node_term_sqs.sqs_queue_arn +} + +module "node_term_role" { + source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc" + version = "4.1.0" + create_role = true + role_description = "IRSA role for ANTH, cluster ${local.cluster_name}" + role_name_prefix = local.cluster_name + provider_url = replace(module.eks.cluster_oidc_issuer_url, "https://", "") + role_policy_arns = [aws_iam_policy.node_term.arn] + oidc_fully_qualified_subjects = ["system:serviceaccount:${var.namespace}:${var.serviceaccount}"] +} + +resource "helm_release" "anth" { + depends_on = [ + module.eks + ] + + name = "aws-node-termination-handler" + namespace = var.namespace + repository = "https://aws.github.io/eks-charts" + chart = "aws-node-termination-handler" + version = var.aws_node_termination_handler_chart_version + create_namespace = true + + set { + name = "awsRegion" + value = var.region + } + set { + name = "serviceAccount.name" + value = var.serviceaccount + } + set { + name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = module.node_term_role.iam_role_arn + type = "string" + } + set { + name = "enableSqsTerminationDraining" + value = "true" + } + set { + name = "queueURL" + value = module.node_term_sqs.sqs_queue_id + } + set { + name = "logLevel" + value = "DEBUG" + } +} + +# Creating the lifecycle-hook outside of the ASG resource's `initial_lifecycle_hook` +# ensures that node termination does not require the lifecycle action to be completed, +# and thus allows the ASG to be destroyed cleanly. +resource "aws_autoscaling_lifecycle_hook" "node_term" { + name = "node_term-${local.cluster_name}" + autoscaling_group_name = module.eks.workers_asg_names[0] + lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING" + heartbeat_timeout = 300 + default_result = "CONTINUE" +} + +module "eks" { + source = "../.." 
+ cluster_name = local.cluster_name + cluster_version = "1.19" + subnets = module.vpc.public_subnets + vpc_id = module.vpc.vpc_id + enable_irsa = true + worker_groups_launch_template = [ + { + name = "refresh" + asg_max_size = 2 + asg_desired_capacity = 2 + instance_refresh_enabled = true + instance_refresh_triggers = ["tag"] + public_ip = true + metadata_http_put_response_hop_limit = 3 + tags = [ + { + key = "aws-node-termination-handler/managed" + value = "" + propagate_at_launch = true + }, + { + key = "foo" + value = "buzz" + propagate_at_launch = true + }, + ] + }, + ] +} diff --git a/examples/instance_refresh/outputs.tf b/examples/instance_refresh/outputs.tf new file mode 100644 index 0000000000..6b767427fb --- /dev/null +++ b/examples/instance_refresh/outputs.tf @@ -0,0 +1,34 @@ +output "cluster_endpoint" { + description = "Endpoint for EKS control plane." + value = module.eks.cluster_endpoint +} + +output "cluster_security_group_id" { + description = "Security group ids attached to the cluster control plane." + value = module.eks.cluster_security_group_id +} + +output "kubectl_config" { + description = "kubectl config as generated by the module." + value = module.eks.kubeconfig +} + +output "config_map_aws_auth" { + description = "A kubernetes configuration to authenticate to this EKS cluster." + value = module.eks.config_map_aws_auth +} + +output "region" { + description = "AWS region." + value = var.region +} + +output "sqs_queue_asg_notification_arn" { + description = "SQS queue ASG notification ARN" + value = module.node_term_sqs.sqs_queue_arn +} + +output "sqs_queue_asg_notification_url" { + description = "SQS queue ASG notification URL" + value = module.node_term_sqs.sqs_queue_id +} diff --git a/examples/instance_refresh/variables.tf b/examples/instance_refresh/variables.tf new file mode 100644 index 0000000000..96fc26df92 --- /dev/null +++ b/examples/instance_refresh/variables.tf @@ -0,0 +1,18 @@ +variable "region" { + default = "us-west-2" +} + +variable "aws_node_termination_handler_chart_version" { + description = "Version of the aws-node-termination-handler Helm chart to install." + default = "0.15.0" +} + +variable "namespace" { + description = "Namespace for the aws-node-termination-handler." + default = "kube-system" +} + +variable "serviceaccount" { + description = "Serviceaccount for the aws-node-termination-handler." + default = "aws-node-termination-handler" +} diff --git a/examples/instance_refresh/versions.tf b/examples/instance_refresh/versions.tf new file mode 100644 index 0000000000..67281c8d51 --- /dev/null +++ b/examples/instance_refresh/versions.tf @@ -0,0 +1,11 @@ +terraform { + required_version = ">= 0.13.1" + + required_providers { + aws = ">= 3.22.0" + local = ">= 1.4" + random = ">= 2.1" + kubernetes = "~> 1.11" + helm = "~> 2.1.2" + } +} diff --git a/local.tf b/local.tf index 44918625fb..3fa7c1f442 100644 --- a/local.tf +++ b/local.tf @@ -34,7 +34,7 @@ locals { asg_max_size = "3" # Maximum worker capacity in the autoscaling group. asg_min_size = "1" # Minimum worker capacity in the autoscaling group. NOTE: Change in this paramater will affect the asg_desired_capacity, like changing its value to 2 will change asg_desired_capacity value to 2 but bringing back it to 1 will not affect the asg_desired_capacity. asg_force_delete = false # Enable forced deletion for the autoscaling group. - asg_initial_lifecycle_hooks = [] # Initital lifecycle hook for the autoscaling group. 
+ asg_initial_lifecycle_hooks = [] # Initial lifecycle hook for the autoscaling group. default_cooldown = null # The amount of time, in seconds, after a scaling activity completes before another scaling activity can start. health_check_type = null # Controls how health checking is done. Valid values are "EC2" or "ELB". health_check_grace_period = null # Time in seconds after instance comes into service before checking health. @@ -95,6 +95,11 @@ locals { spot_max_price = "" # Maximum price per unit hour that the user is willing to pay for the Spot instances. Default is the on-demand price max_instance_lifetime = 0 # Maximum number of seconds instances can run in the ASG. 0 is unlimited. elastic_inference_accelerator = null # Type of elastic inference accelerator to be attached. Example values are eia1.medium, eia2.large, etc. + instance_refresh_enabled = false # Enable instance refresh for the worker autoscaling group. + instance_refresh_strategy = "Rolling" # Strategy to use for instance refresh. Default is 'Rolling', which is the only valid value. + instance_refresh_min_healthy_percentage = 90 # The amount of capacity in the ASG that must remain healthy during an instance refresh, as a percentage of the ASG's desired capacity. + instance_refresh_instance_warmup = null # The number of seconds until a newly launched instance is configured and ready to use. Defaults to the ASG's health check grace period. + instance_refresh_triggers = [] # Set of additional property names that will trigger an Instance Refresh. A refresh will always be triggered by a change in any of launch_configuration, launch_template, or mixed_instances_policy. } workers_group_defaults = merge( diff --git a/workers.tf b/workers.tf index ae5a71113a..897805df98 100644 --- a/workers.tf +++ b/workers.tf @@ -162,6 +162,33 @@ resource "aws_autoscaling_group" "workers" { } } + # logic duplicated in workers_launch_template.tf + dynamic "instance_refresh" { + for_each = lookup(var.worker_groups[count.index], + "instance_refresh_enabled", + local.workers_group_defaults["instance_refresh_enabled"]) ? [1] : [] + content { + strategy = lookup( + var.worker_groups[count.index], "instance_refresh_strategy", + local.workers_group_defaults["instance_refresh_strategy"] + ) + preferences { + instance_warmup = lookup( + var.worker_groups[count.index], "instance_refresh_instance_warmup", + local.workers_group_defaults["instance_refresh_instance_warmup"] + ) + min_healthy_percentage = lookup( + var.worker_groups[count.index], "instance_refresh_min_healthy_percentage", + local.workers_group_defaults["instance_refresh_min_healthy_percentage"] + ) + } + triggers = lookup( + var.worker_groups[count.index], "instance_refresh_triggers", + local.workers_group_defaults["instance_refresh_triggers"] + ) + } + } + lifecycle { create_before_destroy = true ignore_changes = [desired_capacity] diff --git a/workers_launch_template.tf b/workers_launch_template.tf index 14564c1708..f22a48f294 100644 --- a/workers_launch_template.tf +++ b/workers_launch_template.tf @@ -156,7 +156,6 @@ resource "aws_autoscaling_group" "workers_launch_template" { instance_type = override.value } } - } } } @@ -238,6 +237,33 @@ resource "aws_autoscaling_group" "workers_launch_template" { } } + # logic duplicated in workers.tf + dynamic "instance_refresh" { + for_each = lookup(var.worker_groups_launch_template[count.index], + "instance_refresh_enabled", + local.workers_group_defaults["instance_refresh_enabled"]) ?
[1] : [] + content { + strategy = lookup( + var.worker_groups_launch_template[count.index], "instance_refresh_strategy", + local.workers_group_defaults["instance_refresh_strategy"] + ) + preferences { + instance_warmup = lookup( + var.worker_groups_launch_template[count.index], "instance_refresh_instance_warmup", + local.workers_group_defaults["instance_refresh_instance_warmup"] + ) + min_healthy_percentage = lookup( + var.worker_groups_launch_template[count.index], "instance_refresh_min_healthy_percentage", + local.workers_group_defaults["instance_refresh_min_healthy_percentage"] + ) + } + triggers = lookup( + var.worker_groups_launch_template[count.index], "instance_refresh_triggers", + local.workers_group_defaults["instance_refresh_triggers"] + ) + } + } + lifecycle { create_before_destroy = true ignore_changes = [desired_capacity]
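
For reference, a minimal sketch (not part of the diff above) of what the duplicated `dynamic "instance_refresh"` blocks render inside the `aws_autoscaling_group` resources once a worker group sets `instance_refresh_enabled = true`, using the new `local.tf` defaults together with the `instance_refresh_triggers = ["tag"]` override from examples/instance_refresh:

  instance_refresh {
    # instance_refresh_strategy default; "Rolling" is the only valid value
    strategy = "Rolling"
    preferences {
      # instance_refresh_instance_warmup default is null, so the ASG's
      # health check grace period is used
      instance_warmup        = null
      # instance_refresh_min_healthy_percentage default
      min_healthy_percentage = 90
    }
    # additional triggers on top of launch_configuration, launch_template and
    # mixed_instances_policy, which always trigger a refresh
    triggers = ["tag"]
  }

Pairing this with the aws-node-termination-handler queue-processor deployment and the `aws-node-termination-handler/managed` ASG tag from the example is what lets nodes be cordoned and drained as the refresh cycles them out.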