Skip to content

Commit

Permalink
CPD-183: Added ASG rotation plugin. (#267)
Browse files Browse the repository at this point in the history
  • Loading branch information
AbhishekRana95 authored Oct 13, 2020
1 parent 16b9d58 commit 40f27a0
Show file tree
Hide file tree
Showing 4 changed files with 566 additions and 0 deletions.
26 changes: 26 additions & 0 deletions docs/plugins/rotate_asg_instances.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Rotate ASG Instances Plugin

## Overview
The rotate ASG instances plugin rotates the outdated instances in Auto Scaling Groups. It compares the launch configuration and sees if any outdated instances are present. It detaches the instances first then shuts down the instance and waits for a new instance to replace the outdated one in ASG then proceeds to the next outdated instance.

After all outdated instances are shutdown successfully, it terminates them and reaps the associated volumes.

## Usage
It allows gracefully shutting down each instance instead of terminating them and killing all the running processes.

## Configuration
The plugin uses the ssh username specified in the `MOONSHOT_SSH_USER` or the `LOGNAME` environment variable for logging into the ASG instances to shutdown. The value should be the username with which you have the access to the instances. For example:
```ruby
export MOONSHOT_SSH_USER=abhishek.rana
```

The plugin needs no additional configuration parameters:

## Example
```ruby
Moonshot.config do |c|
# ...
c.plugins << Moonshot::Plugins::RotateAsgInstances.new
# ...
end
```
17 changes: 17 additions & 0 deletions lib/plugins/rotate_asg_instances.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# frozen_string_literal: true

require 'aws-sdk'
require_relative 'rotate_asg_instances/asg'

module Moonshot
module Plugins
# Rotate ASG instances after update.
class RotateAsgInstances
def post_update(resources)
asg = ASG.new(resources)
asg.rotate_asg_instances
asg.teardown_outdated_instances
end
end
end
end
238 changes: 238 additions & 0 deletions lib/plugins/rotate_asg_instances/asg.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
require 'moonshot/ssh_fork_executor'

module Moonshot
class ASG # rubocop:disable Metrics/ClassLength
include Moonshot::CredsHelper

def initialize(resources)
@resources = resources
@ilog = @resources.ilog
@ssh_user = ENV['MOONSHOT_SSH_USER'] || ENV['LOGNAME']
end

def asg
@asg ||=
Aws::AutoScaling::AutoScalingGroup.new(name: physical_resource_id)
end

def rotate_asg_instances
@ilog.start_threaded('Rotating ASG instances...') do |step|
@step = step
@volumes_to_delete = outdated_volumes(outdated_instances)
@shutdown_instances = cycle_instances(outdated_instances)
@step.success('ASG instances rotated successfully!')
end
end

def teardown_outdated_instances
@ilog.start_threaded('Tearing down outdated instances...') do |step|
@step = step
terminate_instances(@shutdown_instances)
reap_volumes(@volumes_to_delete)
@step.success('Outdated instances removed successfully!')
end
end

def physical_resource_id
@resources.controller.stack
.resources_of_type('AWS::AutoScaling::AutoScalingGroup')
.first.physical_resource_id
end

def outdated_instances
@outdated_instances ||=
asg.instances.reject do |i|
i.launch_configuration_name == asg.launch_configuration_name
end
end

private

def outdated_volumes(outdated_instances)
volumes = []
outdated_instances.each do |i|
begin
inst = Aws::EC2::Instance.new(id: i.id)
volumes << inst.block_device_mappings.first.ebs.volume_id
rescue StandardError => e
# We're catching all errors here, because failing to reap a volume
# is not a critical error, will not cause issues with the update.
@step.failure('Failed to get volumes for instance '\
"#{i.instance_id}: #{e.message}")
end
end
volumes
end

# Cycle the instances in the ASG.
#
# Each instance will be detached one at a time, waiting for the new instance
# to be ready before stopping the worker and terminating the instance.
#
# @param instances [Array] (outdated instances)
# List of instances to cycle. Defaults to all instances with outdated
# launch configurations.
# @return [Array] (array of Aws::AutoScaling::Instance)
# List of shutdown instances.
def cycle_instances(outdated_instances)
shutdown_instances = []

if outdated_instances.empty?
@step.success('No instances cycled!')
return []
end

@step.success("Cycling #{outdated_instances.size} " \
"of #{asg.instances.size} instances in " \
"#{physical_resource_id}...")

# Iterate over the instances in the stack, detaching and terminating each
# one.
outdated_instances.each do |i|
next if %w(Terminating Terminated).include?(i.lifecycle_state)

wait_for_instance(i)
detach_instance(i)

@step.success("Shutting down #{i.instance_id}")
shutdown_instance(i.instance_id)
shutdown_instances << i
end

@step.success('All instances cycled.')

shutdown_instances
end

# Waits for an instance to reach a ready state.
#
# @param instance [Aws::AutoScaling::Instance] Auto scaling instance to wait
# for.
def wait_for_instance(instance, state = 'InService')
instance.wait_until(max_attempts: 60, delay: 10) do |i|
i.lifecycle_state == state
end
end

# Detach an instance from its ASG. Re-attach if failed.
#
# @param instance [Aws::AutoScaling::Instance] Instance to detach.
def detach_instance(instance)
@step.success("Detaching instance: #{instance.instance_id}")

# If the ASG can't be brought up to capacity, re-attach the instance.
begin
instance.detach(should_decrement_desired_capacity: false)
@step.success('- Waiting for the AutoScaling '\
'Group to be up to capacity')
wait_for_capacity
rescue StandardError => e
@step.failure("Error bringing the ASG up to capacity: #{e.message}")
@step.failure("Attaching instance: #{instance.instance_id}")
reattach_instance(instance)
raise e
end
end

# Re-attach an instance to its ASG.
#
# @param instance [Aws::AutoScaling::Instance] Instance to re-attach.
def reattach_instance(instance)
instance.load
return unless instance.data.nil? \
|| %w(Detached Detaching).include?(instance.lifecycle_state)

until instance.data.nil? || instance.lifecycle_state == 'Detached'
sleep 10
instance.load
end
instance.attach
end

# Terminate instances.
#
# @param instances [Array] (instances for termination)
# List of instances to terminate. Defaults to all instances with outdated
# launch configurations.
def terminate_instances(outdated_instances)
if outdated_instances.any?
@step.continue(
"Terminating #{outdated_instances.size} outdated instances..."
)
end
outdated_instances.each do |asg_instance|
instance = Aws::EC2::Instance.new(asg_instance.instance_id)
begin
instance.load
rescue Aws::EC2::Errors::InvalidInstanceIDNotFound
next
end

next unless %w(stopping stopped).include?(instance.state.name)

instance.wait_until_stopped

@step.continue("Terminating #{instance.instance_id}")
instance.terminate
end
end

def reap_volumes(volumes)
volumes.each do |volume_id|
begin
@step.continue("Deleting volume: #{volume_id}")
ec2_client(region: ENV['AWS_REGION'])
.delete_volume(volume_id: volume_id)
rescue StandardError => e
# We're catching all errors here, because failing to reap a volume
# is not a critical error, will not cause issues with the release.
@step.failure("Failed to delete volume #{volume_id}: #{e.message}")
end
end
end

# Waits for the ASG to reach the desired capacity.
def wait_for_capacity
@step.continue(
'Replacing outdated instances with new instances for the AutoScaling Group...'
)
# While we wait for the asg to reach capacity, report instance statuses
# to the user.
before_wait = proc do
instances = []
asg.reload.instances.each do |i|
instances << " #{i.instance_id} (#{i.lifecycle_state})"
end

@step.continue("Instances: #{instances.join(', ')}")
end

asg.reload.wait_until(before_wait: before_wait, max_attempts: 60,
delay: 30) do |a|
instances_up = a.instances.select do |i|
i.lifecycle_state == 'InService'
end
instances_up.length == a.desired_capacity
end
@step.success('AutoScaling Group up to capacity!')
end

# Shuts down an instance, waiting for the instance to stop processing requests
# first. We do this so that services will be stopped properly.
#
# @param id [String] ID of the instance to terminate.
def shutdown_instance(id)
instance = Aws::EC2::Instance.new(id: id)
options = [
'UserKnownHostsFile=/dev/null',
'StrictHostKeyChecking=no'
]
remote = "#{@ssh_user}@#{instance.public_dns_name}"
cmd = "'sudo shutdown -h now'"
remote_cmd = "ssh -o #{options.join(' -o ')} #{remote} #{cmd}"
SSHForkExecutor.new.run(remote_cmd)

instance.wait_until_stopped
end
end
end
Loading

0 comments on commit 40f27a0

Please sign in to comment.