-
Notifications
You must be signed in to change notification settings - Fork 122
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #629 from joshbranham/refactor-egress-verification
OSD-25053: Refactor `osdctl network verify-egress` in anticipation of landing GCP support
- Loading branch information
Showing
3 changed files
with
470 additions
and
412 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,388 @@ | ||
package network | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"errors" | ||
"fmt" | ||
"github.com/aws/aws-sdk-go-v2/aws" | ||
"github.com/aws/aws-sdk-go-v2/config" | ||
"github.com/aws/aws-sdk-go-v2/service/ec2" | ||
"github.com/aws/aws-sdk-go-v2/service/ec2/types" | ||
"github.com/openshift/osd-network-verifier/pkg/data/cloud" | ||
"github.com/openshift/osd-network-verifier/pkg/data/cpu" | ||
"github.com/openshift/osd-network-verifier/pkg/probes/curl" | ||
"github.com/openshift/osd-network-verifier/pkg/probes/legacy" | ||
onv "github.com/openshift/osd-network-verifier/pkg/verifier" | ||
"github.com/openshift/osdctl/pkg/osdCloud" | ||
"github.com/openshift/osdctl/pkg/utils" | ||
"os" | ||
"strings" | ||
) | ||
|
||
type egressVerificationAWSClient interface { | ||
DescribeSubnets(ctx context.Context, params *ec2.DescribeSubnetsInput, optFns ...func(options *ec2.Options)) (*ec2.DescribeSubnetsOutput, error) | ||
DescribeSecurityGroups(ctx context.Context, params *ec2.DescribeSecurityGroupsInput, optFns ...func(options *ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) | ||
DescribeRouteTables(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(options *ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) | ||
} | ||
|
||
// setupForAws configures an EgressVerification's awsClient and cluster depending on whether the ClusterId or profile | ||
// flags are supplied. It also returns an aws.Config if needed. | ||
func (e *EgressVerification) setupForAws(ctx context.Context) (*aws.Config, error) { | ||
e.cpuArch = cpu.ArchitectureByName(e.CpuArchName) | ||
if e.CpuArchName != "" && !e.cpuArch.IsValid() { | ||
return nil, fmt.Errorf("%s is not a valid architecture", e.CpuArchName) | ||
} | ||
|
||
// If ClusterId is supplied, leverage ocm and ocm-backplane to get an AWS client. | ||
// We previously hydrated the EgressVerification struct with a `cluster` in this scenario. | ||
if e.ClusterId != "" && e.cluster != nil { | ||
ocmClient, err := utils.CreateConnection() | ||
if err != nil { | ||
return nil, fmt.Errorf("error creating OCM connection: %v", err) | ||
} | ||
defer ocmClient.Close() | ||
|
||
// We currently have insufficient permissions to run network verifier on ROSA HCP | ||
// We can update or, if applicable, remove this warning after https://issues.redhat.com/browse/XCMSTRAT-245 | ||
if e.cluster.Hypershift().Enabled() { | ||
e.log.Warn(ctx, "Generally, SRE has insufficient AWS permissions"+ | ||
" to run network verifier on hosted control plane clusters. Run anyway?") | ||
if !utils.ConfirmPrompt() { | ||
return nil, errors.New("You can try the network verifier script in ops-sop/hypershift/utils/verify-egress.sh") | ||
} | ||
} | ||
|
||
e.log.Info(ctx, "getting AWS credentials from backplane-api") | ||
cfg, err := osdCloud.CreateAWSV2Config(ocmClient, e.cluster) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to get credentials automatically from backplane-api: %v."+ | ||
" You can still try this command by exporting AWS credentials as environment variables and specifying"+ | ||
" the --subnet-id, --security-group, and other required flags manually."+ | ||
" See osdctl network verify-egress -h for more details", err) | ||
} | ||
e.log.Debug(ctx, "retrieved AWS credentials from backplane-api") | ||
e.awsClient = ec2.NewFromConfig(cfg) | ||
return &cfg, nil | ||
} | ||
|
||
if e.SubnetIds == nil || e.SecurityGroupId == "" || e.platformName == "" { | ||
return nil, fmt.Errorf("--subnet-id, --security-group, and --platform are required when --cluster-id is not specified") | ||
} | ||
|
||
e.log.Info(ctx, "[WARNING] no cluster-id specified, there is reduced validation around the security group, subnet, and proxy, causing inaccurate results") | ||
e.log.Info(ctx, "using whatever default AWS credentials are locally available") | ||
cfg, err := config.LoadDefaultConfig(ctx) | ||
if err != nil { | ||
return nil, fmt.Errorf("network verification failed to find valid creds locally: %s", err) | ||
} | ||
|
||
// Additionally, if an AWS region must be provided somehow if there's no ClusterId | ||
// This could have been done via the default AWS credentials or can be supplied manually via --region | ||
if e.Region != "" { | ||
e.log.Info(ctx, "overriding region with %s", e.Region) | ||
cfg.Region = e.Region | ||
} | ||
|
||
e.awsClient = ec2.NewFromConfig(cfg) | ||
|
||
return &cfg, nil | ||
} | ||
|
||
// generateAWSValidateEgressInput is an opinionated interface in front of osd-network-verifier. | ||
// Its input is an OCM internal/external ClusterId and it returns the corresponding input to osd-network-verifier with | ||
// default AWS tags, one of the cluster's private subnet IDs, and the cluster's master security group. | ||
// Can override SecurityGroupId and SubnetIds. | ||
func (e *EgressVerification) generateAWSValidateEgressInput(ctx context.Context, region string) ([]*onv.ValidateEgressInput, error) { | ||
// We can auto-detect information from OCM | ||
if e.cluster != nil { | ||
if e.cluster.Product().ID() != "rosa" && | ||
e.cluster.Product().ID() != "osd" && | ||
e.cluster.Product().ID() != "osdtrial" { | ||
return nil, fmt.Errorf("only supports rosa, osd, and osdtrial, got %s", e.cluster.Product().ID()) | ||
} | ||
} | ||
|
||
input, err := defaultValidateEgressInput(ctx, e.cluster, region) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to assemble validate egress input: %s", err) | ||
} | ||
|
||
// Configure verifier's PlatformType based on cluster info or user request | ||
input.PlatformType, err = e.getPlatform() | ||
if err != nil { | ||
return nil, fmt.Errorf("platform not supported: %w", err) | ||
} | ||
e.log.Info(ctx, "using platform: %s", input.PlatformType) | ||
|
||
input.CPUArchitecture = e.cpuArch | ||
|
||
// Setup proxy configuration that is not automatically determined | ||
input.Proxy.NoTls = e.NoTls | ||
if e.CaCert != "" { | ||
cert, err := os.ReadFile(e.CaCert) | ||
if err != nil { | ||
return nil, err | ||
} | ||
input.Proxy.Cacert = bytes.NewBuffer(cert).String() | ||
} | ||
|
||
if e.cluster != nil { | ||
// If a KMS key is defined for the cluster, use it as the default aws/ebs key may not exist | ||
clusterAWS := e.cluster.AWS() | ||
|
||
if clusterAWS != nil { | ||
if kmsKeyArn, isOk := clusterAWS.GetKMSKeyArn(); isOk { | ||
e.log.Info(ctx, "using KMS key defined for the cluster: %s", kmsKeyArn) | ||
input.AWS.KmsKeyID = kmsKeyArn | ||
} | ||
} | ||
|
||
// If the cluster has a cluster-wide proxy, configure it | ||
if e.cluster.Proxy() != nil && !e.cluster.Proxy().Empty() { | ||
input.Proxy.HttpProxy = e.cluster.Proxy().HTTPProxy() | ||
input.Proxy.HttpsProxy = e.cluster.Proxy().HTTPSProxy() | ||
} | ||
|
||
// The actual trust bundle is redacted in OCM, but is an indicator that --cacert is required | ||
if e.cluster.AdditionalTrustBundle() != "" && e.CaCert == "" { | ||
caBundle, err := e.getCABundle(ctx) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to get additional CA trust bundle from hive with error: %v, consider specifying --cacert", err) | ||
} | ||
|
||
input.Proxy.Cacert = caBundle | ||
} | ||
} | ||
|
||
// Obtain subnet ids to run against | ||
subnetId, err := e.getSubnetIds(context.TODO()) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
input.SubnetID = subnetId[0] | ||
|
||
checkPublic, err := e.isSubnetPublic(ctx, input.SubnetID) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if checkPublic { | ||
return nil, fmt.Errorf("subnet %v you provided is public. The network verifier only works for private subnets. Please provide a private subnet ID", input.SubnetID) | ||
} | ||
|
||
// Obtain security group id to use | ||
sgId, err := e.getSecurityGroupId(context.TODO()) | ||
if err != nil { | ||
return nil, err | ||
} | ||
input.AWS.SecurityGroupIDs = []string{sgId} | ||
|
||
// Forward the timeout | ||
input.Timeout = e.EgressTimeout | ||
|
||
switch strings.ToLower(e.Probe) { | ||
case "curl": | ||
input.Probe = curl.Probe{} | ||
case "legacy": | ||
input.Probe = legacy.Probe{} | ||
default: | ||
e.log.Info(ctx, "unrecognized probe %s - defaulting to curl probe.", e.Probe) | ||
input.Probe = curl.Probe{} | ||
} | ||
|
||
// Creating a slice of input values for the network-verifier to loop over. | ||
// All inputs are essentially equivalent except their subnet ids | ||
inputs := make([]*onv.ValidateEgressInput, len(subnetId)) | ||
for i := range subnetId { | ||
// Copying a pointer to avoid overwriting it | ||
var myinput = &onv.ValidateEgressInput{} | ||
*myinput = *input | ||
inputs[i] = myinput | ||
inputs[i].SubnetID = subnetId[i] | ||
} | ||
|
||
return inputs, nil | ||
} | ||
|
||
// getSubnetIds attempts to return a private subnet ID or all private subnet IDs of the cluster. | ||
// e.SubnetIds acts as an override, otherwise e.awsClient will be used to attempt to determine the correct subnets | ||
func (e *EgressVerification) getSubnetIds(ctx context.Context) ([]string, error) { | ||
// A SubnetIds was manually specified, just use that | ||
if e.SubnetIds != nil { | ||
e.log.Info(ctx, "using manually specified subnet-id: %s", e.SubnetIds) | ||
return e.SubnetIds, nil | ||
} | ||
|
||
// If this is a non-BYOVPC cluster, we can find the private subnets based on the cluster and internal-elb tag | ||
if len(e.cluster.AWS().SubnetIDs()) == 0 { | ||
e.log.Info(ctx, "searching for subnets by tags: kubernetes.io/cluster/%s and %s=", e.cluster.InfraID(), nonByovpcPrivateSubnetTagKey) | ||
resp, err := e.awsClient.DescribeSubnets(ctx, &ec2.DescribeSubnetsInput{ | ||
Filters: []types.Filter{ | ||
{ | ||
Name: aws.String("tag-key"), | ||
Values: []string{fmt.Sprintf("kubernetes.io/cluster/%s", e.cluster.InfraID())}, | ||
}, | ||
{ | ||
Name: aws.String("tag-key"), | ||
Values: []string{nonByovpcPrivateSubnetTagKey}, | ||
}, | ||
}, | ||
}) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to find private subnets for %s: %w", e.cluster.InfraID(), err) | ||
} | ||
|
||
if len(resp.Subnets) == 0 { | ||
return nil, fmt.Errorf("found 0 subnets with tags: kubernetes.io/cluster/%s and %s, consider the --subnet-id flag", e.cluster.InfraID(), nonByovpcPrivateSubnetTagKey) | ||
} | ||
if e.AllSubnets { | ||
subnets := make([]string, len(resp.Subnets)) | ||
e.log.Debug(ctx, "Found %v subnets.", len(resp.Subnets)) | ||
for i := range resp.Subnets { | ||
e.log.Debug(ctx, "Found subnet: %v", *resp.Subnets[i].SubnetId) | ||
subnets[i] = *resp.Subnets[i].SubnetId | ||
} | ||
return subnets, nil | ||
} else { | ||
e.log.Info(ctx, "using subnet-id: %s", *resp.Subnets[0].SubnetId) | ||
return []string{*resp.Subnets[0].SubnetId}, nil | ||
} | ||
} | ||
|
||
// For PrivateLink clusters, any provided subnet is considered a private subnet | ||
if e.cluster.AWS().PrivateLink() { | ||
if len(e.cluster.AWS().SubnetIDs()) == 0 { | ||
return nil, fmt.Errorf("unexpected error: %s is a PrivateLink cluster, but no subnets in OCM", e.cluster.InfraID()) | ||
} | ||
// If the all-subnets flag is on, the network verifier will iterate over all subnets listed by ocm | ||
if e.AllSubnets { | ||
|
||
subnets := e.cluster.AWS().SubnetIDs() | ||
|
||
e.log.Debug(ctx, "Found the following subnets listed with ocm: %v", subnets) | ||
e.log.Debug(ctx, "Assigned value to var e.SubnetIds: %v", subnets) | ||
return subnets, nil | ||
|
||
} | ||
|
||
e.log.Info(ctx, "detected BYOVPC PrivateLink cluster, using first subnet from OCM: %s", e.cluster.AWS().SubnetIDs()[0]) | ||
return []string{e.cluster.AWS().SubnetIDs()[0]}, nil | ||
|
||
} | ||
|
||
// For non-PrivateLink BYOVPC clusters, provided subnets are 50/50 public/private subnets, so make the user decide for now | ||
// TODO: Figure out via IGW/NAT GW/Route Tables | ||
return nil, fmt.Errorf("unable to determine which non-PrivateLink BYOVPC subnets are private yet, please check manually and provide the --subnet-id flag") | ||
|
||
} | ||
|
||
// This function checks the gateway attached to the subnet and returns true if the subnet starts with igw- (for InternetGateway) and has a route to 0.0.0.0/0 | ||
func (e *EgressVerification) isSubnetPublic(ctx context.Context, subnetID string) (bool, error) { | ||
var routeTable string | ||
|
||
// Try and find a Route Table associated with the given subnet | ||
|
||
routeTable, err := utils.FindRouteTableForSubnetForVerification(e.awsClient, subnetID) | ||
|
||
// Check that the RouteTable for the subnet has a default route to 0.0.0.0/0 | ||
describeRouteTablesOutput, err := e.awsClient.DescribeRouteTables(ctx, &ec2.DescribeRouteTablesInput{ | ||
RouteTableIds: []string{routeTable}, | ||
}) | ||
if err != nil { | ||
return false, err | ||
} | ||
|
||
if len(describeRouteTablesOutput.RouteTables) == 0 { | ||
// Shouldn't happen | ||
return false, fmt.Errorf("no route tables found for route table id %v", routeTable) | ||
} | ||
|
||
// Checking if the attached gateway starts with igw- for Internet Gateway | ||
for _, route := range describeRouteTablesOutput.RouteTables[0].Routes { | ||
if route.GatewayId != nil && strings.HasPrefix(*route.GatewayId, "igw-") { | ||
// Some routes don't use CIDR blocks as targets, so this needs to be checked | ||
if route.DestinationCidrBlock != nil && *route.DestinationCidrBlock == "0.0.0.0/0" { | ||
return true, nil | ||
} | ||
} | ||
} | ||
|
||
// We haven't found a default route to the internet, so this subnet can't be public | ||
return false, nil | ||
} | ||
|
||
// findDefaultRouteTableForVPC returns the AWS Route Table ID of the VPC's default Route Table | ||
func (e *EgressVerification) findDefaultRouteTableForVPC(ctx context.Context, vpcID string) (string, error) { | ||
describeRouteTablesOutput, err := e.awsClient.DescribeRouteTables(ctx, &ec2.DescribeRouteTablesInput{ | ||
Filters: []types.Filter{ | ||
{ | ||
Name: aws.String("vpc-id"), | ||
Values: []string{vpcID}, | ||
}, | ||
}, | ||
}) | ||
if err != nil { | ||
return "", fmt.Errorf("failed to describe route tables associated with vpc %s: %w", vpcID, err) | ||
} | ||
|
||
for _, rt := range describeRouteTablesOutput.RouteTables { | ||
for _, assoc := range rt.Associations { | ||
if *assoc.Main { | ||
return *rt.RouteTableId, nil | ||
} | ||
} | ||
} | ||
|
||
return "", fmt.Errorf("no default route table found for vpc: %s", vpcID) | ||
} | ||
|
||
// getSecurityGroupId attempts to return a cluster's master node security group id | ||
// e.SecurityGroupId acts as an override, otherwise e.awsClient will be used to attempt to determine the correct security group | ||
func (e *EgressVerification) getSecurityGroupId(ctx context.Context) (string, error) { | ||
// A SecurityGroupId was manually specified, just use that | ||
if e.SecurityGroupId != "" { | ||
e.log.Info(ctx, "using manually specified security-group-id: %s", e.SecurityGroupId) | ||
return e.SecurityGroupId, nil | ||
} | ||
|
||
var filters []types.Filter | ||
platform, err := e.getPlatform() | ||
if err != nil { | ||
return "", err | ||
} | ||
switch platform { | ||
case cloud.AWSHCP, cloud.AWSHCPZeroEgress: | ||
filters = []types.Filter{ | ||
{ | ||
Name: aws.String("tag:Name"), | ||
Values: []string{fmt.Sprintf("%s-default-sg", e.cluster.ID())}, | ||
}, | ||
} | ||
default: | ||
filters = []types.Filter{ | ||
{ | ||
// Prior to 4.16: <infra_id>-master-sg | ||
// 4.16+: <infra_id>-controlplane | ||
Name: aws.String("tag:Name"), | ||
Values: []string{fmt.Sprintf("%s-master-sg", e.cluster.InfraID()), fmt.Sprintf("%s-controlplane", e.cluster.InfraID())}, | ||
}, | ||
} | ||
} | ||
|
||
// If no SecurityGroupId override is passed in, try to determine the master security group id | ||
e.log.Info(ctx, "searching for security group by tags: %s", filtersToString(filters)) | ||
resp, err := e.awsClient.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{ | ||
Filters: filters, | ||
}) | ||
if err != nil { | ||
return "", fmt.Errorf("failed to find security group for %s: %w", e.cluster.InfraID(), err) | ||
} | ||
|
||
if len(resp.SecurityGroups) == 0 { | ||
return "", fmt.Errorf("failed to find any security groups by tags: %s", filtersToString(filters)) | ||
} | ||
|
||
e.log.Info(ctx, "using security-group-id: %s", *resp.SecurityGroups[0].GroupId) | ||
return *resp.SecurityGroups[0].GroupId, nil | ||
} |
Oops, something went wrong.