Skip to content

Commit

Permalink
feat: Adds vcpu and memory metrics per instance type (aws#3824)
Browse files Browse the repository at this point in the history
  • Loading branch information
valdisrigdon authored May 9, 2023
1 parent e6d76e3 commit 0ddf200
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 3 deletions.
10 changes: 10 additions & 0 deletions pkg/providers/instancetype/instancetype.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"sync"
"sync/atomic"

"github.com/prometheus/client_golang/prometheus"

awscache "github.com/aws/karpenter/pkg/cache"

"github.com/aws/aws-sdk-go/aws"
Expand Down Expand Up @@ -103,6 +105,14 @@ func (p *Provider) List(ctx context.Context, kc *v1alpha5.KubeletConfiguration,
result := lo.Map(instanceTypes, func(i *ec2.InstanceTypeInfo, _ int) *cloudprovider.InstanceType {
return NewInstanceType(ctx, i, kc, p.region, nodeTemplate, p.createOfferings(ctx, i, instanceTypeZones[aws.StringValue(i.InstanceType)]))
})
for _, instanceType := range instanceTypes {
InstanceTypeVCPU.With(prometheus.Labels{
InstanceTypeLabel: *instanceType.InstanceType,
}).Set(float64(aws.Int64Value(instanceType.VCpuInfo.DefaultVCpus)))
InstanceTypeMemory.With(prometheus.Labels{
InstanceTypeLabel: *instanceType.InstanceType,
}).Set(float64(aws.Int64Value(instanceType.MemoryInfo.SizeInMiB) * 1024 * 1024))
}
p.cache.SetDefault(key, result)
return result, nil
}
Expand Down
56 changes: 56 additions & 0 deletions pkg/providers/instancetype/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package instancetype

import (
"github.com/prometheus/client_golang/prometheus"
crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"

"github.com/aws/karpenter-core/pkg/metrics"
)

// cloudProviderSubsystem is the Prometheus subsystem segment shared by the
// instance type gauges declared in this file.
const cloudProviderSubsystem = "cloudprovider"

// InstanceTypeLabel is the name of the single label carried by the instance
// type gauges declared below; its value is the instance type name.
var InstanceTypeLabel = "instance_type"

// InstanceTypeVCPU is a gauge vector, partitioned by InstanceTypeLabel, that
// holds the vCPU count recorded for each instance type.
var InstanceTypeVCPU = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: metrics.Namespace,
		Subsystem: cloudProviderSubsystem,
		Name:      "instance_type_cpu_cores",
		Help:      "VCPUs cores for a given instance type.",
	},
	[]string{InstanceTypeLabel},
)

// InstanceTypeMemory is a gauge vector, partitioned by InstanceTypeLabel, that
// holds the memory size in bytes recorded for each instance type.
var InstanceTypeMemory = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: metrics.Namespace,
		Subsystem: cloudProviderSubsystem,
		Name:      "instance_type_memory_bytes",
		Help:      "Memory, in bytes, for a given instance type.",
	},
	[]string{InstanceTypeLabel},
)

// init registers the instance type gauges with the controller-runtime metrics
// registry. MustRegister panics if a collector cannot be registered.
func init() {
	crmetrics.Registry.MustRegister(InstanceTypeVCPU)
	crmetrics.Registry.MustRegister(InstanceTypeMemory)
}
28 changes: 28 additions & 0 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,34 @@ var _ = Describe("Instance Types", func() {
Expect(it.Capacity.Pods().Value()).ToNot(BeNumerically("==", 110))
}
})
It("should expose vcpu metrics for instance types", func() {
	// Listing instance types is expected to leave a vCPU gauge behind for
	// every instance type that was returned.
	instanceTypes, err := awsEnv.InstanceTypesProvider.List(ctx, provisioner.Spec.KubeletConfiguration, nodeTemplate)
	Expect(err).To(BeNil())
	Expect(len(instanceTypes)).To(BeNumerically(">", 0))
	for _, it := range instanceTypes {
		labels := map[string]string{
			instancetype.InstanceTypeLabel: it.Name,
		}
		gauge, found := FindMetricWithLabelValues("karpenter_cloudprovider_instance_type_cpu_cores", labels)
		Expect(found).To(BeTrue())
		Expect(gauge).ToNot(BeNil())
		// A real instance type always has at least one vCPU.
		Expect(aws.Float64Value(gauge.GetGauge().Value)).To(BeNumerically(">", 0))
	}
})
It("should expose memory metrics for instance types", func() {
	// Listing instance types is expected to leave a memory gauge behind for
	// every instance type that was returned.
	instanceTypes, err := awsEnv.InstanceTypesProvider.List(ctx, provisioner.Spec.KubeletConfiguration, nodeTemplate)
	Expect(err).To(BeNil())
	Expect(len(instanceTypes)).To(BeNumerically(">", 0))
	for _, it := range instanceTypes {
		labels := map[string]string{
			instancetype.InstanceTypeLabel: it.Name,
		}
		gauge, found := FindMetricWithLabelValues("karpenter_cloudprovider_instance_type_memory_bytes", labels)
		Expect(found).To(BeTrue())
		Expect(gauge).ToNot(BeNil())
		// The gauge is reported in bytes and must be strictly positive.
		Expect(aws.Float64Value(gauge.GetGauge().Value)).To(BeNumerically(">", 0))
	}
})

Context("Overhead", func() {
var info *ec2.InstanceTypeInfo
Expand Down
2 changes: 1 addition & 1 deletion pkg/providers/pricing/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ var (
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: cloudProviderSubsystem,
Name: "instance_price_estimate",
Name: "instance_type_price_estimate",
Help: "Estimated hourly price used when making informed decisions on node cost calculation. This is updated once on startup and then every 12 hours.",
},
[]string{
Expand Down
2 changes: 1 addition & 1 deletion pkg/providers/pricing/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ var _ = Describe("Pricing", func() {

func getPricingEstimateMetricValue(instanceType string, capacityType string, zone string) float64 {
var value *float64
metric, ok := FindMetricWithLabelValues("karpenter_cloudprovider_instance_price_estimate", map[string]string{
metric, ok := FindMetricWithLabelValues("karpenter_cloudprovider_instance_type_price_estimate", map[string]string{
pricing.InstanceTypeLabel: instanceType,
pricing.CapacityTypeLabel: capacityType,
pricing.RegionLabel: "",
Expand Down
13 changes: 13 additions & 0 deletions pkg/test/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ import (
"github.com/aws/karpenter/pkg/providers/subnet"

coretest "github.com/aws/karpenter-core/pkg/test"

crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

type Environment struct {
Expand Down Expand Up @@ -146,4 +148,15 @@ func (env *Environment) Reset() {
env.LaunchTemplateCache.Flush()
env.SubnetCache.Flush()
env.SecurityGroupCache.Flush()

// Best-effort reset of gathered metric state so values do not carry over
// between tests. The metric families are only walked when Gather succeeded;
// a failed Gather is deliberately ignored because Reset has no error return.
mfs, err := crmetrics.Registry.Gather()
if err == nil {
	for _, mf := range mfs {
		for _, metric := range mf.GetMetric() {
			if metric != nil {
				// NOTE(review): Gather appears to return freshly built
				// snapshots, so this may not clear the registered
				// collectors themselves — confirm, and reset the
				// collectors directly if state must not leak across tests.
				metric.Reset()
			}
		}
	}
}
}
8 changes: 7 additions & 1 deletion website/content/en/preview/concepts/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,13 @@ Pod state is the current state of pods. This metric can be used several ways as
### `karpenter_cloudprovider_duration_seconds`
Duration of cloud provider method calls. Labeled by the controller, method name and provider.

### `karpenter_cloudprovider_instance_price_estimate`
### `karpenter_cloudprovider_instance_type_cpu_cores`
VCPUs cores for a given instance type.

### `karpenter_cloudprovider_instance_type_memory_bytes`
Memory, in bytes, for a given instance type.

### `karpenter_cloudprovider_instance_type_price_estimate`
Estimated hourly price used when making informed decisions on node cost calculation. This is updated once on startup and then every 12 hours.

## Cloudprovider Batcher Metrics
Expand Down

0 comments on commit 0ddf200

Please sign in to comment.