diff --git a/pkg/runner/metrics.go b/pkg/runner/metrics.go
new file mode 100644
index 0000000..53f6673
--- /dev/null
+++ b/pkg/runner/metrics.go
@@ -0,0 +1,28 @@
+package runner
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+)
+
+var (
+	// DeleteRunnerBackoffDuration is a histogram of the exponential backoff
+	// durations (in seconds) applied before runner-deletion retries. It is
+	// deliberately unlabeled: a runner-UUID label would mint a new set of
+	// time series (one per bucket) for every runner, an unbounded
+	// cardinality leak since each runner has a unique, short-lived UUID.
+	DeleteRunnerBackoffDuration = promauto.NewHistogram(prometheus.HistogramOpts{
+		Namespace: "myshoes",
+		Subsystem: "runner",
+		Name:      "delete_runner_backoff_duration_seconds",
+		Help:      "Histogram of exponential backoff duration in seconds for deleting runner",
+		Buckets:   prometheus.ExponentialBuckets(1, 2, 10), // 1s, 2s, 4s, ..., 512s
+	})
+
+	// DeleteRunnerRetryTotal counts runner-deletion retries across all
+	// runners. Unlabeled for the same cardinality reason as above.
+	DeleteRunnerRetryTotal = promauto.NewCounter(prometheus.CounterOpts{
+		Namespace: "myshoes",
+		Subsystem: "runner",
+		Name:      "delete_runner_retry_total",
+		Help:      "Total number of retries for deleting runner",
+	})
+)
diff --git a/pkg/runner/runner_delete.go b/pkg/runner/runner_delete.go
index 7cd3f30..591bb75 100644
--- a/pkg/runner/runner_delete.go
+++ b/pkg/runner/runner_delete.go
@@ -118,7 +118,12 @@ func (m *Manager) removeRunners(ctx context.Context, t datastore.Target) error {
 				sem.Release(1)
 				ConcurrencyDeleting.Add(-1)
 			}()
-			time.Sleep(util.CalcRetryTime(count))
+			sleep := util.CalcRetryTime(count)
+			if count > 0 { // count == 0 is the first attempt, not a retry
+				DeleteRunnerRetryTotal.Inc()
+				DeleteRunnerBackoffDuration.Observe(sleep.Seconds())
+			}
+			time.Sleep(sleep)
 
 			if err := m.removeRunner(cctx, t, runner, ghRunners); err != nil {
 				DeleteRetryCount.Store(runner.UUID, count+1)
diff --git a/pkg/starter/metrics.go b/pkg/starter/metrics.go
new file mode 100644
index 0000000..a4fa277
--- /dev/null
+++ b/pkg/starter/metrics.go
@@ -0,0 +1,28 @@
+package starter
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+)
+
+var (
+	// AddInstanceBackoffDuration is a histogram of the exponential backoff
+	// durations (in seconds) applied before add-instance retries. It is
+	// deliberately unlabeled: a job-UUID label would mint a new set of
+	// time series (one per bucket) for every job, an unbounded
+	// cardinality leak since each job has a unique, short-lived UUID.
+	AddInstanceBackoffDuration = promauto.NewHistogram(prometheus.HistogramOpts{
+		Namespace: "myshoes",
+		Subsystem: "starter",
+		Name:      "add_instance_backoff_duration_seconds",
+		Help:      "Histogram of exponential backoff duration in seconds for adding instance",
+		Buckets:   prometheus.ExponentialBuckets(1, 2, 10), // 1s, 2s, 4s, ..., 512s
+	})
+
+	// AddInstanceRetryTotal counts add-instance retries across all jobs.
+	// Unlabeled for the same cardinality reason as above.
+	AddInstanceRetryTotal = promauto.NewCounter(prometheus.CounterOpts{
+		Namespace: "myshoes",
+		Subsystem: "starter",
+		Name:      "add_instance_retry_total",
+		Help:      "Total number of retries for adding instance",
+	})
+)
diff --git a/pkg/starter/starter.go b/pkg/starter/starter.go
index 4c70d20..dcb7297 100644
--- a/pkg/starter/starter.go
+++ b/pkg/starter/starter.go
@@ -163,6 +163,10 @@ func (s *Starter) run(ctx context.Context, ch chan datastore.Job) error {
 			inProgress.Store(job.UUID, struct{}{})
 
 			sleep := util.CalcRetryTime(count)
+			if count > 0 { // count == 0 is the first attempt, not a retry
+				AddInstanceRetryTotal.Inc()
+				AddInstanceBackoffDuration.Observe(sleep.Seconds())
+			}
 			go func(job datastore.Job, sleep time.Duration) {
 				defer func() {
 					sem.Release(1)