From 30fbdf7e98bc34deb268f12e65726d5075581610 Mon Sep 17 00:00:00 2001 From: Jon Eisenstein Date: Mon, 11 May 2020 16:18:39 -0400 Subject: [PATCH] Add additional metrics for worker saturation analysis and scaling --- lib/yabeda/sidekiq.rb | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/lib/yabeda/sidekiq.rb b/lib/yabeda/sidekiq.rb index 3701169..a1abd7b 100644 --- a/lib/yabeda/sidekiq.rb +++ b/lib/yabeda/sidekiq.rb @@ -34,6 +34,10 @@ module Sidekiq gauge :active_processes, tags: [], comment: "The number of active Sidekiq worker processes." gauge :queue_latency, tags: %i[queue], comment: "The queue latency, the difference in seconds since the oldest job in the queue was enqueued" + gauge :concurrency, tags: [], comment: "The total number of jobs that can be run at a time across all processes." + gauge :available_workers, tags: [], comment: "The number of workers available for new jobs across all processes." + gauge :saturation, tags: [], comment: "Percentage of workers available for new jobs across all processes." + histogram :job_latency, comment: "The job latency, the difference in seconds between enqueued and running time", unit: :seconds, per: :job, tags: %i[queue worker], @@ -59,6 +63,27 @@ module Sidekiq sidekiq_queue_latency.set({ queue: queue.name }, queue.latency) end + # Process-level metrics. These come from a common pool, but we can calculate them as global values. + # The "quiet" flag (set when the process receives TSTP signal) is only available in the global ProcessSet, + # so we may as well get everything from there. + process_set = ::Sidekiq::ProcessSet.new + total_concurrency = 0 + total_available_workers = 0 + process_set.each do |process| + concurrency = process['concurrency'] + busy_workers = process['busy'] + available_workers = (process['quiet'] == 'true') ? 0 : (concurrency - busy_workers) + + total_concurrency += concurrency + total_available_workers += available_workers + end + # Use available_workers instead of busy_workers here because we want quieted processes to report as full. + saturation = 1 - (total_available_workers.to_f / total_concurrency) + + sidekiq_concurrency.set({}, total_concurrency) + sidekiq_available_workers.set({}, total_available_workers) + sidekiq_saturation.set({}, saturation) + # That is quite slow if your retry set is large # I don't want to enable it by default # retries_by_queues =