diff --git a/snafu/scale_openshift_wrapper/trigger_scale.py b/snafu/scale_openshift_wrapper/trigger_scale.py
index e4ad40fa..455322c2 100644
--- a/snafu/scale_openshift_wrapper/trigger_scale.py
+++ b/snafu/scale_openshift_wrapper/trigger_scale.py
@@ -34,6 +34,7 @@ def __init__(self, args):
         self.poll_interval = args.poll_interval
         self.kubeconfig = args.kubeconfig
         self.is_rosa = False
+        self.timeout = int(args.timeout * 60)
         if args.rosa_cluster is not None:
             logger.info("Identified ROSA for scaling process")
             if args.rosa_token is None:
@@ -248,6 +249,8 @@ def _run_scale(self):
                 logger.info("New worker per machine set %s" % (machine_spread))
 
         logger.info("Starting Patching of machine sets")
+        start_time = time.time()
+        end_time = start_time + self.timeout
         # Patch the machinesets
         if not self.is_rosa:
             for i in range(len(machineset_workers)):
@@ -269,6 +272,12 @@ def _run_scale(self):
             while new_machine_sets.status.readyReplicas != machine_spread[i]:
                 if new_machine_sets.status.readyReplicas is None and machine_spread[i] == 0:
                     break
+
+                current_time = time.time()
+                if current_time >= end_time:
+                    logger.error("Timeout %d seconds exceeded" % self.timeout)
+                    exit(1)
+
                 new_machine_sets = machinesets.get(
                     namespace="openshift-machine-api", name=machineset_worker_list[i].metadata.name
                 )
@@ -287,10 +296,24 @@ def _run_scale(self):
         # Ensure all workers are not listed as unschedulable
         # If we don't do this it will auto-complete a scale-down even though the workers
         # have not been eliminated yet
-        new_worker_list = nodes.get(label_selector="node-role.kubernetes.io/worker").attributes.items
+        new_worker_list = nodes.get(
+            label_selector="node-role.kubernetes.io/worker,"
+            "!node-role.kubernetes.io/master,"
+            "!node-role.kubernetes.io/infra,"
+            "!node-role.kubernetes.io/workload"
+        ).attributes.items
         for i in range(len(new_worker_list)):
             while i < len(new_worker_list) and new_worker_list[i].spec.unschedulable:
-                new_worker_list = nodes.get(label_selector="node-role.kubernetes.io/worker").attributes.items
+                current_time = time.time()
+                if current_time >= end_time:
+                    logger.error("Timeout %d seconds exceeded" % self.timeout)
+                    exit(1)
+                new_worker_list = nodes.get(
+                    label_selector="node-role.kubernetes.io/worker,"
+                    "!node-role.kubernetes.io/master,"
+                    "!node-role.kubernetes.io/infra,"
+                    "!node-role.kubernetes.io/workload"
+                ).attributes.items
                 logger.debug(
                     "Number of ready workers: %d. Waiting %d seconds for next check..."
                     % (len(new_worker_list), self.poll_interval)
@@ -298,10 +321,45 @@ def _run_scale(self):
                 time.sleep(self.poll_interval)
         logger.info("All workers schedulable")
 
+        logger.info("Verifying correct worker count")
+        current_workers = len(
+            nodes.get(
+                label_selector="node-role.kubernetes.io/worker,"
+                "!node-role.kubernetes.io/master,"
+                "!node-role.kubernetes.io/infra,"
+                "!node-role.kubernetes.io/workload"
+            ).attributes.items
+        )
+        while current_workers != int(self.scale):
+            current_time = time.time()
+            if current_time >= end_time:
+                logger.error("Timeout %d seconds exceeded" % self.timeout)
+                exit(1)
+
+            logger.debug(
+                "Number of ready workers: %d. Waiting %d seconds for next check..."
+                % (current_workers, self.poll_interval)
+            )
+            time.sleep(self.poll_interval)
+            # Refresh the worker count so the loop can observe scaling progress
+            current_workers = len(
+                nodes.get(
+                    label_selector="node-role.kubernetes.io/worker,"
+                    "!node-role.kubernetes.io/master,"
+                    "!node-role.kubernetes.io/infra,"
+                    "!node-role.kubernetes.io/workload"
+                ).attributes.items
+            )
+
+        logger.info("Correct worker count verified")
+
         worker_count = (
             len(
                 nodes.get(
-                    label_selector="node-role.kubernetes.io/worker,!node-role.kubernetes.io/master"
+                    label_selector="node-role.kubernetes.io/worker,"
+                    "!node-role.kubernetes.io/master,"
+                    "!node-role.kubernetes.io/infra,"
+                    "!node-role.kubernetes.io/workload"
                 ).attributes.items
             )
             or 0
@@ -329,6 +387,7 @@ def emit_actions(self):
             workload_count,
             platform,
             action,
+            successful,
         ) = self._run_scale()
         end_time = time.time()
         elaspsed_time = end_time - start_time
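
The new code path reads args.timeout, but the argument registration itself is not part of the hunks above; it is presumably added to the wrapper's argument parser in a companion change. As a minimal sketch (assuming argparse; the flag name, default value, and help text are illustrative, not taken from this diff), the option could be declared in minutes and converted to seconds exactly as __init__ now does:

import argparse

# Hypothetical sketch only: flag name, default, and help text are assumptions.
parser = argparse.ArgumentParser(description="scale_openshift wrapper (sketch)")
parser.add_argument(
    "--timeout",
    type=int,
    default=240,
    help="Time, in minutes, to wait for the scale operation before giving up",
)
args = parser.parse_args(["--timeout", "60"])

# Mirrors the conversion added in __init__: minutes -> seconds
timeout = int(args.timeout * 60)
print(timeout)  # 3600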