From 095cf28230079401726a38e9565a2c50d2ba89b1 Mon Sep 17 00:00:00 2001
From: ShikharSahay <shikharsahay285@gmail.com>
Date: Wed, 17 Jan 2024 21:50:39 +0000
Subject: [PATCH] [#20684] yugabyted: Restarting a second node of the cluster
 with --join flag throws an error.

Summary:
Previously, attempting to restart the second node of a YugabyteDB cluster with the `--join` flag resulted in an error. With this change, a node that is already recorded as a cluster member skips the join-time discovery and validation steps, so it can be restarted smoothly with the `--join` flag.
Jira: DB-9684

Test Plan: Manual Testing.

Reviewers: sgarg-yb, nikhil

Reviewed By: sgarg-yb, nikhil

Subscribers: yugabyted-dev, shikhar.sahay

Differential Revision: https://phorge.dev.yugabyte.com/D31786
---
 bin/yugabyted | 149 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 89 insertions(+), 60 deletions(-)

diff --git a/bin/yugabyted b/bin/yugabyted
index a65dc3db4e71..3901e10879e6 100755
--- a/bin/yugabyted
+++ b/bin/yugabyted
@@ -959,7 +959,10 @@ class ControlScript(object):
     # Prints status of YugabyteDB.
     def status(self):
         if len(os.listdir(self.configs.saved_data.get("data_dir"))) != 0:
-            Output.print_out(self.get_status_string())
+            Output.init_animation("Fetching status...")
+            status_output = self.get_status_string().strip()
+            Output.update_animation("")
+            Output.print_out(status_output)
         else:
             Output.print_out("{} is not running.".format(SCRIPT_NAME))
 
@@ -2162,6 +2165,7 @@ class ControlScript(object):
             return "Failed to start tserver {}".format(SCRIPT_NAME)
 
         if not was_already_setup:
+            self.configs.saved_data["cluster_member"] = True
             master_addresses = self.configs.saved_data.get("current_masters")
             universe_uuid = YBAdminProxy.get_cluster_uuid(master_addresses)
             if universe_uuid and universe_uuid != self.configs.saved_data["universe_uuid"]:
@@ -2181,6 +2185,7 @@ class ControlScript(object):
             return "Failed to start tserver {}".format(SCRIPT_NAME)
 
         if not was_already_setup:
+            self.configs.saved_data["cluster_member"] = True
             master_addresses = self.configs.saved_data.get("current_masters")
             universe_uuid = YBAdminProxy.get_cluster_uuid(master_addresses)
             if universe_uuid and universe_uuid != self.configs.saved_data["universe_uuid"]:
@@ -3240,6 +3245,10 @@ class ControlScript(object):
             Output.log('HTTP error occurred while fetching current' +
                     'masters from tserver: {}', http_err)
             return ''
+        except URLError as url_err:
+            Output.log('URL error occurred while fetching current' +
+                    'masters from tserver: {}', url_err)
+            return ''
         except Exception as err:
             Output.log('Other error occurred while fetching current' +
                     'masters from tserver: {}', err)
@@ -4357,7 +4366,6 @@ class ControlScript(object):
             # join_ip, let's try to add ourselves to it, otherwise
             # it is a hard failure.
             if current_node_master_uuid in master_uuids:
-                self.configs.saved_data["cluster_member"] = True
                 if not join_ip:
                     placement_uuid = self.configs.saved_data.get("placement_uuid")
                     placement_info = list()
@@ -4402,7 +4410,6 @@ class ControlScript(object):
 
         try:
             if retry_op_with_argument(self.get_master_uuids, master_addrs, timeout):
-                self.configs.saved_data["cluster_member"] = True
                 Output.log("Completed setup and wait for master.")
                 return True
         except RuntimeError:
@@ -4550,6 +4557,8 @@ class ControlScript(object):
             if was_already_setup:
                 if master_addrs:
                     status = "Running."
+                else:
+                    status = "Bootstrapping."
             else:
                 if self.wait_get_all_masters(timeout=10):
                     status = "Running."
@@ -4582,10 +4591,10 @@ class ControlScript(object):
                 Output.update_animation("Data placement constraint successfully verified")
             else:
                 rf = YBAdminProxy.get_cluster_rf(master_addrs)
-            status_info = [
-                (Output.make_yellow("Status"), status),
-                (Output.make_yellow("Replication Factor"), rf),
-            ]
+
+            status_info = [(Output.make_yellow("Status"), status)]
+            if rf:
+                status_info.append((Output.make_yellow("Replication Factor"), rf))
 
             if enabled_security_features:
                 status_info += [
@@ -5814,6 +5823,7 @@ class ControlScript(object):
             if args.background is None:
                 args.background = "True"
 
+            cluster_member = self.configs.saved_data.get("cluster_member")
             if args.join is not None:
                 if not self.validate_hostname_ip(args.join):
                     Output.log_error_and_exit(Output.make_red("ERROR") + ": --join" +
@@ -5821,66 +5831,79 @@ class ControlScript(object):
                         "IPV6 or DNS.")
 
                 Output.print_and_log("Fetching configs from join IP...")
-                # Check if tserver webserver at join_IP is reachable or not
-                # Also get the leader master(used to get the info of all tservers)
-                master_leader = self.get_current_master_leader_from_api(args.join)
-                args.join = master_leader
-
-                # Get info on all tservers
-                master_leader_hostport = "{}:{}".format(master_leader,
-                                    self.configs.saved_data.get("master_webserver_port"))
-                tservers_info = dict(self.get_all_tserver_info(master_leader_hostport))
-
-                # Check if any existing node has the same IP as advertise address
-                for uuid, nodes in tservers_info.items():
-                    for node in [node.split(":")[0] for node in list(nodes.keys())]:
-                        if args.advertise_address == node:
-                            Output.log_error_and_exit(Output.make_red("ERROR:") + " A node is " +
-                                        "already running on {}, please ".format(args.join) +
-                                        "specify a valid address.")
-
-                is_placement_uuid_set = False
-
-                # Set placement UUID for the node according to it's properties(rr or primary)
-                if args.read_replica:
-                    # When the 1st read replica node is started use a new uuid
-                    if len(tservers_info) == 1:
-                        is_placement_uuid_set = True
-                        Output.log("Starting first read replica node. " +
-                                    "Using {} as placement_uuid".format(
-                                        self.configs.saved_data.get("placement_uuid")))
-                    # When a read replica cluster exists use the existing placement UUID
+                if not cluster_member:
+                    # Check if tserver webserver at join_IP is reachable or not
+                    # Also get the leader master(used to get the info of all tservers)
+                    master_leader = self.get_current_master_leader_from_api(args.join)
+                    args.join = master_leader
+
+                    # Get info on all tservers
+                    master_leader_hostport = "{}:{}".format(master_leader,
+                                        self.configs.saved_data.get("master_webserver_port"))
+                    tservers_info = dict(self.get_all_tserver_info(master_leader_hostport))
+
+                    # Check if any existing node has the same IP as advertise address
+                    for uuid, nodes in tservers_info.items():
+                        for node in [node.split(":")[0] for node in list(nodes.keys())]:
+                            if args.advertise_address == node:
+                                Output.log_error_and_exit(Output.make_red("ERROR:") + " A node" +
+                                    " is already running on {}, please ".format(args.join) +
+                                    "specify a valid address.")
+
+                    is_placement_uuid_set = False
+
+                    # Set placement UUID for the node according to its properties (rr or primary)
+                    if args.read_replica:
+                        # When the 1st read replica node is started use a new uuid
+                        if len(tservers_info) == 1:
+                            is_placement_uuid_set = True
+                            Output.log("Starting first read replica node. " +
+                                        "Using {} as placement_uuid".format(
+                                            self.configs.saved_data.get("placement_uuid")))
+                        # When a read replica cluster exists use the existing placement UUID
+                        else:
+                            for uuid, nodes in tservers_info.items():
+                                nodes_list = [node.split(":")[0] for node in list(nodes.keys())]
+                                if master_leader not in nodes_list and len(nodes) != 0:
+                                    self.configs.saved_data["placement_uuid"] = uuid
+                                    Output.log("Using placement_uuid {} from ".format(uuid) +
+                                                "existing read replica cluster.")
+                                    is_placement_uuid_set = True
                     else:
+                        # Use placement uuid set for the primary cluster when 1st node was started.
                         for uuid, nodes in tservers_info.items():
                             nodes_list = [node.split(":")[0] for node in list(nodes.keys())]
-                            if master_leader not in nodes_list and len(nodes) != 0:
+                            if master_leader in nodes_list:
                                 self.configs.saved_data["placement_uuid"] = uuid
                                 Output.log("Using placement_uuid {} from ".format(uuid) +
-                                            "existing read replica cluster.")
+                                                "existing primary cluster.")
                                 is_placement_uuid_set = True
-                else:
-                    # Use placement uuid set for the primary cluster when 1st node was started.
-                    for uuid, nodes in tservers_info.items():
-                        nodes_list = [node.split(":")[0] for node in list(nodes.keys())]
-                        if master_leader in nodes_list:
-                            self.configs.saved_data["placement_uuid"] = uuid
-                            Output.log("Using placement_uuid {} from ".format(uuid) +
-                                            "existing primary cluster.")
-                            is_placement_uuid_set = True
 
-                # If placement UUID could not be set for some reason, throw an error
-                if not is_placement_uuid_set:
-                    Output.log("Cannot find placement UUID for the node. " +
-                            "Leader Master node: {}. ".format(master_leader) +
-                            "Response from tablet-servers API: {}".format(
-                                str(tservers_info)))
-                    Output.log_error_and_exit(Output.make_red("ERROR:") +
-                            " Unable to start the node.")
+                    # If placement UUID could not be set for some reason, throw an error
+                    if not is_placement_uuid_set:
+                        Output.log("Cannot find placement UUID for the node. " +
+                                "Leader Master node: {}. ".format(master_leader) +
+                                "Response from tablet-servers API: {}".format(
+                                    str(tservers_info)))
+                        Output.log_error_and_exit(Output.make_red("ERROR:") +
+                                " Unable to start the node.")
+
+                # Restart node as a part of an existing cluster with the join flag specified
+                else:
+                    Output.log("Restarting node as part of an existing cluster. " +
+                                            "Using {} as placement_uuid".format(
+                                            self.configs.saved_data.get("placement_uuid")))
 
-            # If no --join is passed then start a new cluster with a new placement_uuid
+            # If no --join is passed, check whether it's a first-time start or a restart
             else:
-                Output.log("Starting first primary node. Using {} as placement_uuid".format(
+                if not cluster_member:
+                    Output.log("Starting first primary node. Using {} as placement_uuid".format(
                                            self.configs.saved_data.get("placement_uuid")))
+                # Restart node as a part of an existing cluster without the join flag specified
+                else:
+                    Output.log("Restarting node as part of an existing cluster. " +
+                                            "Using {} as placement_uuid".format(
+                                            self.configs.saved_data.get("placement_uuid")))
 
             self.find_security_nature_of_deployment(args)
 
@@ -5891,7 +5914,7 @@ class ControlScript(object):
                     ": --certs_dir flag needs to be accompanied with the --secure flag.")
 
             if args.insecure:
-                if args.join:
+                if args.join and not cluster_member:
                     master_hostport = "{}:{}".format(args.join,
                             self.configs.saved_data.get("master_webserver_port"))
                     if self.is_leader_master_secure(master_hostport):
@@ -5902,7 +5925,7 @@ class ControlScript(object):
                             "IP was provided in --join flag has SSL/TLS enabled. Cannot join a " +
                             "secure and an insecure node.")
             elif args.secure:
-                if args.join:
+                if args.join and not cluster_member:
                     master_hostport = "{}:{}".format(args.join,
                             self.configs.saved_data.get("master_webserver_port"))
                     if not self.is_leader_master_secure(master_hostport):
@@ -6535,6 +6558,7 @@ class Configs(object):
             "backup_daemon": False,
             "dns_enabled": False,
             "read_replica": False,
+            "cluster_member": False,
         }
         # Used to store data specific to certain functions that we don't want to save.
         self.temp_data = {
@@ -7951,7 +7975,12 @@ class Output(object):
                     symbol = status
                     running = False
 
-                line = "\r{} {}".format(symbol, msg)
+                if msg == "":
+                    line = "\r" + " " * line_len
+                    running = False
+                else:
+                    line = "\r{} {}".format(symbol, msg)
+
                 line_len = max(len(line), line_len)
                 line_to_write = "{:<{}}".format(line, line_len)
                 if not running: