From 3d124d0efd63277e65036bf823b64f11a0ced117 Mon Sep 17 00:00:00 2001
From: iskettaneh <173953022+iskettaneh@users.noreply.github.com>
Date: Wed, 8 Oct 2025 11:53:21 -0400
Subject: [PATCH] server: return draining progress if the server controller is
 draining

This commit fixes a bug where, if the server controller is being
drained, the drain command would return a response without populating
the IsDraining field. The CLI interpreted this as the drain having
completed.

Example output when running:
```
./cockroach node drain 3 --insecure --url {pgurl:3} --logtostderr=INFO
I251008 15:54:42.613200 15 2@rpc/peer.go:613 [rnode=?,raddr=10.142.1.228:26257,class=system,rpc] 1 connection is now healthy
node is draining... remaining: 3
I251008 15:54:42.622586 1 cli/rpc_node_shutdown.go:184 [-] 2 drain details: tenant servers: 3
node is draining... remaining: 3
I251008 15:54:42.824526 1 cli/rpc_node_shutdown.go:184 [-] 3 drain details: tenant servers: 3
node is draining... remaining: 1
I251008 15:54:43.026405 1 cli/rpc_node_shutdown.go:184 [-] 4 drain details: tenant servers: 1
node is draining... remaining: 1
I251008 15:54:43.228596 1 cli/rpc_node_shutdown.go:184 [-] 5 drain details: tenant servers: 1
node is draining... remaining: 243
I251008 15:54:44.580413 1 cli/rpc_node_shutdown.go:184 [-] 6 drain details: liveness record: 1, range lease iterations: 175, descriptor leases: 67
node is draining... remaining: 0 (complete)
drain ok
```

Release note (bug fix): fixed a bug in the drain command where draining
a node using virtual clusters (such as clusters running Physical
Cluster Replication) could return before the drain was complete,
possibly resulting in shutting down the node while it still had active
SQL clients and range leases.

Epic: None
---
 pkg/server/drain.go             | 14 +++++++---
 pkg/server/drain_test.go        | 45 +++++++++++++++++++++++++++++----
 pkg/server/server_controller.go |  4 +++
 3 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/pkg/server/drain.go b/pkg/server/drain.go
index 727a152dcfda..c63e72e7f20d 100644
--- a/pkg/server/drain.go
+++ b/pkg/server/drain.go
@@ -364,10 +364,13 @@ func (s *drainServer) drainInner(
 	return s.drainNode(ctx, reporter, verbose)
 }
 
-// isDraining returns true if either SQL client connections are being drained
-// or if one of the stores on the node is not accepting replicas.
+// isDraining returns true if the SQL client connections are being drained,
+// if one of the stores on the node is not accepting replicas, or if the
+// server controller is being drained.
 func (s *drainServer) isDraining() bool {
-	return s.sqlServer.pgServer.IsDraining() || (s.kvServer.node != nil && s.kvServer.node.IsDraining())
+	return s.sqlServer.pgServer.IsDraining() ||
+		(s.kvServer.node != nil && s.kvServer.node.IsDraining()) ||
+		(s.serverCtl != nil && s.serverCtl.IsDraining())
 }
 
 // drainClients starts draining the SQL layer.
@@ -379,7 +382,10 @@ func (s *drainServer) drainClients(
 	var cancel context.CancelFunc
 	ctx, cancel = context.WithCancel(ctx)
 	defer cancel()
-	shouldDelayDraining := !s.isDraining()
+
+	// If this is the first time we are draining the clients, we delay draining
+	// to allow health probes to notice that the node is not ready.
+	shouldDelayDraining := !s.sqlServer.pgServer.IsDraining()
 
 	// Set the gRPC mode of the node to "draining" and mark the node as "not ready".
 	// Probes to /health?ready=1 will now notice the change in the node's readiness.
diff --git a/pkg/server/drain_test.go b/pkg/server/drain_test.go
index 5d6178162a07..19329b13814f 100644
--- a/pkg/server/drain_test.go
+++ b/pkg/server/drain_test.go
@@ -7,6 +7,7 @@ package server_test
 
 import (
 	"context"
+	gosql "database/sql"
 	"io"
 	"testing"
 	"time"
@@ -19,6 +20,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/sql/sqlstats"
 	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
+	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
 	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
 	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
@@ -33,15 +35,39 @@ import (
 func TestDrain(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
-	doTestDrain(t)
+	testutils.RunTrueAndFalse(t, "secondaryTenants", func(t *testing.T, secondaryTenants bool) {
+		doTestDrain(t, secondaryTenants)
+	})
 }
 
 // doTestDrain runs the drain test.
-func doTestDrain(tt *testing.T) {
+func doTestDrain(tt *testing.T, secondaryTenants bool) {
+	if secondaryTenants {
+		// Draining the tenant server takes a long time under stress. See #155229.
+		skip.UnderRace(tt)
+	}
 	var drainSleepCallCount = 0
 	t := newTestDrainContext(tt, &drainSleepCallCount)
 	defer t.Close()
 
+	var tenantConn *gosql.DB
+	if secondaryTenants {
+		_, err := t.tc.ServerConn(0).Exec("CREATE TENANT hello")
+		require.NoError(tt, err)
+		_, err = t.tc.ServerConn(0).Exec("ALTER TENANT hello START SERVICE SHARED")
+		require.NoError(tt, err)
+
+		// Wait for the tenant to be ready by establishing a test connection.
+		testutils.SucceedsSoon(tt, func() error {
+			tenantConn, err = t.tc.Server(0).SystemLayer().SQLConnE(
+				serverutils.DBName("cluster:hello/defaultdb"))
+			if err != nil {
+				return err
+			}
+			return tenantConn.Ping()
+		})
+	}
+
 	// Issue a probe. We're not draining yet, so the probe should
 	// reflect that.
 	resp := t.sendProbe()
@@ -53,7 +79,12 @@ func doTestDrain(tt *testing.T) {
 	resp = t.sendDrainNoShutdown()
 	t.assertDraining(resp, true)
 	t.assertRemaining(resp, true)
-	t.assertEqual(1, drainSleepCallCount)
+	if !secondaryTenants {
+		// If we have secondary tenants, we might not have waited before draining
+		// the clients at this point because we might have returned early if we are
+		// still draining the serverController.
+		t.assertEqual(1, drainSleepCallCount)
+	}
 
 	// Issue another probe. This checks that the server is still running
 	// (i.e. Shutdown: false was effective), the draining status is
@@ -63,12 +94,14 @@ func doTestDrain(tt *testing.T) {
 	t.assertDraining(resp, true)
 	// probe-only has no remaining.
 	t.assertRemaining(resp, false)
-	t.assertEqual(1, drainSleepCallCount)
-
+	if !secondaryTenants {
+		t.assertEqual(1, drainSleepCallCount)
+	}
 	// Repeat drain commands until we verify that there are zero remaining leases
 	// (i.e. complete). Also validate that the server did not sleep again.
 	testutils.SucceedsSoon(t, func() error {
 		resp = t.sendDrainNoShutdown()
+		t.Logf("drain response: %+v", resp)
 		if !resp.IsDraining {
 			return errors.Newf("expected draining")
 		}
@@ -78,6 +111,7 @@ func doTestDrain(tt *testing.T) {
 		}
 		return nil
 	})
+	t.assertEqual(1, drainSleepCallCount)
 
 	// Now issue a drain request without drain but with shutdown.
@@ -184,6 +218,7 @@ func newTestDrainContext(t *testing.T, drainSleepCallCount *int) *testDrainConte
 		// We need to start the cluster insecure in order to not
 		// care about TLS settings for the RPC client connection.
 		ServerArgs: base.TestServerArgs{
+			DefaultTestTenant: base.TestControlsTenantsExplicitly,
 			Knobs: base.TestingKnobs{
 				Server: &server.TestingKnobs{
 					DrainSleepFn: func(time.Duration) {
diff --git a/pkg/server/server_controller.go b/pkg/server/server_controller.go
index 37a205cde71e..02f0a5e0d517 100644
--- a/pkg/server/server_controller.go
+++ b/pkg/server/server_controller.go
@@ -168,6 +168,10 @@ func (s *serverController) SetDraining() {
 	}
 }
 
+func (s *serverController) IsDraining() bool {
+	return s.draining.Load()
+}
+
 // start monitors changes to the service mode and updates
 // the running servers accordingly.
 func (c *serverController) start(ctx context.Context, ie isql.Executor) error {
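
For context on the fix, here is a minimal, self-contained Go sketch of the pattern the patch relies on. It is not the CockroachDB implementation: every name in it (tenantController, node, drainStep, and so on) is an illustrative assumption standing in for the server controller, the drain server, and one drain RPC round trip. It shows why the drain response must keep reporting IsDraining while any layer, including the tenant server controller, is still draining.

```go
// A minimal sketch, assuming a simplified drain flow: tenant servers are
// stopped first, then the node's own SQL layer. All types are hypothetical.
package main

import (
	"fmt"
	"sync/atomic"
)

// tenantController tracks its own draining state with an atomic flag,
// mirroring the IsDraining accessor added by the patch.
type tenantController struct {
	draining atomic.Bool
	servers  int // tenant servers still shutting down
}

func (c *tenantController) IsDraining() bool { return c.draining.Load() }

// drainStatus stands in for the drain response: whether the node is draining
// and how much work remains.
type drainStatus struct {
	IsDraining bool
	Remaining  int
}

type node struct {
	sqlDraining bool
	ctl         *tenantController
}

// isDraining aggregates the per-layer states. Before the fix, the controller
// term was missing, so a node whose only remaining work was stopping tenant
// servers reported IsDraining == false.
func (n *node) isDraining() bool {
	return n.sqlDraining || (n.ctl != nil && n.ctl.IsDraining())
}

// drainStep simulates one drain request: while tenant servers are still
// stopping, it returns early with their count; only afterwards does it drain
// the node's own SQL clients.
func (n *node) drainStep() drainStatus {
	if n.ctl != nil && n.ctl.servers > 0 {
		n.ctl.draining.Store(true)
		n.ctl.servers-- // pretend one tenant server stops per step
		// +1: the controller itself still counts as pending work.
		return drainStatus{IsDraining: n.isDraining(), Remaining: n.ctl.servers + 1}
	}
	n.sqlDraining = true
	return drainStatus{IsDraining: n.isDraining(), Remaining: 0}
}

func main() {
	n := &node{ctl: &tenantController{servers: 3}}
	// CLI-style loop: keep issuing drain requests while the node reports that
	// it is draining and work remains.
	for {
		st := n.drainStep()
		fmt.Printf("node is draining... remaining: %d\n", st.Remaining)
		if !st.IsDraining || st.Remaining == 0 {
			fmt.Println("drain ok")
			return
		}
	}
}
```

In this sketch, dropping the controller term from isDraining makes the very first iteration see IsDraining == false and print "drain ok" while tenant servers are still shutting down, which is the premature completion the release note describes.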