Skip to content

Commit c6b5625

Browse files
authored
Merge pull request #156312 from cockroachdb/blathers/backport-staging-v25.2.8-155640
staging-v25.2.8: release-25.2: server: return draining progress if the server controller is draining
2 parents aa7bc62 + 93fbb91 commit c6b5625

File tree

3 files changed

+54
-9
lines changed

3 files changed

+54
-9
lines changed

pkg/server/drain.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -364,10 +364,13 @@ func (s *drainServer) drainInner(
364364
return s.drainNode(ctx, reporter, verbose)
365365
}
366366

367-
// isDraining returns true if either SQL client connections are being drained
368-
// or if one of the stores on the node is not accepting replicas.
367+
// isDraining returns true if the SQL client connections are being drained,
368+
// if one of the stores on the node is not accepting replicas, or if the
369+
// server controller is being drained.
369370
func (s *drainServer) isDraining() bool {
370-
return s.sqlServer.pgServer.IsDraining() || (s.kvServer.node != nil && s.kvServer.node.IsDraining())
371+
return s.sqlServer.pgServer.IsDraining() ||
372+
(s.kvServer.node != nil && s.kvServer.node.IsDraining()) ||
373+
(s.serverCtl != nil && s.serverCtl.IsDraining())
371374
}
372375

373376
// drainClients starts draining the SQL layer.
@@ -379,7 +382,10 @@ func (s *drainServer) drainClients(
379382
var cancel context.CancelFunc
380383
ctx, cancel = context.WithCancel(ctx)
381384
defer cancel()
382-
shouldDelayDraining := !s.isDraining()
385+
386+
// If this is the first time we are draining the clients, we delay draining
387+
// to allow health probes to notice that the node is not ready.
388+
shouldDelayDraining := !s.sqlServer.pgServer.IsDraining()
383389

384390
// Set the gRPC mode of the node to "draining" and mark the node as "not ready".
385391
// Probes to /health?ready=1 will now notice the change in the node's readiness.

pkg/server/drain_test.go

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package server_test
77

88
import (
99
"context"
10+
gosql "database/sql"
1011
"io"
1112
"testing"
1213
"time"
@@ -19,6 +20,7 @@ import (
1920
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats"
2021
"github.com/cockroachdb/cockroach/pkg/testutils"
2122
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
23+
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
2224
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
2325
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
2426
"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
@@ -33,15 +35,39 @@ import (
3335
func TestDrain(t *testing.T) {
3436
defer leaktest.AfterTest(t)()
3537
defer log.Scope(t).Close(t)
36-
doTestDrain(t)
38+
testutils.RunTrueAndFalse(t, "secondaryTenants", func(t *testing.T, secondaryTenants bool) {
39+
doTestDrain(t, secondaryTenants)
40+
})
3741
}
3842

3943
// doTestDrain runs the drain test.
40-
func doTestDrain(tt *testing.T) {
44+
func doTestDrain(tt *testing.T, secondaryTenants bool) {
45+
if secondaryTenants {
46+
// Draining the tenant server takes a long time under stress. See #155229.
47+
skip.UnderRace(tt)
48+
}
4149
var drainSleepCallCount = 0
4250
t := newTestDrainContext(tt, &drainSleepCallCount)
4351
defer t.Close()
4452

53+
var tenantConn *gosql.DB
54+
if secondaryTenants {
55+
_, err := t.tc.ServerConn(0).Exec("CREATE TENANT hello")
56+
require.NoError(tt, err)
57+
_, err = t.tc.ServerConn(0).Exec("ALTER TENANT hello START SERVICE SHARED")
58+
require.NoError(tt, err)
59+
60+
// Wait for the tenant to be ready by establishing a test connection.
61+
testutils.SucceedsSoon(tt, func() error {
62+
tenantConn, err = t.tc.Server(0).SystemLayer().SQLConnE(
63+
serverutils.DBName("cluster:hello/defaultdb"))
64+
if err != nil {
65+
return err
66+
}
67+
return tenantConn.Ping()
68+
})
69+
}
70+
4571
// Issue a probe. We're not draining yet, so the probe should
4672
// reflect that.
4773
resp := t.sendProbe()
@@ -53,7 +79,12 @@ func doTestDrain(tt *testing.T) {
5379
resp = t.sendDrainNoShutdown()
5480
t.assertDraining(resp, true)
5581
t.assertRemaining(resp, true)
56-
t.assertEqual(1, drainSleepCallCount)
82+
if !secondaryTenants {
83+
// If we have secondary tenants, we might not have waited before draining
84+
// the clients at this point because we might have returned early if we are
85+
// still draining the serverController.
86+
t.assertEqual(1, drainSleepCallCount)
87+
}
5788

5889
// Issue another probe. This checks that the server is still running
5990
// (i.e. Shutdown: false was effective), the draining status is
@@ -63,12 +94,14 @@ func doTestDrain(tt *testing.T) {
6394
t.assertDraining(resp, true)
6495
// probe-only has no remaining.
6596
t.assertRemaining(resp, false)
66-
t.assertEqual(1, drainSleepCallCount)
67-
97+
if !secondaryTenants {
98+
t.assertEqual(1, drainSleepCallCount)
99+
}
68100
// Repeat drain commands until we verify that there are zero remaining leases
69101
// (i.e. complete). Also validate that the server did not sleep again.
70102
testutils.SucceedsSoon(t, func() error {
71103
resp = t.sendDrainNoShutdown()
104+
t.Logf("drain response: %+v", resp)
72105
if !resp.IsDraining {
73106
return errors.Newf("expected draining")
74107
}
@@ -78,6 +111,7 @@ func doTestDrain(tt *testing.T) {
78111
}
79112
return nil
80113
})
114+
81115
t.assertEqual(1, drainSleepCallCount)
82116

83117
// Now issue a drain request without drain but with shutdown.
@@ -184,6 +218,7 @@ func newTestDrainContext(t *testing.T, drainSleepCallCount *int) *testDrainConte
184218
// We need to start the cluster insecure in order to not
185219
// care about TLS settings for the RPC client connection.
186220
ServerArgs: base.TestServerArgs{
221+
DefaultTestTenant: base.TestControlsTenantsExplicitly,
187222
Knobs: base.TestingKnobs{
188223
Server: &server.TestingKnobs{
189224
DrainSleepFn: func(time.Duration) {

pkg/server/server_controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,10 @@ func (s *serverController) SetDraining() {
168168
}
169169
}
170170

171+
func (s *serverController) IsDraining() bool {
172+
return s.draining.Load()
173+
}
174+
171175
// start monitors changes to the service mode and updates
172176
// the running servers accordingly.
173177
func (c *serverController) start(ctx context.Context, ie isql.Executor) error {

0 commit comments

Comments
 (0)