From 5fbd3c756287a1a65b0c712e8743b27b29e8752c Mon Sep 17 00:00:00 2001 From: Rafi Shamim Date: Mon, 6 Oct 2025 23:19:04 -0400 Subject: [PATCH 1/2] pgwire: lower the max repeated error count before closing a connection In 39067de we added behavior to give up and close a network connection if a threshold of repeated errors was reached. It retried on errors since some network errors could be transient. It was retrying tens of thousands of times, which is excessive. We lower this to 256 now. This is motivated by a few tests that identified the error handling logic in this tight loop as being quite expensive. Retrying fewer times means that we'll reduce CPU usage during failure scenarios. Release note: None --- pkg/sql/pgwire/server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/sql/pgwire/server.go b/pkg/sql/pgwire/server.go index 99f5613e72cf..d4894fce5b00 100644 --- a/pkg/sql/pgwire/server.go +++ b/pkg/sql/pgwire/server.go @@ -1075,7 +1075,7 @@ func (s *Server) newConn( // maxRepeatedErrorCount is the number of times an error can be received // while reading from the network connection before the server decides to give // up and abort the connection. -const maxRepeatedErrorCount = 1 << 15 +const maxRepeatedErrorCount = 1 << 8 // serveImpl continuously reads from the network connection and pushes execution // instructions into a sql.StmtBuf, from where they'll be processed by a command From 5b51f7485e1ddadae0e465d991335232b5ae9cb6 Mon Sep 17 00:00:00 2001 From: Rafi Shamim Date: Fri, 17 Oct 2025 17:24:35 -0400 Subject: [PATCH 2/2] pgwire: make max repeated error count configurable via cluster setting Previously, the maximum number of repeated network read errors before aborting a connection was a hardcoded constant set to 256 (1 << 8). This change makes the value configurable via a non-public cluster setting `sql.pgwire.max_repeated_error_count`. 
This allows operators to tune the threshold for aborting connections experiencing repeated network errors, and allows us to backport this change along with 5f562ad3d57a657fc1c01fb2b653020c9a3e6d73. Epic: None Release note: None --- pkg/sql/pgwire/conn_test.go | 9 ++++++--- pkg/sql/pgwire/server.go | 16 ++++++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/pkg/sql/pgwire/conn_test.go b/pkg/sql/pgwire/conn_test.go index b499bbf6f541..38c4b9613bb2 100644 --- a/pkg/sql/pgwire/conn_test.go +++ b/pkg/sql/pgwire/conn_test.go @@ -1449,8 +1449,11 @@ func TestConnServerAbortsOnRepeatedErrors(t *testing.T) { conn, err := db.Conn(ctx) require.NoError(t, err) + // Get the current value of the cluster setting. + maxErrors := int(maxRepeatedErrorCount.Get(&srv.ClusterSettings().SV)) + atomic.StoreUint32(&shouldError, 1) - for i := 0; i < maxRepeatedErrorCount+100; i++ { + for i := 0; i < maxErrors+100; i++ { var s int err := conn.QueryRowContext(ctx, "SELECT 1").Scan(&s) if err != nil { @@ -1459,9 +1462,9 @@ func TestConnServerAbortsOnRepeatedErrors(t *testing.T) { } if errors.Is(err, driver.ErrBadConn) { // The server closed the connection, which is what we want! 
- require.GreaterOrEqualf(t, i, maxRepeatedErrorCount, + require.GreaterOrEqualf(t, i, maxErrors, "the server should have aborted after seeing %d errors", - maxRepeatedErrorCount, + maxErrors, ) return } diff --git a/pkg/sql/pgwire/server.go b/pkg/sql/pgwire/server.go index d4894fce5b00..4af01e9d0c04 100644 --- a/pkg/sql/pgwire/server.go +++ b/pkg/sql/pgwire/server.go @@ -94,6 +94,15 @@ var logVerboseSessionAuth = settings.RegisterBoolSetting( false, settings.WithPublic) +var maxRepeatedErrorCount = settings.RegisterIntSetting( + settings.ApplicationLevel, + "sql.pgwire.max_repeated_error_count", + "the maximum number of times an error can be received while reading from a "+ + "network connection before the server aborts the connection", + 1<<15, // 32768 + settings.PositiveInt, +) + const ( // ErrSSLRequired is returned when a client attempts to connect to a // secure server in cleartext. @@ -1072,11 +1081,6 @@ func (s *Server) newConn( return c } -// maxRepeatedErrorCount is the number of times an error can be received -// while reading from the network connection before the server decides to give -// up and abort the connection. -const maxRepeatedErrorCount = 1 << 8 - // serveImpl continuously reads from the network connection and pushes execution // instructions into a sql.StmtBuf, from where they'll be processed by a command // "processor" goroutine (a connExecutor). @@ -1448,7 +1452,7 @@ func (s *Server) serveImpl( // 3. we reached an arbitrary threshold of repeated errors. if netutil.IsClosedConnection(err) || errors.Is(err, context.Canceled) || - repeatedErrorCount > maxRepeatedErrorCount { + repeatedErrorCount > int(maxRepeatedErrorCount.Get(&s.execCfg.Settings.SV)) { break } } else {