ti-chi-bot
diff --git a/‎cmd/explaintest/r/explain_complex.result
+20 b/‎cmd/explaintest/r/explain_complex.result
+20
diff --git a/‎pkg/planner/cardinality/row_size.go
+189 b/‎pkg/planner/cardinality/row_size.go
+189
diff --git a/‎pkg/planner/core/casetest/planstats/testdata/plan_stats_suite_out.json
+161 b/‎pkg/planner/core/casetest/planstats/testdata/plan_stats_suite_out.json
+161
@@ -246,6 +246,7 @@ UNIQUE KEY org_employee_position_pk (hotel_id,user_id,position_id)
 set tidb_cost_model_version=2;
 explain format = 'brief' SELECT d.id, d.ctx, d.name, d.left_value, d.right_value, d.depth, d.leader_id, d.status, d.created_on, d.updated_on FROM org_department AS d LEFT JOIN org_position AS p ON p.department_id = d.id AND p.status = 1000 LEFT JOIN org_employee_position AS ep ON ep.position_id = p.id AND ep.status = 1000 WHERE (d.ctx = 1 AND (ep.user_id = 62 OR d.id = 20 OR d.id = 20) AND d.status = 1000) GROUP BY d.id ORDER BY d.left_value;
 id	estRows	task	access object	operator info
+<<<<<<< HEAD:cmd/explaintest/r/explain_complex.result
 Sort	1.00	root		test.org_department.left_value
 └─HashAgg	1.00	root		group by:test.org_department.id, funcs:firstrow(test.org_department.id)->test.org_department.id, funcs:firstrow(test.org_department.ctx)->test.org_department.ctx, funcs:firstrow(test.org_department.name)->test.org_department.name, funcs:firstrow(test.org_department.left_value)->test.org_department.left_value, funcs:firstrow(test.org_department.right_value)->test.org_department.right_value, funcs:firstrow(test.org_department.depth)->test.org_department.depth, funcs:firstrow(test.org_department.leader_id)->test.org_department.leader_id, funcs:firstrow(test.org_department.status)->test.org_department.status, funcs:firstrow(test.org_department.created_on)->test.org_department.created_on, funcs:firstrow(test.org_department.updated_on)->test.org_department.updated_on
   └─Selection	0.01	root		or(eq(test.org_employee_position.user_id, 62), or(eq(test.org_department.id, 20), eq(test.org_department.id, 20)))
@@ -263,6 +264,25 @@ Sort	1.00	root		test.org_department.left_value
       └─TableReader(Probe)	9.99	root		data:Selection
         └─Selection	9.99	cop[tikv]		eq(test.org_employee_position.status, 1000), not(isnull(test.org_employee_position.position_id))
           └─TableFullScan	10000.00	cop[tikv]	table:ep	keep order:false, stats:pseudo
+=======
+Sort	1.00	root		explain_complex.org_department.left_value
+└─HashAgg	1.00	root		group by:explain_complex.org_department.id, funcs:firstrow(explain_complex.org_department.id)->explain_complex.org_department.id, funcs:firstrow(explain_complex.org_department.ctx)->explain_complex.org_department.ctx, funcs:firstrow(explain_complex.org_department.name)->explain_complex.org_department.name, funcs:firstrow(explain_complex.org_department.left_value)->explain_complex.org_department.left_value, funcs:firstrow(explain_complex.org_department.right_value)->explain_complex.org_department.right_value, funcs:firstrow(explain_complex.org_department.depth)->explain_complex.org_department.depth, funcs:firstrow(explain_complex.org_department.leader_id)->explain_complex.org_department.leader_id, funcs:firstrow(explain_complex.org_department.status)->explain_complex.org_department.status, funcs:firstrow(explain_complex.org_department.created_on)->explain_complex.org_department.created_on, funcs:firstrow(explain_complex.org_department.updated_on)->explain_complex.org_department.updated_on
+  └─Selection	0.01	root		or(eq(explain_complex.org_employee_position.user_id, 62), or(eq(explain_complex.org_department.id, 20), eq(explain_complex.org_department.id, 20)))
+    └─HashJoin	0.02	root		left outer join, equal:[eq(explain_complex.org_position.id, explain_complex.org_employee_position.position_id)]
+      ├─TableReader(Build)	9.99	root		data:Selection
+      │ └─Selection	9.99	cop[tikv]		eq(explain_complex.org_employee_position.status, 1000), not(isnull(explain_complex.org_employee_position.position_id))
+      │   └─TableFullScan	10000.00	cop[tikv]	table:ep	keep order:false, stats:pseudo
+      └─IndexJoin(Probe)	0.01	root		left outer join, inner:IndexLookUp, outer key:explain_complex.org_department.id, inner key:explain_complex.org_position.department_id, equal cond:eq(explain_complex.org_department.id, explain_complex.org_position.department_id)
+        ├─IndexLookUp(Build)	0.01	root		
+        │ ├─IndexRangeScan(Build)	10.00	cop[tikv]	table:d, index:org_department_ctx_index(ctx)	range:[1,1], keep order:false, stats:pseudo
+        │ └─Selection(Probe)	0.01	cop[tikv]		eq(explain_complex.org_department.status, 1000)
+        │   └─TableRowIDScan	10.00	cop[tikv]	table:d	keep order:false, stats:pseudo
+        └─IndexLookUp(Probe)	0.01	root		
+          ├─Selection(Build)	12.50	cop[tikv]		not(isnull(explain_complex.org_position.department_id))
+          │ └─IndexRangeScan	12.51	cop[tikv]	table:p, index:org_position_department_id_index(department_id)	range: decided by [eq(explain_complex.org_position.department_id, explain_complex.org_department.id)], keep order:false, stats:pseudo
+          └─Selection(Probe)	0.01	cop[tikv]		eq(explain_complex.org_position.status, 1000)
+            └─TableRowIDScan	12.50	cop[tikv]	table:p	keep order:false, stats:pseudo
+>>>>>>> 8fde2d6fa2b (planner: set min for high risk plan steps (#56631)):tests/integrationtest/r/explain_complex.result
 set tidb_cost_model_version=1;
 create table test.Tab_A (id int primary key,bid int,cid int,name varchar(20),type varchar(20),num int,amt decimal(11,2));
 create table test.Tab_B (id int primary key,name varchar(20));
 
@@ -0,0 +1,189 @@
+// Copyright 2023 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cardinality
+
+import (
+	"math"
+
+	"github.com/pingcap/tidb/pkg/expression"
+	"github.com/pingcap/tidb/pkg/kv"
+	"github.com/pingcap/tidb/pkg/parser/mysql"
+	"github.com/pingcap/tidb/pkg/planner/planctx"
+	"github.com/pingcap/tidb/pkg/statistics"
+	"github.com/pingcap/tidb/pkg/tablecodec"
+	"github.com/pingcap/tidb/pkg/util/chunk"
+)
+
+const pseudoColSize = 8.0 // fallback per-column size (bytes) used when usable column statistics are missing
+
+// GetIndexAvgRowSize estimates the average row size for an index scan: the key-encoded
+// size of cols from GetAvgRowSize plus the fixed index key prefix.
+func GetIndexAvgRowSize(ctx planctx.PlanContext, coll *statistics.HistColl, cols []*expression.Column, isUnique bool) (size float64) {
+	// tablePrefix(1) + tableID(8) + indexPrefix(2) + indexID(8) = 19.
+	// The rowID is not added because the cols of an index scan always contain the handle.
+	size = GetAvgRowSize(ctx, coll, cols, true, true) + 19
+	if isUnique {
+		return size
+	}
+	// A non-unique index adds one more byte: the len("_").
+	return size + 1
+}
+
+// GetTableAvgRowSize estimates the average row size for a table scan, excluding the
+// index key-value pairs. The column sizes come from GetAvgRowSize (value encoding)
+// and are then adjusted according to storeType.
+func GetTableAvgRowSize(ctx planctx.PlanContext, coll *statistics.HistColl, cols []*expression.Column, storeType kv.StoreType, handleInCols bool) (size float64) {
+	rowSize := GetAvgRowSize(ctx, coll, cols, false, true)
+	if storeType == kv.TiKV {
+		// Add the record key length, then subtract the 8-byte row_id, because the
+		// `cols` for TiKV always contain the row_id.
+		rowSize = rowSize + tablecodec.RecordRowKeyLen - 8
+	} else if storeType == kv.TiFlash && !handleInCols {
+		// The handle is not among cols, so add the row_id length.
+		rowSize += 8
+	}
+	// Avoid errors related to size less than zero
+	size = max(0, rowSize)
+	return size
+}
+
+// GetAvgRowSize estimates the average size of a row made of the given columns.
+// Each column is sized with AvgColSize (isEncodedKey selects key or value encoding), or
+// with AvgColSizeChunkFormat when chunk RPC is enabled and isForScan is false. Without
+// usable statistics every column is assumed to take pseudoColSize.
+func GetAvgRowSize(ctx planctx.PlanContext, coll *statistics.HistColl, cols []*expression.Column, isEncodedKey bool, isForScan bool) (size float64) {
+	sessionVars := ctx.GetSessionVars()
+	if !coll.Pseudo && coll.ColNum() != 0 && coll.RealtimeCount != 0 {
+		for _, col := range cols {
+			hist := coll.GetCol(col.UniqueID)
+			switch {
+			case hist == nil || (!hist.IsHandle && hist.TotColSize == 0 && hist.NullCount != coll.RealtimeCount):
+				// Normally unreachable; kept for old version stats that lack TotColSize.
+				size += pseudoColSize
+			case sessionVars.EnableChunkRPC && !isForScan:
+				size += AvgColSizeChunkFormat(hist, coll.RealtimeCount)
+			default:
+				// The resulting size differs between key and value encoding.
+				size += AvgColSize(hist, coll.RealtimeCount, isEncodedKey)
+			}
+		}
+	} else {
+		size = pseudoColSize * float64(len(cols))
+	}
+	// Avoid errors related to size less than zero
+	size = max(0, size)
+	if !sessionVars.EnableChunkRPC || isForScan {
+		// Add 1 byte for each column's flag byte. See `encode` for details.
+		return size + float64(len(cols))
+	}
+	// Add 1/8 byte for each column's nullBitMap byte.
+	return size + float64(len(cols))/8
+}
+
+// GetAvgRowSizeDataInDiskByRows estimates the average row size for the given columns,
+// including the per-column size record of `DataInDiskByRows`. A column without usable
+// statistics falls back to the estimated width of its static type.
+func GetAvgRowSizeDataInDiskByRows(coll *statistics.HistColl, cols []*expression.Column) (size float64) {
+	noStats := coll.Pseudo || coll.ColNum() == 0 || coll.RealtimeCount == 0
+	for _, col := range cols {
+		if !noStats {
+			hist := coll.GetCol(col.UniqueID)
+			// Usable stats are normally always present; the fallback below is for
+			// compatibility with old version stats which do not include TotColSize.
+			if hist != nil && (hist.IsHandle || hist.TotColSize != 0 || hist.NullCount == coll.RealtimeCount) {
+				size += AvgColSizeDataInDiskByRows(hist, coll.RealtimeCount)
+				continue
+			}
+		}
+		size += float64(chunk.EstimateTypeWidth(col.GetStaticType()))
+	}
+	// Add 8 bytes for each column's size record. See `DataInDiskByRows` for details.
+	total := size + float64(8*len(cols))
+	return max(0, total)
+}
+
+// AvgColSize returns the average encoded size of a column. The sizes are derived from function
+// `encode` and `Datum::ConvertTo`, so update them if those 2 functions are changed.
+func AvgColSize(c *statistics.Column, count int64, isKey bool) float64 {
+	switch {
+	case count == 0:
+		return 0
+	case c.IsHandle:
+		// If the handle column is encoded as value instead of key, i.e. when it is in a
+		// unique index, the real size may be smaller than 8 because it is encoded using
+		// `EncodeVarint`. The exact value size is unknown here, so 8 is an approximation.
+		return 8
+	}
+	nonNullRatio := 1.0
+	if total := c.TotalRowCount(); total > 0 {
+		nonNullRatio = max(0, 1.0-float64(c.NullCount)/total)
+	}
+	fixedSize := 8 * nonNullRatio
+	switch c.Histogram.Tp.GetType() {
+	case mysql.TypeFloat, mysql.TypeDouble, mysql.TypeDuration, mysql.TypeDate, mysql.TypeDatetime, mysql.TypeTimestamp:
+		return fixedSize
+	case mysql.TypeTiny, mysql.TypeShort, mysql.TypeInt24, mysql.TypeLong, mysql.TypeLonglong, mysql.TypeYear, mysql.TypeEnum, mysql.TypeBit, mysql.TypeSet:
+		if isKey {
+			return fixedSize
+		}
+	}
+	// Keep two decimal places.
+	return max(0, math.Round(float64(c.TotColSize)/float64(count)*100)/100)
+}
+
+// AvgColSizeChunkFormat returns the average size of a column in chunk format. The sizes
+// are derived from function `Encode` and `DecodeToChunk`, so update them if those 2
+// functions are changed.
+func AvgColSizeChunkFormat(c *statistics.Column, count int64) float64 {
+	if count == 0 {
+		return 0
+	}
+	if width := chunk.GetFixedLen(c.Histogram.Tp); width >= 0 {
+		return float64(width)
+	}
+	// Unfixed-len type: start from the average payload size.
+	payload := float64(c.TotColSize) / float64(count)
+	if payload >= 1 {
+		// Minus Log2(avgSize) for unfixed-len type LEN.
+		payload -= math.Log2(payload)
+	}
+	// Keep two decimal places, then add 8 bytes for unfixed-len type's offsets.
+	return max(0, math.Round(payload*100)/100) + 8
+}
+
+// AvgColSizeDataInDiskByRows returns the average size of a column as stored by
+// `chunk.DataInDiskByRows`; update it if that function is changed.
+func AvgColSizeDataInDiskByRows(c *statistics.Column, count int64) float64 {
+	if count == 0 {
+		return 0
+	}
+	histCount := c.TotalRowCount()
+	notNullRatio := 1.0
+	if histCount > 0 {
+		// Clamp at zero like AvgColSize: NullCount above histCount must not give a negative size.
+		notNullRatio = max(0, 1.0-float64(c.NullCount)/histCount)
+	}
+	size := chunk.GetFixedLen(c.Histogram.Tp)
+	if size >= 0 {
+		return float64(size) * notNullRatio
+	}
+	// Keep two decimal places; minus Log2(avgSize) for unfixed-len type LEN.
+	avgSize := float64(c.TotColSize) / float64(count)
+	if avgSize < 1 {
+		return max(0, math.Round((avgSize)*100)/100)
+	}
+	return math.Round((avgSize-math.Log2(avgSize))*100) / 100
+}
@@ -0,0 +1,161 @@
+[
+  {
+    "Name": "TestCollectDependingVirtualCols",
+    "Cases": [
+      {
+        "TableName": "t",
+        "InputColNames": [
+          "a",
+          "b"
+        ],
+        "OutputColNames": []
+      },
+      {
+        "TableName": "t",
+        "InputColNames": [
+          "c"
+        ],
+        "OutputColNames": [
+          "_v$_ic_char_0",
+          "_v$_ic_signed_0",
+          "_v$_ic_unsigned_0"
+        ]
+      },
+      {
+        "TableName": "t",
+        "InputColNames": [
+          "b",
+          "c"
+        ],
+        "OutputColNames": [
+          "_v$_ic_char_0",
+          "_v$_ic_signed_0",
+          "_v$_ic_unsigned_0"
+        ]
+      },
+      {
+        "TableName": "t1",
+        "InputColNames": [
+          "a"
+        ],
+        "OutputColNames": [
+          "vab"
+        ]
+      },
+      {
+        "TableName": "t1",
+        "InputColNames": [
+          "b"
+        ],
+        "OutputColNames": [
+          "_v$_ib_0",
+          "vab",
+          "vvc"
+        ]
+      },
+      {
+        "TableName": "t1",
+        "InputColNames": [
+          "c"
+        ],
+        "OutputColNames": [
+          "_v$_icvab_0",
+          "vc"
+        ]
+      },
+      {
+        "TableName": "t1",
+        "InputColNames": [
+          "vab"
+        ],
+        "OutputColNames": [
+          "_v$_icvab_0",
+          "_v$_ivvcvab_0",
+          "vvabvvc"
+        ]
+      },
+      {
+        "TableName": "t1",
+        "InputColNames": [
+          "vab",
+          "c"
+        ],
+        "OutputColNames": [
+          "_v$_icvab_0",
+          "_v$_ivvcvab_0",
+          "vc",
+          "vvabvvc"
+        ]
+      },
+      {
+        "TableName": "t1",
+        "InputColNames": [
+          "vc",
+          "c",
+          "vvc"
+        ],
+        "OutputColNames": [
+          "_v$_icvab_0",
+          "_v$_ivvcvab_0",
+          "vvabvvc"
+        ]
+      }
+    ]
+  },
+  {
+    "Name": "TestPartialStatsInExplain",
+    "Cases": [
+      {
+        "Query": "explain format = brief select * from tp where b = 10",
+        "Result": [
+          "TableReader 0.01 root partition:all data:Selection",
+          "└─Selection 0.01 cop[tikv]  eq(test.tp.b, 10)",
+          "  └─TableFullScan 6.00 cop[tikv] table:tp keep order:false, stats:partial[b:allEvicted]"
+        ]
+      },
+      {
+        "Query": "explain format = brief select * from t join tp where tp.a = 10 and t.b = tp.c",
+        "Result": [
+          "Projection 1.00 root  test.t.a, test.t.b, test.t.c, test.tp.a, test.tp.b, test.tp.c",
+          "└─HashJoin 1.00 root  inner join, equal:[eq(test.tp.c, test.t.b)]",
+          "  ├─TableReader(Build) 1.00 root partition:p1 data:Selection",
+          "  │ └─Selection 1.00 cop[tikv]  eq(test.tp.a, 10), not(isnull(test.tp.c))",
+          "  │   └─TableFullScan 6.00 cop[tikv] table:tp keep order:false, stats:partial[c:allEvicted]",
+          "  └─TableReader(Probe) 3.00 root  data:Selection",
+          "    └─Selection 3.00 cop[tikv]  not(isnull(test.t.b))",
+          "      └─TableFullScan 3.00 cop[tikv] table:t keep order:false, stats:partial[idx:allEvicted, a:allEvicted, b:allEvicted]"
+        ]
+      },
+      {
+        "Query": "explain format = brief select * from t join tp partition (p0) join t2 where t.a < 10 and t.b = tp.c and t2.a > 10 and t2.a = tp.c",
+        "Result": [
+          "HashJoin 0.33 root  inner join, equal:[eq(test.tp.c, test.t2.a)]",
+          "├─TableReader(Build) 1.00 root  data:TableRangeScan",
+          "│ └─TableRangeScan 1.00 cop[tikv] table:t2 range:(10,+inf], keep order:false, stats:partial[a:allEvicted]",
+          "└─IndexJoin(Probe) 0.33 root  inner join, inner:IndexLookUp, outer key:test.t.b, inner key:test.tp.c, equal cond:eq(test.t.b, test.tp.c)",
+          "  ├─TableReader(Build) 0.33 root  data:Selection",
+          "  │ └─Selection 0.33 cop[tikv]  gt(test.t.b, 10), not(isnull(test.t.b))",
+          "  │   └─TableRangeScan 1.00 cop[tikv] table:t range:[-inf,10), keep order:false, stats:partial[idx:allEvicted, a:allEvicted, b:allEvicted]",
+          "  └─IndexLookUp(Probe) 0.33 root partition:p0 ",
+          "    ├─Selection(Build) 0.33 cop[tikv]  gt(test.tp.c, 10), not(isnull(test.tp.c))",
+          "    │ └─IndexRangeScan 0.50 cop[tikv] table:tp, index:ic(c) range: decided by [eq(test.tp.c, test.t.b)], keep order:false, stats:partial[c:allEvicted]",
+          "    └─TableRowIDScan(Probe) 0.33 cop[tikv] table:tp keep order:false, stats:partial[c:allEvicted]"
+        ]
+      }
+    ]
+  },
+  {
+    "Name": "TestPlanStatsLoadForCTE",
+    "Cases": [
+      {
+        "Query": "explain format= brief with cte(x, y) as (select d + 1, b from t where c > 1) select * from cte where x < 3",
+        "Result": [
+          "Projection 1.60 root  plus(test.t.d, 1)->Column#12, test.t.b",
+          "└─TableReader 1.60 root  data:Selection",
+          "  └─Selection 1.60 cop[tikv]  gt(test.t.c, 1), lt(plus(test.t.d, 1), 3)",
+          "    └─TableFullScan 3.00 cop[tikv] table:t keep order:false"
+        ]
+      }
+    ]
+  }
+]