|
| 1 | +// Copyright 2023 PingCAP, Inc. |
| 2 | +// |
| 3 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +// you may not use this file except in compliance with the License. |
| 5 | +// You may obtain a copy of the License at |
| 6 | +// |
| 7 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +// |
| 9 | +// Unless required by applicable law or agreed to in writing, software |
| 10 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +// See the License for the specific language governing permissions and |
| 13 | +// limitations under the License. |
| 14 | + |
| 15 | +package cardinality |
| 16 | + |
| 17 | +import ( |
| 18 | + "math" |
| 19 | + |
| 20 | + "github.com/pingcap/tidb/pkg/expression" |
| 21 | + "github.com/pingcap/tidb/pkg/kv" |
| 22 | + "github.com/pingcap/tidb/pkg/parser/mysql" |
| 23 | + "github.com/pingcap/tidb/pkg/planner/planctx" |
| 24 | + "github.com/pingcap/tidb/pkg/statistics" |
| 25 | + "github.com/pingcap/tidb/pkg/tablecodec" |
| 26 | + "github.com/pingcap/tidb/pkg/util/chunk" |
| 27 | +) |
| 28 | + |
// pseudoColSize is the per-column size assumed by GetAvgRowSize when the
// statistics are pseudo or empty, or when a column has no usable histogram.
const pseudoColSize = 8.0
| 30 | + |
| 31 | +// GetIndexAvgRowSize computes average row size for a index scan. |
| 32 | +func GetIndexAvgRowSize(ctx planctx.PlanContext, coll *statistics.HistColl, cols []*expression.Column, isUnique bool) (size float64) { |
| 33 | + size = GetAvgRowSize(ctx, coll, cols, true, true) |
| 34 | + // tablePrefix(1) + tableID(8) + indexPrefix(2) + indexID(8) |
| 35 | + // Because the cols for index scan always contain the handle, so we don't add the rowID here. |
| 36 | + size += 19 |
| 37 | + if !isUnique { |
| 38 | + // add the len("_") |
| 39 | + size++ |
| 40 | + } |
| 41 | + return |
| 42 | +} |
| 43 | + |
| 44 | +// GetTableAvgRowSize computes average row size for a table scan, exclude the index key-value pairs. |
| 45 | +func GetTableAvgRowSize(ctx planctx.PlanContext, coll *statistics.HistColl, cols []*expression.Column, storeType kv.StoreType, handleInCols bool) (size float64) { |
| 46 | + size = GetAvgRowSize(ctx, coll, cols, false, true) |
| 47 | + switch storeType { |
| 48 | + case kv.TiKV: |
| 49 | + size += tablecodec.RecordRowKeyLen |
| 50 | + // The `cols` for TiKV always contain the row_id, so prefix row size subtract its length. |
| 51 | + size -= 8 |
| 52 | + case kv.TiFlash: |
| 53 | + if !handleInCols { |
| 54 | + size += 8 /* row_id length */ |
| 55 | + } |
| 56 | + } |
| 57 | + // Avoid errors related to size less than zero |
| 58 | + size = max(0, size) |
| 59 | + return |
| 60 | +} |
| 61 | + |
| 62 | +// GetAvgRowSize computes average row size for given columns. |
| 63 | +func GetAvgRowSize(ctx planctx.PlanContext, coll *statistics.HistColl, cols []*expression.Column, isEncodedKey bool, isForScan bool) (size float64) { |
| 64 | + sessionVars := ctx.GetSessionVars() |
| 65 | + if coll.Pseudo || coll.ColNum() == 0 || coll.RealtimeCount == 0 { |
| 66 | + size = pseudoColSize * float64(len(cols)) |
| 67 | + } else { |
| 68 | + for _, col := range cols { |
| 69 | + colHist := coll.GetCol(col.UniqueID) |
| 70 | + // Normally this would not happen, it is for compatibility with old version stats which |
| 71 | + // does not include TotColSize. |
| 72 | + if colHist == nil || (!colHist.IsHandle && colHist.TotColSize == 0 && (colHist.NullCount != coll.RealtimeCount)) { |
| 73 | + size += pseudoColSize |
| 74 | + continue |
| 75 | + } |
| 76 | + // We differentiate if the column is encoded as key or value, because the resulted size |
| 77 | + // is different. |
| 78 | + if sessionVars.EnableChunkRPC && !isForScan { |
| 79 | + size += AvgColSizeChunkFormat(colHist, coll.RealtimeCount) |
| 80 | + } else { |
| 81 | + size += AvgColSize(colHist, coll.RealtimeCount, isEncodedKey) |
| 82 | + } |
| 83 | + } |
| 84 | + } |
| 85 | + // Avoid errors related to size less than zero |
| 86 | + size = max(0, size) |
| 87 | + if sessionVars.EnableChunkRPC && !isForScan { |
| 88 | + // Add 1/8 byte for each column's nullBitMap byte. |
| 89 | + return size + float64(len(cols))/8 |
| 90 | + } |
| 91 | + // Add 1 byte for each column's flag byte. See `encode` for details. |
| 92 | + return size + float64(len(cols)) |
| 93 | +} |
| 94 | + |
| 95 | +// GetAvgRowSizeDataInDiskByRows computes average row size for given columns. |
| 96 | +func GetAvgRowSizeDataInDiskByRows(coll *statistics.HistColl, cols []*expression.Column) (size float64) { |
| 97 | + if coll.Pseudo || coll.ColNum() == 0 || coll.RealtimeCount == 0 { |
| 98 | + for _, col := range cols { |
| 99 | + size += float64(chunk.EstimateTypeWidth(col.GetStaticType())) |
| 100 | + } |
| 101 | + } else { |
| 102 | + for _, col := range cols { |
| 103 | + colHist := coll.GetCol(col.UniqueID) |
| 104 | + // Normally this would not happen, it is for compatibility with old version stats which |
| 105 | + // does not include TotColSize. |
| 106 | + if colHist == nil || (!colHist.IsHandle && colHist.TotColSize == 0 && (colHist.NullCount != coll.RealtimeCount)) { |
| 107 | + size += float64(chunk.EstimateTypeWidth(col.GetStaticType())) |
| 108 | + continue |
| 109 | + } |
| 110 | + size += AvgColSizeDataInDiskByRows(colHist, coll.RealtimeCount) |
| 111 | + } |
| 112 | + } |
| 113 | + // Add 8 byte for each column's size record. See `DataInDiskByRows` for details. |
| 114 | + return max(0, size+float64(8*len(cols))) |
| 115 | +} |
| 116 | + |
| 117 | +// AvgColSize is the average column size of the histogram. These sizes are derived from function `encode` |
| 118 | +// and `Datum::ConvertTo`, so we need to update them if those 2 functions are changed. |
| 119 | +func AvgColSize(c *statistics.Column, count int64, isKey bool) float64 { |
| 120 | + if count == 0 { |
| 121 | + return 0 |
| 122 | + } |
| 123 | + // Note that, if the handle column is encoded as value, instead of key, i.e, |
| 124 | + // when the handle column is in a unique index, the real column size may be |
| 125 | + // smaller than 8 because it is encoded using `EncodeVarint`. Since we don't |
| 126 | + // know the exact value size now, use 8 as approximation. |
| 127 | + if c.IsHandle { |
| 128 | + return 8 |
| 129 | + } |
| 130 | + histCount := c.TotalRowCount() |
| 131 | + notNullRatio := 1.0 |
| 132 | + if histCount > 0 { |
| 133 | + notNullRatio = max(0, 1.0-float64(c.NullCount)/histCount) |
| 134 | + } |
| 135 | + switch c.Histogram.Tp.GetType() { |
| 136 | + case mysql.TypeFloat, mysql.TypeDouble, mysql.TypeDuration, mysql.TypeDate, mysql.TypeDatetime, mysql.TypeTimestamp: |
| 137 | + return 8 * notNullRatio |
| 138 | + case mysql.TypeTiny, mysql.TypeShort, mysql.TypeInt24, mysql.TypeLong, mysql.TypeLonglong, mysql.TypeYear, mysql.TypeEnum, mysql.TypeBit, mysql.TypeSet: |
| 139 | + if isKey { |
| 140 | + return 8 * notNullRatio |
| 141 | + } |
| 142 | + } |
| 143 | + // Keep two decimal place. |
| 144 | + return max(0, math.Round(float64(c.TotColSize)/float64(count)*100)/100) |
| 145 | +} |
| 146 | + |
| 147 | +// AvgColSizeChunkFormat is the average column size of the histogram. These sizes are derived from function `Encode` |
| 148 | +// and `DecodeToChunk`, so we need to update them if those 2 functions are changed. |
| 149 | +func AvgColSizeChunkFormat(c *statistics.Column, count int64) float64 { |
| 150 | + if count == 0 { |
| 151 | + return 0 |
| 152 | + } |
| 153 | + fixedLen := chunk.GetFixedLen(c.Histogram.Tp) |
| 154 | + if fixedLen >= 0 { |
| 155 | + return float64(fixedLen) |
| 156 | + } |
| 157 | + // Keep two decimal place. |
| 158 | + // Add 8 bytes for unfixed-len type's offsets. |
| 159 | + // Minus Log2(avgSize) for unfixed-len type LEN. |
| 160 | + avgSize := float64(c.TotColSize) / float64(count) |
| 161 | + if avgSize < 1 { |
| 162 | + return max(0, math.Round(avgSize*100)/100) + 8 |
| 163 | + } |
| 164 | + return max(0, math.Round((avgSize-math.Log2(avgSize))*100)/100) + 8 |
| 165 | +} |
| 166 | + |
| 167 | +// AvgColSizeDataInDiskByRows is the average column size of the histogram. These sizes are derived |
| 168 | +// from `chunk.DataInDiskByRows` so we need to update them if those 2 functions are changed. |
| 169 | +func AvgColSizeDataInDiskByRows(c *statistics.Column, count int64) float64 { |
| 170 | + if count == 0 { |
| 171 | + return 0 |
| 172 | + } |
| 173 | + histCount := c.TotalRowCount() |
| 174 | + notNullRatio := 1.0 |
| 175 | + if histCount > 0 { |
| 176 | + notNullRatio = 1.0 - float64(c.NullCount)/histCount |
| 177 | + } |
| 178 | + size := chunk.GetFixedLen(c.Histogram.Tp) |
| 179 | + if size >= 0 { |
| 180 | + return float64(size) * notNullRatio |
| 181 | + } |
| 182 | + // Keep two decimal place. |
| 183 | + // Minus Log2(avgSize) for unfixed-len type LEN. |
| 184 | + avgSize := float64(c.TotColSize) / float64(count) |
| 185 | + if avgSize < 1 { |
| 186 | + return max(0, math.Round((avgSize)*100)/100) |
| 187 | + } |
| 188 | + return math.Round((avgSize-math.Log2(avgSize))*100) / 100 |
| 189 | +} |
0 commit comments