package stats

import "github.com/cockroachdb/cockroach/pkg/sql/stats"

Package Files

automatic_stats.go delete_stats.go histogram.go histogram.pb.go json.go new_stat.go row_sampling.go stats_cache.go table_statistic.pb.go

Constants

const AutoStatsClusterSettingName = "sql.stats.automatic_collection.enabled"

AutoStatsClusterSettingName is the name of the automatic stats collection cluster setting.

const (
    // AutoStatsName is the name to use for statistics created automatically.
    // The name is chosen to be something that users are unlikely to choose when
    // running CREATE STATISTICS manually.
    AutoStatsName = "__auto__"
)

Constants for automatic statistics collection. TODO(rytaft): Should these constants be configurable?

Variables

var (
    ErrInvalidLengthHistogram = fmt.Errorf("proto: negative length found during unmarshaling")
    ErrIntOverflowHistogram   = fmt.Errorf("proto: integer overflow")
)
var (
    ErrInvalidLengthTableStatistic = fmt.Errorf("proto: negative length found during unmarshaling")
    ErrIntOverflowTableStatistic   = fmt.Errorf("proto: integer overflow")
)
var AutomaticStatisticsClusterMode = settings.RegisterBoolSetting(
    AutoStatsClusterSettingName,
    "automatic statistics collection mode",
    true,
)

AutomaticStatisticsClusterMode controls the cluster setting for enabling automatic table statistics collection.
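
For illustration, a minimal sketch of checking this setting before doing any automatic stats work; it assumes the caller holds a *cluster.Settings handle (st) and uses the standard settings.BoolSetting accessor:

    // autoStatsEnabled reports whether automatic stats collection is
    // currently enabled for this cluster.
    func autoStatsEnabled(st *cluster.Settings) bool {
        return AutomaticStatisticsClusterMode.Get(&st.SV)
    }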

var AutomaticStatisticsFractionStaleRows = settings.RegisterNonNegativeFloatSetting(
    "sql.stats.automatic_collection.fraction_stale_rows",
    "target fraction of stale rows per table that will trigger a statistics refresh",
    0.2,
)

AutomaticStatisticsFractionStaleRows controls the cluster setting for the target fraction of rows in a table that should be stale before statistics on that table are refreshed, in addition to the minimum row count controlled by AutomaticStatisticsMinStaleRows.

var AutomaticStatisticsMaxIdleTime = settings.RegisterValidatedFloatSetting(
    "sql.stats.automatic_collection.max_fraction_idle",
    "maximum fraction of time that automatic statistics sampler processors are idle",
    0.9,
    func(val float64) error {
        if val < 0 || val >= 1 {
            return pgerror.Newf(pgcode.InvalidParameterValue,
                "sql.stats.automatic_collection.max_fraction_idle must be >= 0 and < 1 but found: %v", val)
        }
        return nil
    },
)

AutomaticStatisticsMaxIdleTime controls the maximum fraction of time that the sampler processors will be idle when scanning large tables for automatic statistics (in high-load scenarios). This value can be tuned to trade off the runtime of automatic stats jobs against their performance impact on foreground traffic.

var AutomaticStatisticsMinStaleRows = settings.RegisterNonNegativeIntSetting(
    "sql.stats.automatic_collection.min_stale_rows",
    "target minimum number of stale rows per table that will trigger a statistics refresh",
    500,
)

AutomaticStatisticsMinStaleRows controls the cluster setting for the target number of rows that should be updated before a table is refreshed, in addition to the fraction AutomaticStatisticsFractionStaleRows.

var ConcurrentCreateStatsError error = concurrentCreateStatisticsError{}

ConcurrentCreateStatsError is reported when two CREATE STATISTICS jobs are issued concurrently. This is a sentinel error.

var DefaultAsOfTime = 30 * time.Second

DefaultAsOfTime is a duration which is used to define the AS OF time for automatic runs of CREATE STATISTICS. It is mutable for testing. NB: Updates to this value after MakeRefresher has been called will not have any effect.

var DefaultRefreshInterval = time.Minute

DefaultRefreshInterval is the frequency at which the Refresher will check if the stats for each table should be refreshed. It is mutable for testing. NB: Updates to this value after Refresher.Start has been called will not have any effect.

var HistogramClusterMode = settings.RegisterBoolSetting(
    "sql.stats.histogram_collection.enabled",
    "histogram collection mode",
    true,
)

HistogramClusterMode controls the cluster setting for enabling histogram collection.

func DeleteOldStatsForColumns

func DeleteOldStatsForColumns(
    ctx context.Context,
    executor sqlutil.InternalExecutor,
    txn *client.Txn,
    tableID sqlbase.ID,
    columnIDs []sqlbase.ColumnID,
) error

DeleteOldStatsForColumns deletes old statistics from the system.table_statistics table. For the given tableID and columnIDs, DeleteOldStatsForColumns keeps the most recent keepCount automatic statistics (keepCount is an unexported constant) and deletes all the others.

func GossipTableStatAdded

func GossipTableStatAdded(g *gossip.Gossip, tableID sqlbase.ID) error

GossipTableStatAdded causes the statistic caches for this table to be invalidated.

func InsertNewStat

func InsertNewStat(
    ctx context.Context,
    executor sqlutil.InternalExecutor,
    txn *client.Txn,
    tableID sqlbase.ID,
    name string,
    columnIDs []sqlbase.ColumnID,
    rowCount, distinctCount, nullCount int64,
    h *HistogramData,
) error

InsertNewStat inserts a new statistic in the system table. The caller is responsible for calling GossipTableStatAdded to notify the stat caches.
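
A hedged usage sketch: the helper below (writeStat, a hypothetical name) writes one statistic and then signals the caches. It assumes the caller already holds an InternalExecutor, a transaction, and a gossip handle; in practice GossipTableStatAdded should be called once the transaction has committed.

    func writeStat(
        ctx context.Context,
        ex sqlutil.InternalExecutor,
        txn *client.Txn,
        g *gossip.Gossip,
        tableID sqlbase.ID,
        cols []sqlbase.ColumnID,
        rowCount, distinctCount, nullCount int64,
    ) error {
        // "my_stat" is an arbitrary user-chosen name; nil means no histogram.
        if err := InsertNewStat(
            ctx, ex, txn, tableID, "my_stat", cols,
            rowCount, distinctCount, nullCount, nil,
        ); err != nil {
            return err
        }
        // Invalidate the stat caches on all nodes (ideally after commit).
        return GossipTableStatAdded(g, tableID)
    }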

func InsertNewStats

func InsertNewStats(
    ctx context.Context,
    executor sqlutil.InternalExecutor,
    txn *client.Txn,
    tableStats []*TableStatistic,
) error

InsertNewStats inserts a slice of statistics at the current time into the system table.

type HistogramData

type HistogramData struct {
    // Value type for the column.
    ColumnType github_com_cockroachdb_cockroach_pkg_sql_types.T `protobuf:"bytes,2,opt,name=column_type,json=columnType,proto3,customtype=github.com/cockroachdb/cockroach/pkg/sql/types.T" json:"column_type"`
    // Histogram buckets. Note that NULL values are excluded from the
    // histogram.
    Buckets []HistogramData_Bucket `protobuf:"bytes,1,rep,name=buckets,proto3" json:"buckets"`
}

HistogramData encodes the data for a histogram, which captures the distribution of values on a specific column.

func EquiDepthHistogram

func EquiDepthHistogram(
    evalCtx *tree.EvalContext, samples tree.Datums, numRows, distinctCount int64, maxBuckets int,
) (HistogramData, error)

EquiDepthHistogram creates a histogram where each bucket contains roughly the same number of samples (though it can vary when a boundary value has high frequency).

numRows is the total number of rows from which values were sampled.

In addition to building the histogram buckets, EquiDepthHistogram also estimates the number of distinct values in each bucket. It distributes the known number of distinct values (distinctCount) among the buckets, in proportion with the number of rows in each bucket.
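
To make the bucketing idea concrete, here is a self-contained sketch over int64 samples. It is not the package's implementation (which operates on tree.Datums and also computes NumEq and DistinctRange per bucket); it only shows how equal-depth boundaries are chosen and how sampled counts are scaled up to numRows:

    package main

    import (
        "fmt"
        "sort"
    )

    type bucket struct {
        upperBound int64
        numRows    int64 // estimated rows in this bucket, scaled to the table
    }

    func equiDepth(samples []int64, numRows int64, maxBuckets int) []bucket {
        if len(samples) == 0 || maxBuckets <= 0 {
            return nil
        }
        sort.Slice(samples, func(i, j int) bool { return samples[i] < samples[j] })
        n := len(samples)
        perBucket := (n + maxBuckets - 1) / maxBuckets // ceil(n / maxBuckets)
        var buckets []bucket
        for i := 0; i < n; i += perBucket {
            end := i + perBucket
            if end > n {
                end = n
            }
            // Scale this bucket's sample count up to the table's row count.
            est := int64(end-i) * numRows / int64(n)
            buckets = append(buckets, bucket{upperBound: samples[end-1], numRows: est})
        }
        return buckets
    }

    func main() {
        samples := []int64{1, 3, 3, 4, 7, 9, 12, 15, 18, 20}
        fmt.Println(equiDepth(samples, 1000, 4)) // [{4 300} {9 300} {15 300} {20 100}]
    }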

func (*HistogramData) Descriptor

func (*HistogramData) Descriptor() ([]byte, []int)

func (*HistogramData) Marshal

func (m *HistogramData) Marshal() (dAtA []byte, err error)

func (*HistogramData) MarshalTo

func (m *HistogramData) MarshalTo(dAtA []byte) (int, error)

func (*HistogramData) ProtoMessage

func (*HistogramData) ProtoMessage()

func (*HistogramData) Reset

func (m *HistogramData) Reset()

func (*HistogramData) Size

func (m *HistogramData) Size() (n int)

func (*HistogramData) String

func (m *HistogramData) String() string

func (*HistogramData) Unmarshal

func (m *HistogramData) Unmarshal(dAtA []byte) error

func (*HistogramData) XXX_DiscardUnknown

func (m *HistogramData) XXX_DiscardUnknown()

func (*HistogramData) XXX_Marshal

func (m *HistogramData) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*HistogramData) XXX_Merge

func (dst *HistogramData) XXX_Merge(src proto.Message)

func (*HistogramData) XXX_Size

func (m *HistogramData) XXX_Size() int

func (*HistogramData) XXX_Unmarshal

func (m *HistogramData) XXX_Unmarshal(b []byte) error

type HistogramData_Bucket

type HistogramData_Bucket struct {
    // The estimated number of values that are equal to upper_bound.
    NumEq int64 `protobuf:"varint,1,opt,name=num_eq,json=numEq,proto3" json:"num_eq,omitempty"`
    // The estimated number of values in the bucket (excluding those
    // that are equal to upper_bound). Splitting the count into two
    // makes the histogram effectively equivalent to a histogram with
    // twice as many buckets, with every other bucket containing a
    // single value. This might be particularly advantageous if the
    // histogram algorithm makes sure the top "heavy hitters" (most
    // frequent elements) are bucket boundaries (similar to a
    // compressed histogram).
    NumRange int64 `protobuf:"varint,2,opt,name=num_range,json=numRange,proto3" json:"num_range,omitempty"`
    // The estimated number of distinct values in the bucket (excluding
    // those that are equal to upper_bound). This is a floating point
    // value because it is estimated by distributing the known distinct
    // count for the column among the buckets, in proportion to the number
    // of rows in each bucket. This value is in fact derived from the rest
    // of the data, but is included to avoid re-computing it later.
    DistinctRange float64 `protobuf:"fixed64,4,opt,name=distinct_range,json=distinctRange,proto3" json:"distinct_range,omitempty"`
    // The upper boundary of the bucket. The column values for the upper bound
    // are encoded using the ascending key encoding of the column type.
    UpperBound []byte `protobuf:"bytes,3,opt,name=upper_bound,json=upperBound,proto3" json:"upper_bound,omitempty"`
}
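
As a worked example, consider an INT column containing the values 1 through 10, where the value 5 appears 100 times. A two-bucket histogram with boundaries at 5 and 10 might look like the following sketch; enc is a hypothetical helper standing in for the ascending key encoding of the upper bound:

    h := HistogramData{Buckets: []HistogramData_Bucket{
        // Values below 5 are 1,2,3,4 -> NumRange=4, DistinctRange=4;
        // the boundary value 5 itself appears 100 times.
        {UpperBound: enc(5), NumEq: 100, NumRange: 4, DistinctRange: 4},
        // Values in (5, 10) are 6,7,8,9 -> NumRange=4, DistinctRange=4.
        {UpperBound: enc(10), NumEq: 1, NumRange: 4, DistinctRange: 4},
    }}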

func (*HistogramData_Bucket) Descriptor

func (*HistogramData_Bucket) Descriptor() ([]byte, []int)

func (*HistogramData_Bucket) Marshal

func (m *HistogramData_Bucket) Marshal() (dAtA []byte, err error)

func (*HistogramData_Bucket) MarshalTo

func (m *HistogramData_Bucket) MarshalTo(dAtA []byte) (int, error)

func (*HistogramData_Bucket) ProtoMessage

func (*HistogramData_Bucket) ProtoMessage()

func (*HistogramData_Bucket) Reset

func (m *HistogramData_Bucket) Reset()

func (*HistogramData_Bucket) Size

func (m *HistogramData_Bucket) Size() (n int)

func (*HistogramData_Bucket) String

func (m *HistogramData_Bucket) String() string

func (*HistogramData_Bucket) Unmarshal

func (m *HistogramData_Bucket) Unmarshal(dAtA []byte) error

func (*HistogramData_Bucket) XXX_DiscardUnknown

func (m *HistogramData_Bucket) XXX_DiscardUnknown()

func (*HistogramData_Bucket) XXX_Marshal

func (m *HistogramData_Bucket) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*HistogramData_Bucket) XXX_Merge

func (dst *HistogramData_Bucket) XXX_Merge(src proto.Message)

func (*HistogramData_Bucket) XXX_Size

func (m *HistogramData_Bucket) XXX_Size() int

func (*HistogramData_Bucket) XXX_Unmarshal

func (m *HistogramData_Bucket) XXX_Unmarshal(b []byte) error

type JSONHistoBucket

type JSONHistoBucket struct {
    NumEq         int64   `json:"num_eq"`
    NumRange      int64   `json:"num_range"`
    DistinctRange float64 `json:"distinct_range"`
    // UpperBound is the string representation of a datum; parsable with
    // tree.ParseStringAs.
    UpperBound string `json:"upper_bound"`
}

JSONHistoBucket is a struct used for JSON marshaling and unmarshaling of histogram data.

See HistogramData for a description of the fields.

type JSONStatistic

type JSONStatistic struct {
    Name          string   `json:"name,omitempty"`
    CreatedAt     string   `json:"created_at"`
    Columns       []string `json:"columns"`
    RowCount      uint64   `json:"row_count"`
    DistinctCount uint64   `json:"distinct_count"`
    NullCount     uint64   `json:"null_count"`
    // HistogramColumnType is the string representation of the column type for the
    // histogram (or unset if there is no histogram). Parsable with
    // tree.ParseType.
    HistogramColumnType string            `json:"histo_col_type"`
    HistogramBuckets    []JSONHistoBucket `json:"histo_buckets,omitempty"`
}

JSONStatistic is a struct used for JSON marshaling and unmarshaling statistics.

See TableStatistic for a description of the fields.
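
For a sense of the wire format, a small sketch that marshals a JSONStatistic with made-up values (note that histo_col_type is always emitted since it lacks omitempty):

    js := JSONStatistic{
        Name:          "__auto__",
        CreatedAt:     "2019-09-16 00:00:00",
        Columns:       []string{"a"},
        RowCount:      1000,
        DistinctCount: 100,
    }
    out, _ := json.Marshal(js)
    fmt.Println(string(out))
    // {"name":"__auto__","created_at":"2019-09-16 00:00:00","columns":["a"],
    //  "row_count":1000,"distinct_count":100,"null_count":0,"histo_col_type":""}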

func (*JSONStatistic) DecodeAndSetHistogram

func (js *JSONStatistic) DecodeAndSetHistogram(datum tree.Datum) error

DecodeAndSetHistogram decodes a histogram marshaled as a Bytes datum and fills in the HistogramColumnType and HistogramBuckets fields.

func (*JSONStatistic) GetHistogram

func (js *JSONStatistic) GetHistogram(evalCtx *tree.EvalContext) (*HistogramData, error)

GetHistogram converts the json histogram into HistogramData.

func (*JSONStatistic) SetHistogram

func (js *JSONStatistic) SetHistogram(h *HistogramData) error

SetHistogram fills in the HistogramColumnType and HistogramBuckets fields.

type Refresher

type Refresher struct {
    // contains filtered or unexported fields
}

Refresher is responsible for automatically refreshing the table statistics that are used by the cost-based optimizer. It is necessary to periodically refresh the statistics to prevent them from becoming stale as data in the database changes.

The Refresher is designed to schedule a CREATE STATISTICS refresh job after approximately X% of total rows have been updated/inserted/deleted in a given table. X defaults to 20% and is controlled by the AutomaticStatisticsFractionStaleRows cluster setting.

The decision to refresh is based on a percentage rather than a fixed number of rows because if a table is huge and rarely updated, we don't want to waste time frequently refreshing stats. Likewise, if it's small and rapidly updated, we want to update stats more often.

To avoid contention on row update counters, we use a statistical approach. For example, suppose we want to refresh stats after 20% of rows are updated and there are currently 1M rows in the table. If a user updates 10 rows, we use random number generation to refresh stats with probability 10/(1M * 0.2) = 0.00005. The general formula is:

                        # rows updated/inserted/deleted
p =  --------------------------------------------------------------------
     (# rows in table) * (target fraction of rows updated before refresh)

The existing statistics in the stats cache are used to get the number of rows in the table.

In order to prevent small tables from being constantly refreshed, we also require that a minimum number of rows have changed in addition to the target fraction; this minimum is controlled by AutomaticStatisticsMinStaleRows (500 by default). A sketch of the combined trigger follows.
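
A minimal sketch of this trigger, combining the formula above with the minimum-stale-rows requirement; the constants are the defaults of the two cluster settings, rand is math/rand, and folding the two thresholds into a single target is an assumption of the sketch rather than the package's exact arithmetic:

    func shouldRefresh(rowsAffected, rowsInTable int64) bool {
        const targetFraction = 0.2 // sql.stats.automatic_collection.fraction_stale_rows
        const minStaleRows = 500   // sql.stats.automatic_collection.min_stale_rows
        target := targetFraction*float64(rowsInTable) + float64(minStaleRows)
        // Refresh with probability proportional to the fraction of the
        // target that this mutation represents.
        return rand.Float64() < float64(rowsAffected)/target
    }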

Refresher also implements some heuristic limits designed to corral statistical outliers. If we haven't refreshed stats in 2x the average time between the last few refreshes, we automatically trigger a refresh. The existing statistics in the stats cache are used to calculate the average time between refreshes as well as to determine when the stats were last updated.

If the decision is made to continue with the refresh, Refresher runs CREATE STATISTICS on the given table with the default set of column statistics. See comments in sql/create_stats.go for details about which default columns are chosen. Refresher runs CREATE STATISTICS with AS OF SYSTEM TIME '-30s' to minimize performance impact on running transactions.

To avoid adding latency to SQL mutation operations, the Refresher is run in one separate background thread per Server. SQL mutation operations signal to the Refresher thread by calling NotifyMutation, which sends mutation metadata to the Refresher thread over a non-blocking buffered channel. The signaling is best-effort; if the channel is full, the metadata will not be sent.
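
The best-effort signaling can be sketched in a few lines; the mutation type, channel name, and buffer size below are assumptions, but the select/default pattern is the standard Go way to drop a message instead of blocking the sender:

    type mutation struct {
        tableID      int64 // stands in for sqlbase.ID
        rowsAffected int
    }

    var mutations = make(chan mutation, 1024) // buffer size is illustrative

    func notifyMutation(tableID int64, rowsAffected int) {
        select {
        case mutations <- mutation{tableID, rowsAffected}:
        default:
            // Channel full: drop the signal. A later mutation will trigger
            // the refresh instead.
        }
    }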

func MakeRefresher

func MakeRefresher(
    st *cluster.Settings,
    ex sqlutil.InternalExecutor,
    cache *TableStatisticsCache,
    asOfTime time.Duration,
) *Refresher

MakeRefresher creates a new Refresher.

func (*Refresher) NotifyMutation

func (r *Refresher) NotifyMutation(tableID sqlbase.ID, rowsAffected int)

NotifyMutation is called by SQL mutation operations to signal to the Refresher that a table has been mutated. It should be called after any successful insert, update, upsert or delete. rowsAffected refers to the number of rows written as part of the mutation operation.

func (*Refresher) Start

func (r *Refresher) Start(
    ctx context.Context, stopper *stop.Stopper, refreshInterval time.Duration,
) error

Start starts the stats refresher thread, which polls for messages about new SQL mutations and refreshes the table statistics with probability proportional to the percentage of rows affected.
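
Putting the two calls together, a plausible wiring during server startup (the surrounding variables st, internalExecutor, statsCache, ctx, and stopper are assumed to exist):

    refresher := MakeRefresher(st, internalExecutor, statsCache, DefaultAsOfTime)
    if err := refresher.Start(ctx, stopper, DefaultRefreshInterval); err != nil {
        return err
    }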

type SampleReservoir

type SampleReservoir struct {
    // contains filtered or unexported fields
}

SampleReservoir implements reservoir sampling using random sort. Each row is assigned a rank (which should be a uniformly generated random value), and rows with the smallest K ranks are retained.

This is implemented as a max-heap of the smallest K ranks; each row can replace the row with the maximum rank. Note that heap operations only happen when we actually encounter a row that is among the top K so far; the probability of this is K/N if there were N rows so far; for large streams, we would have O(K log K) heap operations. The overall running time for a stream of size N is O(N + K log^2 K).

The same structure can be used to combine sample sets (as long as the original ranks are preserved) for distributed reservoir sampling. The requirement is that the capacity of each distributed reservoir must have been at least as large as this reservoir.
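
A standalone sketch of the rank-based scheme (strings stand in for EncDatumRows): keep the K rows with the smallest random ranks in a max-heap keyed on rank, so the current maximum can be evicted in O(log K) when a smaller rank arrives. Like the real type, it satisfies heap.Interface but leaves Pop unused:

    package main

    import (
        "container/heap"
        "fmt"
        "math/rand"
    )

    type sampled struct {
        row  string // stands in for sqlbase.EncDatumRow
        rank uint64
    }

    type reservoir struct {
        capacity int
        samples  []sampled
    }

    func (r *reservoir) Len() int           { return len(r.samples) }
    func (r *reservoir) Less(i, j int) bool { return r.samples[i].rank > r.samples[j].rank } // max-heap
    func (r *reservoir) Swap(i, j int)      { r.samples[i], r.samples[j] = r.samples[j], r.samples[i] }
    func (r *reservoir) Push(x interface{}) { r.samples = append(r.samples, x.(sampled)) }
    func (r *reservoir) Pop() interface{}   { panic("unused") }

    // sampleRow keeps the row if its rank is among the smallest seen so far.
    func (r *reservoir) sampleRow(row string, rank uint64) {
        if len(r.samples) < r.capacity {
            r.samples = append(r.samples, sampled{row, rank})
            if len(r.samples) == r.capacity {
                heap.Init(r) // heapify once the reservoir is full
            }
            return
        }
        if rank < r.samples[0].rank {
            r.samples[0] = sampled{row, rank} // replace the current max rank
            heap.Fix(r, 0)
        }
    }

    func main() {
        r := &reservoir{capacity: 3}
        for i := 0; i < 100; i++ {
            r.sampleRow(fmt.Sprintf("row%d", i), rand.Uint64())
        }
        fmt.Println(r.samples) // three uniformly sampled rows
    }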

func (*SampleReservoir) Disable

func (sr *SampleReservoir) Disable()

Disable releases the memory of this SampleReservoir and sets its capacity to zero.

func (*SampleReservoir) Get

func (sr *SampleReservoir) Get() []SampledRow

Get returns the sampled rows.

func (*SampleReservoir) Init

func (sr *SampleReservoir) Init(numSamples int, colTypes []types.T, memAcc *mon.BoundAccount)

Init initializes a SampleReservoir.

func (*SampleReservoir) Len

func (sr *SampleReservoir) Len() int

Len is part of heap.Interface.

func (*SampleReservoir) Less

func (sr *SampleReservoir) Less(i, j int) bool

Less is part of heap.Interface.

func (*SampleReservoir) Pop

func (sr *SampleReservoir) Pop() interface{}

Pop is part of heap.Interface, but we're not using it.

func (*SampleReservoir) Push

func (sr *SampleReservoir) Push(x interface{})

Push is part of heap.Interface, but we're not using it.

func (*SampleReservoir) SampleRow

func (sr *SampleReservoir) SampleRow(
    ctx context.Context, evalCtx *tree.EvalContext, row sqlbase.EncDatumRow, rank uint64,
) error

SampleRow looks at a row and either drops it or adds it to the reservoir.

func (*SampleReservoir) Swap

func (sr *SampleReservoir) Swap(i, j int)

Swap is part of heap.Interface.

type SampledRow

type SampledRow struct {
    Row  sqlbase.EncDatumRow
    Rank uint64
}

SampledRow is a row that was sampled.

type TableStatistic

type TableStatistic struct {
    // The ID of the table.
    TableID github_com_cockroachdb_cockroach_pkg_sql_sqlbase.ID `protobuf:"varint,1,opt,name=table_id,json=tableId,proto3,casttype=github.com/cockroachdb/cockroach/pkg/sql/sqlbase.ID" json:"table_id,omitempty"`
    // The ID for this statistic.  It need not be globally unique,
    // but must be unique for this table.
    StatisticID uint64 `protobuf:"varint,2,opt,name=statistic_id,json=statisticId,proto3" json:"statistic_id,omitempty"`
    // Optional user-defined name for the statistic.
    Name string `protobuf:"bytes,3,opt,name=name,proto3" json:"name,omitempty"`
    // The column ID(s) for which this statistic is generated.
    ColumnIDs []github_com_cockroachdb_cockroach_pkg_sql_sqlbase.ColumnID `protobuf:"varint,4,rep,packed,name=column_ids,json=columnIds,proto3,casttype=github.com/cockroachdb/cockroach/pkg/sql/sqlbase.ColumnID" json:"column_ids,omitempty"`
    // The time at which the statistic was created.
    CreatedAt time.Time `protobuf:"bytes,5,opt,name=created_at,json=createdAt,proto3,stdtime" json:"created_at"`
    // The total number of rows in the table.
    RowCount uint64 `protobuf:"varint,6,opt,name=row_count,json=rowCount,proto3" json:"row_count,omitempty"`
    // The estimated number of distinct values of the columns in ColumnIDs.
    DistinctCount uint64 `protobuf:"varint,7,opt,name=distinct_count,json=distinctCount,proto3" json:"distinct_count,omitempty"`
    // The number of rows that have a NULL in any of the columns in ColumnIDs.
    NullCount uint64 `protobuf:"varint,8,opt,name=null_count,json=nullCount,proto3" json:"null_count,omitempty"`
    // Histogram (if available)
    Histogram *HistogramData `protobuf:"bytes,9,opt,name=histogram,proto3" json:"histogram,omitempty"`
}

A TableStatistic object holds a statistic for a particular column or group of columns. It mirrors the structure of the system.table_statistics table. It is also used as the format in which table statistics are serialized in a backup.

func (*TableStatistic) Descriptor

func (*TableStatistic) Descriptor() ([]byte, []int)

func (*TableStatistic) Marshal

func (m *TableStatistic) Marshal() (dAtA []byte, err error)

func (*TableStatistic) MarshalTo

func (m *TableStatistic) MarshalTo(dAtA []byte) (int, error)

func (*TableStatistic) ProtoMessage

func (*TableStatistic) ProtoMessage()

func (*TableStatistic) Reset

func (m *TableStatistic) Reset()

func (*TableStatistic) Size

func (m *TableStatistic) Size() (n int)

func (*TableStatistic) String

func (m *TableStatistic) String() string

func (*TableStatistic) Unmarshal

func (m *TableStatistic) Unmarshal(dAtA []byte) error

func (*TableStatistic) XXX_DiscardUnknown

func (m *TableStatistic) XXX_DiscardUnknown()

func (*TableStatistic) XXX_Marshal

func (m *TableStatistic) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*TableStatistic) XXX_Merge

func (dst *TableStatistic) XXX_Merge(src proto.Message)

func (*TableStatistic) XXX_Size

func (m *TableStatistic) XXX_Size() int

func (*TableStatistic) XXX_Unmarshal

func (m *TableStatistic) XXX_Unmarshal(b []byte) error

type TableStatisticsCache

type TableStatisticsCache struct {
    Gossip      *gossip.Gossip
    ClientDB    *client.DB
    SQLExecutor sqlutil.InternalExecutor
    // contains filtered or unexported fields
}

A TableStatisticsCache contains two underlying LRU caches:

(1) A cache of []*TableStatistic objects, keyed by table ID. Each entry consists of all the statistics for different columns and column groups for the given table.

(2) A cache of *HistogramData objects, keyed by HistogramCacheKey{table ID, statistic ID}.

func NewTableStatisticsCache

func NewTableStatisticsCache(
    cacheSize int, g *gossip.Gossip, db *client.DB, sqlExecutor sqlutil.InternalExecutor,
) *TableStatisticsCache

NewTableStatisticsCache creates a new TableStatisticsCache that can hold statistics for <cacheSize> tables.

func (*TableStatisticsCache) GetTableStats

func (sc *TableStatisticsCache) GetTableStats(
    ctx context.Context, tableID sqlbase.ID,
) ([]*TableStatistic, error)

GetTableStats looks up statistics for the requested table ID in the cache, and if the stats are not present in the cache, it looks them up in system.table_statistics.

The statistics are ordered by their CreatedAt time (newest-to-oldest).
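
A brief lookup sketch, assuming an initialized cache (sc) and a table ID in scope; since results are newest-first, the first entry named AutoStatsName is the most recent automatic refresh:

    stats, err := sc.GetTableStats(ctx, tableID)
    if err != nil {
        return err
    }
    for _, stat := range stats {
        if stat.Name == AutoStatsName {
            fmt.Printf("last automatic refresh: %s\n", stat.CreatedAt)
            break
        }
    }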

func (*TableStatisticsCache) InvalidateTableStats

func (sc *TableStatisticsCache) InvalidateTableStats(ctx context.Context, tableID sqlbase.ID)

InvalidateTableStats invalidates the cached statistics for the given table ID.

Package stats imports 34 packages and is imported by 16 packages.