`import "github.com/cockroachdb/cockroach/pkg/sql/opt/props"`

- Variables
- type AvailableRuleProps
- type Cardinality
- func (c Cardinality) Add(other Cardinality) Cardinality
- func (c Cardinality) AsLowAs(min uint32) Cardinality
- func (c Cardinality) AtLeast(other Cardinality) Cardinality
- func (c Cardinality) CanBeZero() bool
- func (c Cardinality) IsOne() bool
- func (c Cardinality) IsZero() bool
- func (c Cardinality) IsZeroOrOne() bool
- func (c Cardinality) Limit(max uint32) Cardinality
- func (c Cardinality) Product(other Cardinality) Cardinality
- func (c Cardinality) Skip(rows uint32) Cardinality
- func (c Cardinality) String() string
- type ColStatsMap
- func (m *ColStatsMap) Add(cols opt.ColSet) (_ *ColumnStatistic, added bool)
- func (m *ColStatsMap) Clear()
- func (m *ColStatsMap) CopyFrom(other *ColStatsMap)
- func (m *ColStatsMap) Count() int
- func (m *ColStatsMap) Get(nth int) *ColumnStatistic
- func (m *ColStatsMap) Lookup(cols opt.ColSet) (colStat *ColumnStatistic, ok bool)
- func (m *ColStatsMap) RemoveIntersecting(cols opt.ColSet)
- type ColumnStatistic
- type ColumnStatistics
- func (c ColumnStatistics) Len() int
- func (c ColumnStatistics) Less(i, j int) bool
- func (c ColumnStatistics) Swap(i, j int)
- type FuncDepSet
- func (f *FuncDepSet) AddConstants(cols opt.ColSet)
- func (f *FuncDepSet) AddEquivFrom(fdset *FuncDepSet)
- func (f *FuncDepSet) AddEquivalency(a, b opt.ColumnID)
- func (f *FuncDepSet) AddFrom(fdset *FuncDepSet)
- func (f *FuncDepSet) AddLaxKey(keyCols, allCols opt.ColSet)
- func (f *FuncDepSet) AddStrictKey(keyCols, allCols opt.ColSet)
- func (f *FuncDepSet) AddSynthesizedCol(from opt.ColSet, col opt.ColumnID)
- func (f *FuncDepSet) ColSet() opt.ColSet
- func (f *FuncDepSet) ColsAreLaxKey(cols opt.ColSet) bool
- func (f *FuncDepSet) ColsAreStrictKey(cols opt.ColSet) bool
- func (f *FuncDepSet) ComputeClosure(cols opt.ColSet) opt.ColSet
- func (f *FuncDepSet) ComputeEquivClosure(cols opt.ColSet) opt.ColSet
- func (f *FuncDepSet) ComputeEquivGroup(rep opt.ColumnID) opt.ColSet
- func (f *FuncDepSet) ConstantCols() opt.ColSet
- func (f *FuncDepSet) CopyFrom(fdset *FuncDepSet)
- func (f *FuncDepSet) Empty() bool
- func (f *FuncDepSet) EquivReps() opt.ColSet
- func (f *FuncDepSet) HasMax1Row() bool
- func (f *FuncDepSet) InClosureOf(cols, in opt.ColSet) bool
- func (f *FuncDepSet) LaxKey() (_ opt.ColSet, ok bool)
- func (f *FuncDepSet) MakeApply(inner *FuncDepSet)
- func (f *FuncDepSet) MakeFullOuter(leftCols, rightCols, notNullInputCols opt.ColSet)
- func (f *FuncDepSet) MakeLeftOuter( leftFDs, filtersFDs *FuncDepSet, leftCols, rightCols, notNullInputCols opt.ColSet, )
- func (f *FuncDepSet) MakeMax1Row(cols opt.ColSet)
- func (f *FuncDepSet) MakeNotNull(notNullCols opt.ColSet)
- func (f *FuncDepSet) MakeProduct(inner *FuncDepSet)
- func (f *FuncDepSet) ProjectCols(cols opt.ColSet)
- func (f *FuncDepSet) ReduceCols(cols opt.ColSet) opt.ColSet
- func (f *FuncDepSet) StrictKey() (_ opt.ColSet, ok bool)
- func (f FuncDepSet) String() string
- func (f FuncDepSet) StringOnlyFDs() string
- func (f *FuncDepSet) Verify()
- type Histogram
- func (h *Histogram) ApplySelectivity(selectivity float64) *Histogram
- func (h *Histogram) Bucket(i int) *cat.HistogramBucket
- func (h *Histogram) BucketCount() int
- func (h *Histogram) CanFilter(c *constraint.Constraint) (colOffset, exactPrefix int, ok bool)
- func (h *Histogram) DistinctValuesCount() float64
- func (h *Histogram) Filter(c *constraint.Constraint) *Histogram
- func (h *Histogram) Init( evalCtx *tree.EvalContext, col opt.ColumnID, buckets []cat.HistogramBucket, )
- func (h *Histogram) String() string
- func (h *Histogram) ValuesCount() float64
- type JoinMultiplicity
- func (mp *JoinMultiplicity) JoinDoesNotDuplicateLeftRows() bool
- func (mp *JoinMultiplicity) JoinDoesNotDuplicateRightRows() bool
- func (mp *JoinMultiplicity) JoinPreservesLeftRows() bool
- func (mp *JoinMultiplicity) JoinPreservesRightRows() bool
- func (mp *JoinMultiplicity) String() string
- type MultiplicityValue
- type Relational
- func (r *Relational) IsAvailable(p AvailableRuleProps) bool
- func (r *Relational) SetAvailable(p AvailableRuleProps)
- type Scalar
- func (s *Scalar) IsAvailable(p AvailableRuleProps) bool
- func (s *Scalar) SetAvailable(p AvailableRuleProps)
- type Shared
- type Statistics
- func (s *Statistics) ApplySelectivity(selectivity float64)
- func (s *Statistics) CopyFrom(other *Statistics)
- func (s *Statistics) Init(relProps *Relational) (zeroCardinality bool)
- func (s *Statistics) String() string
- func (s *Statistics) UnionWith(other *Statistics)
- type VolatilitySet
- func (vs *VolatilitySet) Add(v tree.Volatility)
- func (vs *VolatilitySet) AddImmutable()
- func (vs *VolatilitySet) AddStable()
- func (vs *VolatilitySet) AddVolatile()
- func (vs VolatilitySet) HasStable() bool
- func (vs VolatilitySet) HasVolatile() bool
- func (vs VolatilitySet) IsLeakProof() bool
- func (vs VolatilitySet) String() string
- func (vs *VolatilitySet) UnionWith(other VolatilitySet)
- type WithUseInfo
- type WithUsesMap

cardinality.go col_stats_map.go func_dep.go histogram.go logical.go multiplicity.go statistics.go volatility.go

❖

var AnyCardinality = Cardinality{Min: 0, Max: math.MaxUint32}

AnyCardinality indicates that any number of rows can be returned by an expression.

❖

var OneCardinality = Cardinality{Min: 1, Max: 1}

OneCardinality indicates that exactly one row will be returned by expression.

❖

var ZeroCardinality = Cardinality{}

ZeroCardinality indicates that no rows will be returned by expression.

AvailableRuleProps is a bit set that indicates when lazily-populated Rule properties are initialized and ready for use.

❖

const ( // PruneCols is set when the Relational.Rule.PruneCols field is populated. PruneCols AvailableRuleProps = 1 << iota // RejectNullCols is set when the Relational.Rule.RejectNullCols field is // populated. RejectNullCols // InterestingOrderings is set when the Relational.Rule.InterestingOrderings // field is populated. InterestingOrderings // HasHoistableSubquery is set when the Scalar.Rule.HasHoistableSubquery // is populated. HasHoistableSubquery // UnfilteredCols is set when the Relational.Rule.UnfilteredCols field is // populated. UnfilteredCols // JoinSize is set when the Relational.Rule.JoinSize field is populated. JoinSize // WithUses is set when the Shared.Rule.WithUses field is populated. WithUses )

Cardinality is the number of rows that can be returned by a relational expression. Both Min and Max are inclusive bounds. If Max = math.MaxUint32, that indicates there is no limit to the number of returned rows. Max should always be >= Min, or method results are undefined. Cardinality is determined from the relational properties, and are "hard" bounds that are never incorrect. This constrasts with row cardinality derived by the statistics code, which are only estimates and may be incorrect.

❖

func (c Cardinality) Add(other Cardinality) Cardinality

Add sums the min and max bounds to get a combined count of rows.

❖

func (c Cardinality) AsLowAs(min uint32) Cardinality

AsLowAs ratchets the min bound downwards in order to ensure that it allows values that are >= the min value.

❖

func (c Cardinality) AtLeast(other Cardinality) Cardinality

AtLeast ratchets the bounds upwards so that they're at least as large as the bounds in the given cardinality.

❖

func (c Cardinality) CanBeZero() bool

CanBeZero is true if the expression can return zero rows.

❖

func (c Cardinality) IsOne() bool

IsOne returns true if the expression always returns one row.

❖

func (c Cardinality) IsZero() bool

IsZero returns true if the expression never returns any rows.

❖

func (c Cardinality) IsZeroOrOne() bool

IsZeroOrOne is true if the expression never returns more than one row.

❖

func (c Cardinality) Limit(max uint32) Cardinality

Limit ratchets the bounds downwards so that they're no bigger than the given max value.

❖

func (c Cardinality) Product(other Cardinality) Cardinality

Product multiples the min and max bounds to get the combined product of rows.

❖

func (c Cardinality) Skip(rows uint32) Cardinality

Skip subtracts the given number of rows from the min and max bounds to account for skipped rows.

❖

func (c Cardinality) String() string

❖

```
type ColStatsMap struct {
// contains filtered or unexported fields
}
```

ColStatsMap stores a set of column statistics, each of which is keyed by the set of columns over which that statistic is defined. Statistics can be added, removed, and efficiently accessed (by opt.ColumnSet key or by ordinal position) and enumerated.

Since most expressions have just a few column statistics attached to them, ColStatsMap optimizes for this case by storing the first 3 column statistics inline. Additional column statistics trigger the creation of a slice to store them, as well as a lookup index for efficient lookup by opt.ColSet.

Because opt.ColSet contains a pointer, it is not useful as a map key for fast statistic lookup. So instead of directly using opt.ColSet as a key in a Go map, ColStatsMap uses a prefix tree index. Each opt.ColSet key is treated as a string of ascending opt.ColumnID values that are each hashed by its own value plus a prefix id that uniquely identifies the set of smaller values. For example, if an opt.ColSet contains (2, 3, 6), then its index looks like:

(prefix: 0, id: 2) => (prefix: 1, pos: -1) └── (prefix: 1, id: 3) => (prefix: 2, pos: -1) └── (prefix: 2, id: 6) => (prefix: 3, pos: 0)

Where pos is the ordinal position of the statistic in ColStatsMap, and pos=-1 signifies that there is not yet any statistic for that column set. If an additional opt.ColSet containing (2, 4) is added to the index, then it shares the initial lookup node, but then diverges:

(prefix: 0, id: 2) => (prefix: 1, pos: -1) ├── (prefix: 1, id: 3) => (prefix: 2, pos: -1) │ └── (prefix: 2, id: 6) => (prefix: 3, pos: 0) └── (prefix: 1, id: 4) => (prefix: 4, pos: 1)

This algorithm can be implemented by a single Go map that uses efficient int64 keys and values. It requires O(N) accesses to add and find a column statistic, where N is the number of values in the column set key.

❖

func (m *ColStatsMap) Add(cols opt.ColSet) (_ *ColumnStatistic, added bool)

Add ensures that a ColumnStatistic over the given columns is in the map. If it does not yet exist in the map, then Add adds a new blank ColumnStatistic and returns it, along with added=true. Otherwise, Add returns the existing ColumnStatistic with added=false. NOTE: The returned *ColumnStatistic is only valid until this ColStatsMap is

updated via another call to Add() or RemoveIntersecting(). At that point, the address of the statistic may have changed, so it must be fetched again using Lookup() or Get().

❖

func (m *ColStatsMap) Clear()

Clear empties the map of all column statistics.

❖

func (m *ColStatsMap) CopyFrom(other *ColStatsMap)

CopyFrom sets this map to a deep copy of another map, which can be modified independently.

❖

func (m *ColStatsMap) Count() int

Count returns the number of column statistics in the map.

❖

func (m *ColStatsMap) Get(nth int) *ColumnStatistic

Get returns the nth statistic in the map, by its ordinal position. This position is stable across calls to Get or Add (but not RemoveIntersecting). NOTE: The returned *ColumnStatistic is only valid until this ColStatsMap is

updated via a call to Add() or RemoveIntersecting(). At that point, the address of the statistic may have changed, so it must be fetched again using another call to Get() or Lookup().

❖

func (m *ColStatsMap) Lookup(cols opt.ColSet) (colStat *ColumnStatistic, ok bool)

Lookup returns the column statistic indexed by the given column set. If no such statistic exists in the map, then ok=false. NOTE: The returned *ColumnStatistic is only valid until this ColStatsMap is

updated via a call to Add() or RemoveIntersecting(). At that point, the address of the statistic may have changed, so it must be fetched again using another call to Lookup() or Get().

❖

func (m *ColStatsMap) RemoveIntersecting(cols opt.ColSet)

RemoveIntersecting scans the set of column statistics in the ColStatsMap and removes any that are defined over any of the columns in the given set. For example, if the map contains stats for (1), (1,2), and (3), then removing (1) would remove the (1) and (1,2) stats from the map.

❖

type ColumnStatistic struct { // Cols is the set of columns whose data are summarized by this // ColumnStatistic struct. The ColSet is never modified in-place. Cols opt.ColSet // DistinctCount is the estimated number of distinct values of this // set of columns for this expression. Includes null values. DistinctCount float64 // NullCount is the estimated number of null values of this set of // columns for this expression. For multi-column stats, this null // count tracks only the rows in which all columns in the set are null. NullCount float64 // Histogram is only used when the size of Cols is one. It contains // the approximate distribution of values for that column, represented // by a slice of histogram buckets. Histogram *Histogram }

ColumnStatistic is a collection of statistics that applies to a particular set of columns. In theory, a table could have a ColumnStatistic object for every possible subset of columns. In practice, it is only worth maintaining statistics on a few columns and column sets that are frequently used in predicates, group by columns, etc.

ColumnStatistiscs can be copied by value.

❖

func (c *ColumnStatistic) ApplySelectivity(selectivity, inputRows float64)

ApplySelectivity updates the distinct count, null count, and histogram according to a given selectivity.

❖

type ColumnStatistics []*ColumnStatistic

ColumnStatistics is a slice of pointers to ColumnStatistic values.

❖

func (c ColumnStatistics) Len() int

Len returns the number of ColumnStatistic values.

❖

func (c ColumnStatistics) Less(i, j int) bool

Less is part of the Sorter interface.

❖

func (c ColumnStatistics) Swap(i, j int)

Swap is part of the Sorter interface.

❖

```
type FuncDepSet struct {
// contains filtered or unexported fields
}
```

FuncDepSet is a set of functional dependencies (FDs) that encode useful relationships between columns in a base or derived relation. Given two sets of columns A and B, a functional dependency A-->B holds if A fully determines B. In other words, if two different rows have equal values for columns in A, then those two rows will also have equal values for columns in B. For example, where columns (a1, a2) are in set A, and column (b1) is in set B:

a1 a2 b1 -------- 1 2 5 1 2 5 3 4 6 3 4 6

The left side of a functional dependency is called the "determinant", and the right side is called the "dependant". Each side can contain zero or more columns, though the FuncDepSet library will fold away certain combinations that don't provide useful information, like A-->A and A-->(), since every column trivially determines itself, as well as the empty set.

When a dependant contains multiple columns, it is equivalent to splitting the single FD into multiple FDs, each with a single column dependant:

(a)-->(b,c)

is equivalent to these two FDs:

(a)-->(b) (a)-->(c)

When a determinant contains zero columns, as in ()-->A, then A is fully determined without reference to any other columns. An equivalent statement is that any arbitrary combination of determinant columns trivially determines A. And both of these statements are just another way of saying that columns in A are constant:

a1 a2 b1 c1 ---------------- 1 NULL 3 3 1 NULL 3 NULL 1 NULL 4 NULL

When a determinant contains multiple columns, then the functional dependency holds for the *composite* value of those columns. For example:

a1 a2 b1 -------- 1 2 5 1 2 5 1 3 4

These are valid values, even though a1 has the same values for all three rows, because it is only the combination of (a1,a2) that determines (b1).

Multiple FDs can be transitively applied in order to compute the "closure" of a set of input columns. The closure includes the input columns plus all columns that are functionally dependent on those columns, either directly or indirectly. Consider this set of FD's:

(a)-->(b,c,d) (b,c,e)-->(f) (d)-->(e)

The transitive closure of (a) is (a,b,c,d,e,f). To start, (a) determines (b,c,d). From there, (d) transitively determines (e). And now that (b,c,e) have been determined, they in turn determine (f). Because (a) determines all other columns, if two rows have the same value for (a), then the rows will be duplicates, since all other columns will be equal. And if there are no duplicate rows, then (a) is a key for the relation.

Base table primary keys can be trivially mapped into an FD set, since the primary key always uniquely determines the other columns:

CREATE TABLE t (a INT PRIMARY KEY, b INT, c INT) (a)-->(b,c)

Each SQL relational operator derives its own FD set from the FD sets of its inputs. For example, the Select operator augments the FD set of its input, based on its filter condition:

SELECT * FROM t WHERE a=1

Equating a column to a constant value constructs a new FD with an empty determinant, so that the augmented FD set becomes:

(a)-->(b,c) ()-->(a)

Since the value of column "a" is always the same, and since "a" functionally determines "b" and "c", the values of all columns are constants. Furthermore, because "a" is known to be a key, the result set can have at most one row.

This is but one example of how FDs can assist the optimizer in proving useful properties about query results. This information powers many optimizations, including eliminating unnecessary DISTINCT operators, simplifying ORDER BY columns, removing Max1Row operators, and mapping semi-joins to inner-joins.

FDs become more complex when the possibility of NULL values is introduced. SQL semantics often treat a NULL value as an "unknown" value that is not equal to any other value, including another NULL value. For example, SQL unique indexes exhibit this behavior:

CREATE TABLE t (a INT PRIMARY KEY, b INT, c INT, UNIQUE (b))

Here, "b" column values are unique...except for the case of multiple NULL values, which are allowed because each NULL is treated as if it was a different value. Contrast this with the different NULL handling rules used by SQL's GROUP BY and DISTINCT operators. Those operators treat multiple NULL values as duplicates, because each NULL is treated as if it was the same value.

The functional dependencies described up until now always use the "NULLs are equal" semantics (denoted as NULL= hereafter) in order to answer the question "are these two columns equal". The semantics are identical to what this SQL expression returns:

((c1 = c2) OR (c1 IS NULL AND c2 IS NULL)) IS True

And here are some examples:

c1 c2 NULL= ----------------- 1 1 true NULL NULL true 1 2 false 1 NULL false NULL 1 false

So now for the definition of A-->B that incorporates NULL values:

for any two rows r1 and r2 in the relation: A(r1) NULL= A(r2) ==> B(r1) NULL= B(r2)

Intuitively, if two different rows have equal values for A using "NULLs are equal" semantics, then those rows will also have equal values for B using those same semantics. As an example, the following sets of rows would be valid for the dependency (b)-->(c):

b c ---------- 1 NULL 1 NULL NULL 1 NULL 1 2 3 2 3 b c ---------- NULL NULL NULL NULL

but these sets of rows would be invalid:

b c ---------- NULL 1 NULL NULL b c ---------- NULL 1 NULL 2

Unique constraints allow the latter cases, however, and therefore it is desirable to somehow encode these weaker dependencies as FDs, because they can be strengthened later on if NULL values are filtered from determinant columns (more on that below).

The solution is to store an extra "strict" bit on each FD. If true, then the the FD is a "strict" dependency, and behaves as described above. However, if false, then the FD is a "lax" dependency. Lax dependencies use "squiggly" arrow notation to differentiate them from the strict variant:

A~~>B

In contrast to strict dependencies, lax dependencies treat NULLs on determinant columns as distinct from one another, with equality semantics identical to this SQL expression:

(c1 = c2) IS True

In other words, if either c1 or c2 is NULL, or both are NULL, then c1 is considered not equal to c2. The definition for A~~>B follows from that:

for any two rows r1 and r2 in the relation: (A(r1) = A(r2)) IS True ==> B(r1) NULL= B(r2)

In other words, if two different non-NULL rows have equal values for A, then those rows will also have equal values for B using NULL= semantics. Note that both strict and lax equality definitions collapse to the same semantics when the columns of A are not-NULL. The example row sets shown above that were invalid for a strict dependency are valid for a lax dependency:

b c ---------- NULL 1 NULL NULL b c ---------- NULL 1 NULL 2

To continue the CREATE TABLE example shown above, another FD can now be derived from that statement, in addition to the primary key FD:

(a)-->(b,c) (b)~~>(a,c)

Lax dependencies are *not* transitive, and have limited usefulness as-is. However, some operators (like Select) can "reject" NULL values, which means that they filter out rows containing the troublesome NULL values. That makes it possible for the operator to "upgrade" a lax dependency to a strict dependency (recall that the both have identical semantics when NULLs are not present), as in this example:

SELECT * FROM t WHERE b>5

The ">" operator rejects NULL values, which means that the Select operator can convert the lax dependency to a strict dependency:

(a)-->(b,c) (b)-->(a,c)

Now, either the "a" or "b" column determines the values of all other columns, and both are keys for the relation.

Another thing to note is that a lax dependency with an empty determinant is the same as the corresponding strict dependency:

()~~>(a,b) ()-->(a,b)

As described above, a strict dependency differs from a lax dependency only in terms of what values are allowed in determinant columns. Since the determinant has no columns in these cases, the semantics will be identical. For that reason, this library automatically maps lax constant dependencies to strict constant dependencies.

A key is a set of columns that have a unique composite value for every row in the relation. There are two kinds of keys, strict and lax, that parallel the two kinds of functional dependencies. Strict keys treat NULL values in key columns as equal to one another:

b c -------- 1 10 2 20 NULL 30

Here, "b" is a key for the relation, even though it contains a NULL value, because there is only one such value. Multiple NULL values would violate the strict key, because they would compare as equal, and therefore would be considered duplicates. The SQL GROUP BY operator uses the same semantics for grouping (it's no coincidence that the definition for strict keys follows that lead).

By contrast, lax keys treat NULL values in key columns as distinct from one another, and so considers column "b" as unique in the following example:

b c -------- 1 10 2 20 NULL 30 NULL 40

Note that both strict and lax keys treat non-NULL values identically; values from two different rows must never compare equal to one another. In addition, the presence of a strict or lax key always implies a functional dependency with the key as determinant and all other columns in the relation as dependants. Here is an example assuming a table with columns (a,b,c,d):

lax-key(a,b) => (a,b)~~>(c,d) strict-key(a,b) => (a,b)-->(c,d)

The "empty key" is a special key that has zero columns. It is used when the relation is guaranteed to have at most one row. In this special case, every column is constant. Every combination of columns is a trivial key for the relation and determines every other column. Because the lax and strict key designations are equivalent when there is a single row, empty keys are always normalized to be strict for convenience.

FuncDepSet tracks whether at least one key (whether it be strict or lax) exists for the relation. If this is true, then all possible keys for the relation can be enumerated using the FD set. This is because any subset of columns forms a key if its FD closure contains every column in the relation. Therefore, all keys can be brute force enumerated by checking the closure of each combination in the power set. Again, this is only possible when the relation is known to have a key; otherwise, knowing the closure contains all columns is not a sufficient condition to identify a key, because of the possibility of duplicate rows.

In practice, it is never necessary to enumerate all possible keys (fortunate, since there can be O(2**N) of them), since the vast majority of them turn out to have redundant columns that can be functionally determined from other columns in the key. Of more value is the set of "candidate keys", which are keys that contain no redundant columns. Removing any column from such a key causes it to longer be a key. It is possible to enumerate the set of candidate keys in polynomial rather than exponential time (see Wikipedia "Candidate key" entry).

However, since even polynomial time can be problematic, this library tries to avoid enumerating keys by storing and maintaining a single candidate key for the relation. And while it is not always successful, the library tries to keep the candidate key that has the fewest number of columns. In most cases, this single key is enough to satisfy the requirements of the optimizer. But when it is not enough, or the existing key is no longer valid, then a new candidate key can always be generated.

It turns out that the most common key-related question that must be answered is not "what are the list of keys for this relation?", but instead, "does this set of columns contain a key for the relation?". The latter question can be easily answered by computing the closure of the columns, and checking whether the closure contains the key maintained by FuncDepSet. And when a relatively short key is needed (e.g. during decorrelation), FuncDepSet has one ready to go.

FD sets encode "equivalent columns", which are pairs of columns that always have equal values using the SQL equality operator with NULL= semantics. Two columns a and b are equivalent if the following expression returns true:

((a = b) OR (a IS NULL AND b IS NULL)) IS True

Equivalent columns are typically derived from a Select filter condition, and are represented as two FDs with each column acting as both determinant and dependant:

SELECT * FROM t WHERE b=c (a)-->(b,c) (b)~~>(a,c) (b)==(c) (c)==(b)

In the common case shown above, the WHERE clause rejects NULL values, so the equivalency will always be strict, which means it retains all the same properties of a strict dependency. While lax equivalencies are theoretically possible, the library currently maps them into regular lax dependencies to simplify implementation.

For a more rigorous examination of functional dependencies and their interaction with various SQL operators, see the following Master's Thesis:

Norman Paulley, Glenn. (2000). Exploiting Functional Dependence in Query Optimization. https://cs.uwaterloo.ca/research/tr/2000/11/CS-2000-11.thesis.pdf

While this paper served as the inspiration for this library, a number of details differ, including (but not limited to):

1. Most importantly, the definition of "lax" used in the paper differs from the definition used by this library. For a lax dependency A~~>B, the paper allows this set of rows: a b ------- 1 1 1 NULL This library disallows that, since it requires that if the determinant of a lax dependency is not-null, then it is equivalent to a strict dependency. This alternate definition is briefly covered in section 2.5.3.2 of the paper (see definition 2.19). The reason for this change is to allow a lax dependency to be upgraded to a strict dependency more readily, needing only the determinant columns to be not-null rather than both determinant and dependant columns. 2. The paper simplifies FD sets so that dependants never contain more than one column. This library allows multiple dependent columns, since they can be so efficiently stored and processed as ColSets. 3. The paper deliberately avoids all simplifications when a SQL operator adds new FDs to an existing FD set, in order to avoid unneeded work and expensive reductions. This library performs quite a few simplifications in order to keep the FD set more manageable and understandable. 4. The paper "colors" columns black when they are no longer part of a derived relation. Rather than marking removed columns, this library actually removes them from the FD set. 5. In order to ensure a unique key for every relation, the paper uses a special "tuple identifier" that acts like a virtual column and can be both a determinant and a dependant. If the transitive closure of any set of columns includes the tuple identifier column, then that set of columns is a super key for the relation. As described in the Keys section above, this library takes a simplified approach so that it doesn't need to allocate virtual columns in property derivation code.

❖

func (f *FuncDepSet) AddConstants(cols opt.ColSet)

AddConstants adds a strict FD to the set that declares the given column as having the same constant value for all rows. If the column is nullable, then its value may be NULL, but then the column must be NULL for all rows. For column "a", the FD looks like this:

()-->(a)

Since it is a constant, any set of determinant columns (including the empty set) trivially determines the value of "a".

❖

func (f *FuncDepSet) AddEquivFrom(fdset *FuncDepSet)

AddEquivFrom is similar to AddFrom, except that it only adds equivalence dependencies from the given set to this set.

❖

func (f *FuncDepSet) AddEquivalency(a, b opt.ColumnID)

AddEquivalency adds two FDs to the set that establish a strict equivalence between the given columns. Either "a" equals "b" according to SQL equality semantics, or else "a" is NULL and "b" is NULL. The following FDs are created in the set:

(a)==(b) (b)==(a)

❖

func (f *FuncDepSet) AddFrom(fdset *FuncDepSet)

AddFrom merges two FD sets by adding each FD from the given set to this set. While this requires O(N**2) time, it's useful when the two FD sets may overlap one another and substantial simplifications are possible (as with IndexJoin). It is up to the caller to ensure that the two FD sets are "compatible", meaning that they operate on the same relations, with the same keys, same columns, etc.

❖

func (f *FuncDepSet) AddLaxKey(keyCols, allCols opt.ColSet)

AddLaxKey is similar to AddStrictKey, except that it creates a lax FD rather than a strict FD. This means that two rows with NULL key values might not have the same values in other non-key columns. For key columns (a,b) and relation columns (a,b,c,d), and FD like this is created:

(a,b)~~>(c,d)

❖

func (f *FuncDepSet) AddStrictKey(keyCols, allCols opt.ColSet)

AddStrictKey adds an FD for a new key. The given key columns are reduced to a candidate key, and that becomes the determinant for the allCols column set. The resulting FD is strict, meaning that a NULL key value always maps to the same set of values in the rest of the relation's columns. For key columns (a,b) and relation columns (a,b,c,d), an FD like this is created:

(a,b)-->(c,d)

If the resulting candidate key has fewer columns than the current key, then the new key is adopted in its place.

AddSynthesizedCol adds an FD to the set that is derived from a synthesized column in a projection list. The synthesized column is often derived from other columns, in which case AddSynthesizedCol creates a new FD like this:

(a,b)-->(c)

Or it may be a constant column, like this:

()-->(c)

❖

func (f *FuncDepSet) ColSet() opt.ColSet

ColSet returns all columns referenced by the FD set.

❖

func (f *FuncDepSet) ColsAreLaxKey(cols opt.ColSet) bool

ColsAreLaxKey returns true if the given columns contain a lax key for the relation. This means that any two rows in the relation will never have the same values for this set of columns, except potentially in the case where at least one of the columns is NULL. For example, (a,b) is a lax key for the following relation, but (a) is not (because there are multiple rows where a=1):

a b c ---------------- NULL NULL NULL NULL NULL 1 NULL NULL 2 NULL 1 1 NULL 1 2 1 NULL 1 1 NULL 2 1 1 1

❖

func (f *FuncDepSet) ColsAreStrictKey(cols opt.ColSet) bool

ColsAreStrictKey returns true if the given columns contain a strict key for the relation. This means that any two rows in the relation will never have the same values for this set of columns. If the columns are nullable, then at most one row could have NULL values for all of the columns. For example, (a,b) is a strict key for the following relation, but (a) is not (because there are multiple rows where a=1 and a=NULL):

a b c ---------------- NULL NULL NULL NULL 1 1 1 NULL 1 1 1 1

ComputeClosure returns the strict closure of the given columns. The closure includes the input columns plus all columns that are functionally dependent on those columns, either directly or indirectly. Consider this set of FD's:

(a)-->(b,c,d) (b,c,e)-->(f) (d)-->(e)

The strict closure of (a) is (a,b,c,d,e,f), because (a) determines all other columns. Therefore, if two rows have the same value for (a), then the rows will be duplicates, since all other columns will be equal.

ComputeEquivClosure returns the equivalence closure of the given columns. The closure includes the input columns plus all columns that are equivalent to any of these columns, either directly or indirectly. For example:

(a)==(b) (b)==(c) (a)==(d)

The equivalence closure for (a) is (a,b,c,d) because (a) is transitively equivalent to all other columns. Therefore, all columns must have equal non-NULL values, or else all must be NULL (see definition for NULL= in the comment for FuncDepSet).

ComputeEquivGroup returns the group of columns that are equivalent to the given column. See ComputeEquivClosure for more details.

❖

func (f *FuncDepSet) ConstantCols() opt.ColSet

ConstantCols returns the set of columns that will always have the same value for all rows in the relation.

❖

func (f *FuncDepSet) CopyFrom(fdset *FuncDepSet)

CopyFrom copies the given FD into this FD, replacing any existing data.

❖

func (f *FuncDepSet) Empty() bool

Empty is true if the set contains no FDs and no key.

❖

func (f *FuncDepSet) EquivReps() opt.ColSet

EquivReps returns one "representative" column from each equivalency group in the FD set. ComputeEquivGroup can be called to obtain the remaining columns from each equivalency group.

❖

func (f *FuncDepSet) HasMax1Row() bool

HasMax1Row returns true if the relation has zero or one rows.

❖

func (f *FuncDepSet) InClosureOf(cols, in opt.ColSet) bool

InClosureOf returns true if the given columns are functionally determined by the "in" column set.

❖

func (f *FuncDepSet) LaxKey() (_ opt.ColSet, ok bool)

LaxKey returns a lax key for the relation, if there is one. Note that strict keys are implicitly also lax keys, so if the relation has a strict key, this returns the same key as StrictKey(). A best effort is made to return a lax key that has the fewest columns.

❖

func (f *FuncDepSet) MakeApply(inner *FuncDepSet)

MakeApply modifies the FD set to reflect the impact of an apply join. This FD set reflects the properties of the outer query, and the given FD set reflects the properties of the inner query. Constant FDs from inner set no longer hold and some other dependencies need to be augmented in order to be valid for the apply join operator. Consider this example:

SELECT * FROM a INNER JOIN LATERAL (SELECT * FROM b WHERE b.y=a.y) ON True

1. The constant dependency created from the outer column reference b.y=a.y

does not hold for the Apply operator, since b.y is no longer constant at this level. In general, constant dependencies cannot be retained, because they may have been generated from outer column equivalencies.

2. If a strict dependency (b.x,b.y)-->(b.z) held, it would have been reduced

to (b.x)-->(b.z) because (b.y) is constant in the inner query. However, (b.x)-->(b.z) does not hold for the Apply operator, since (b.y) is not constant in that case. However, the dependency *does* hold as long as its determinant is augmented by the left input's key columns (if key exists).

3. Lax dependencies follow the same rules as #2. 4. Equivalence dependencies in the inner query still hold for the Apply

operator.

5. If both the outer and inner inputs of the apply join have keys, then the

concatenation of those keys is a key on the apply join result.

❖

func (f *FuncDepSet) MakeFullOuter(leftCols, rightCols, notNullInputCols opt.ColSet)

MakeFullOuter modifies the cartesian product FD set to reflect the impact of adding NULL-extended rows to the results of an inner join. An inner join can be modeled as a cartesian product + ON filtering, and an outer join is modeled as an inner join + union of NULL-extended rows. MakeFullOuter performs the final step for a full join, given the set of columns on each side, as well as the set of input columns from both sides of the join that are not null.

❖

func (f *FuncDepSet) MakeLeftOuter( leftFDs, filtersFDs *FuncDepSet, leftCols, rightCols, notNullInputCols opt.ColSet, )

MakeLeftOuter modifies the cartesian product FD set to reflect the impact of adding NULL-extended rows to the results of an inner join. An inner join can be modeled as a cartesian product + ON filtering, and an outer join is modeled as an inner join + union of NULL-extended rows. MakeLeftOuter enacts the filtering and null-extension of the cartesian product. If it is possible to prove that there is a key over the join result that consists only of columns from the left input, that key will be used.

This same logic applies for right joins as well (by reversing sides).

See the "Left outer join" section on page 84 of the Master's Thesis for the impact of outer joins on FDs.

❖

func (f *FuncDepSet) MakeMax1Row(cols opt.ColSet)

MakeMax1Row initializes the FD set for a relation containing either zero or one rows, and with the given columns. In this special case, the value of every column is trivially considered a constant, and the key is the empty set, because no columns are required to ensure uniqueness of rows. This special case may seem trivial, but it is quite important to detect during optimization. For a relation with columns (a, b), the following FD is created in the set:

()-->(a,b)

❖

func (f *FuncDepSet) MakeNotNull(notNullCols opt.ColSet)

MakeNotNull modifies the FD set based on which columns cannot contain NULL values. This often allows upgrading lax dependencies to strict dependencies, and lax keys to strict keys.

Note: this function should be called with all known null columns; it won't do as good of a job if it's called multiple times with different subsets.

❖

func (f *FuncDepSet) MakeProduct(inner *FuncDepSet)

MakeProduct modifies the FD set to reflect the impact of a cartesian product operation between this set and the given set. The result is a union of the FDs from each set, as well as a union of their keys. The two FD sets are expected to operate on disjoint columns, so the FDs from each are simply concatenated, rather than simplified via calls to addDependency (except for case of constant columns).

❖

func (f *FuncDepSet) ProjectCols(cols opt.ColSet)

ProjectCols removes all columns that are not in the given set. It does this by replacing any un-projected dependants by their closures, and then removing all FDs containing un-projected columns. While this algorithm may cause some loss of information in edge cases, it does a good job of preserving the most important dependency information.

ReduceCols removes redundant columns from the given set. Redundant columns can be functionally determined from the remaining columns. If the columns contain a key for the relation, then the reduced columns will form a candidate key for the relation.

The reduction algorithm removes one column at a time (in an arbitrary order), and then tests to see if the closure still includes the removed column. If so, then the column is redundant. This algorithm has decent running time, but will not necessarily find the candidate key with the fewest columns.

❖

func (f *FuncDepSet) StrictKey() (_ opt.ColSet, ok bool)

StrictKey returns a strict key for the relation, if there is one. A best effort is made to return a candidate key that has the fewest columns.

❖

func (f FuncDepSet) String() string

❖

func (f FuncDepSet) StringOnlyFDs() string

StringOnlyFDs returns a string representation of the FDs (without the key information).

❖

func (f *FuncDepSet) Verify()

Verify runs consistency checks against the FD set, in order to ensure that it conforms to several invariants:

1. An FD determinant should not intersect its dependants. 2. If a constant FD is present, it's the first FD in the set. 3. A constant FD must be strict. 4. Lax equivalencies should be reduced to lax dependencies. 5. Equivalence determinant should be exactly one column. 6. The dependants of an equivalence is always its closure. 7. If FD set has a key, it should be a candidate key (already reduced). 8. Closure of key should include all known columns in the FD set. 9. If FD set has no key then key columns should be empty.

❖

```
type Histogram struct {
// contains filtered or unexported fields
}
```

Histogram captures the distribution of values for a particular column within a relational expression. Histograms are immutable.

ApplySelectivity reduces the size of each histogram bucket according to the given selectivity, and returns a new histogram with the results.

❖

func (h *Histogram) Bucket(i int) *cat.HistogramBucket

Bucket returns a pointer to the ith bucket in the histogram. i must be greater than or equal to 0 and less than BucketCount.

BucketCount returns the number of buckets in the histogram.

❖

func (h *Histogram) CanFilter(c *constraint.Constraint) (colOffset, exactPrefix int, ok bool)

CanFilter returns true if the given constraint can filter the histogram. This is the case if the histogram column matches one of the columns in the exact prefix of c or the next column immediately after the exact prefix. Returns the offset of the matching column in the constraint if found, as well as the exact prefix.

DistinctValuesCount returns the estimated number of distinct values in the histogram.

❖

func (h *Histogram) Filter(c *constraint.Constraint) *Histogram

Filter filters the histogram according to the given constraint, and returns a new histogram with the results. CanFilter should be called first to validate that c can filter the histogram.

❖

func (h *Histogram) Init( evalCtx *tree.EvalContext, col opt.ColumnID, buckets []cat.HistogramBucket, )

Init initializes the histogram with data from the catalog.

ValuesCount returns the total number of values in the histogram. It can be used to estimate the selectivity of a predicate by comparing the values count before and after calling Filter on the histogram.

❖

type JoinMultiplicity struct { // LeftMultiplicity and RightMultiplicity describe how the left and right // input rows respectively will be affected by the join operator. // As an example, using the query from above: // // SELECT * FROM xy FULL JOIN uv ON x=u; // // MultiplicityNotDuplicatedVal: both LeftMultiplicity and RightMultiplicity // would set the not-duplicated flag because the equality is between key // columns, which means that no row can match more than once. // // MultiplicityPreservedVal: both fields would set the preserved flag because // the FullJoin will add back any rows that don't match on the filter // conditions. LeftMultiplicity MultiplicityValue RightMultiplicity MultiplicityValue }

JoinMultiplicity answers queries about how a join will affect the rows from its inputs. Left and right input rows can be duplicated and/or filtered by the join. As an example:

CREATE TABLE xy (x INT PRIMARY KEY, y INT); CREATE TABLE uv (u INT PRIMARY KEY, v INT); SELECT * FROM xy FULL JOIN uv ON x=u;

1. Are rows from xy or uv being duplicated by the join? 2. Are any rows being filtered from the join output?

A JoinMultiplicity constructed for the join is able to answer either of the above questions by checking one of the MultiplicityValue bit flags. The not-duplicated and preserved flags are always unset for a join unless it can be statically proven that no rows from the given input will be duplicated or filtered respectively. As an example, take the following query:

SELECT * FROM xy INNER JOIN uv ON y = v;

At execution time, it may be that every row from xy will be included in the join output exactly once. However, since this cannot be proven before runtime, the duplicated and filtered flags must be set.

After initial construction by multiplicity_builder.go, JoinMultiplicity should be considered immutable.

❖

func (mp *JoinMultiplicity) JoinDoesNotDuplicateLeftRows() bool

JoinDoesNotDuplicateLeftRows returns true when rows from the left input will not be included in the join output more than once.

❖

func (mp *JoinMultiplicity) JoinDoesNotDuplicateRightRows() bool

JoinDoesNotDuplicateRightRows returns true when rows from the right input will not be included in the join output more than once.

❖

func (mp *JoinMultiplicity) JoinPreservesLeftRows() bool

JoinPreservesLeftRows returns true when all rows from the left input are guaranteed to be included in the join output.

❖

func (mp *JoinMultiplicity) JoinPreservesRightRows() bool

JoinPreservesRightRows returns true when all rows from the right input are guaranteed to be included in the join output.

❖

func (mp *JoinMultiplicity) String() string

String returns a formatted string containing flags for the left and right inputs that indicate how many times any given input row can be guaranteed to show up in the join output.

MultiplicityValue is a bitfield that describes whether a join duplicates and/or filters rows from a particular input.

❖

const ( // MultiplicityIndeterminateVal indicates that no guarantees can be made about // the effect the join will have on its input rows. MultiplicityIndeterminateVal MultiplicityValue = 0 // MultiplicityNotDuplicatedVal indicates that the join will not include input // rows in its output more than once. MultiplicityNotDuplicatedVal MultiplicityValue = 1 << (iota - 1) // MultiplicityPreservedVal indicates that the join will include all input // rows in its output. MultiplicityPreservedVal )

❖

type Relational struct { Shared // OutputCols is the set of columns that can be projected by the expression. // Ordering, naming, and duplication of columns is not representable by this // property; those are physical properties. OutputCols opt.ColSet // NotNullCols is the subset of output columns which cannot be NULL. The // nullability of columns flows from the inputs and can also be derived from // filters that reject nulls. NotNullCols opt.ColSet // Cardinality is the number of rows that can be returned from this relational // expression. The number of rows will always be between the inclusive Min and // Max bounds. If Max=math.MaxUint32, then there is no limit to the number of // rows returned by the expression. Cardinality Cardinality // FuncDepSet is a set of functional dependencies (FDs) that encode useful // relationships between columns in a base or derived relation. Given two sets // of columns A and B, a functional dependency A-->B holds if A uniquely // determines B. In other words, if two different rows have equal values for // columns in A, then those two rows will also have equal values for columns // in B. For example: // // a1 a2 b1 // -------- // 1 2 5 // 1 2 5 // // FDs assist the optimizer in proving useful properties about query results. // This information powers many optimizations, including eliminating // unnecessary DISTINCT operators, simplifying ORDER BY columns, removing // Max1Row operators, and mapping semi-joins to inner-joins. // // The methods that are most useful for optimizations are: // Key: extract a candidate key for the relation // ColsAreStrictKey: determine if a set of columns uniquely identify rows // ReduceCols: discard redundant columns to create a candidate key // // For more details, see the header comment for FuncDepSet. FuncDeps FuncDepSet // Stats is the set of statistics that apply to this relational expression. // See statistics.go and memo/statistics_builder.go for more details. Stats Statistics // Rule encapsulates the set of properties that are maintained to assist // with specific sets of transformation rules. They are not intended to be // general purpose in nature. Typically, they're used by rules which need to // decide whether to push operators down into the tree. These properties // "bubble up" information about the subtree which can aid in that decision. // // Whereas the other logical relational properties are filled in by the memo // package upon creation of a new memo group, the rules properties are filled // in by one of the transformation packages, since deriving rule properties // is so closely tied with maintenance of the rules that depend upon them. // For example, the PruneCols set is connected to the PruneCols normalization // rules. The decision about which columns to add to PruneCols depends upon // what works best for those rules. Neither the rules nor their properties // can be considered in isolation, without considering the other. Rule struct { // Available contains bits that indicate whether lazily-populated Rule // properties have been initialized. For example, if the UnfilteredCols // bit is set, then the Rule.UnfilteredCols field has been initialized // and is ready for use. Available AvailableRuleProps // PruneCols is the subset of output columns that can potentially be // eliminated by one of the PruneCols normalization rules. Those rules // operate by pushing a Project operator down the tree that discards // unused columns. For example: // // SELECT y FROM xyz WHERE x=1 ORDER BY y LIMIT 1 // // The z column is never referenced, either by the filter or by the // limit, and would be part of the PruneCols set for the Limit operator. // The final Project operator could then push down a pruning Project // operator that eliminated the z column from its subtree. // // PruneCols is built bottom-up. It typically starts out containing the // complete set of output columns in a leaf expression, but quickly // empties out at higher levels of the expression tree as the columns // are referenced. Drawing from the example above: // // Limit PruneCols : [z] // Select PruneCols: [y, z] // Scan PruneCols : [x, y, z] // // Only a small number of relational operators are capable of pruning // columns (e.g. Scan, Project). A pruning Project operator pushed down // the tree must journey downwards until it finds a pruning-capable // operator. If a column is part of PruneCols, then it is guaranteed that // such an operator exists at the end of the journey. Operators that are // not capable of filtering columns (like Explain) will not add any of // their columns to this set. // // PruneCols is lazily populated by rules in prune_cols.opt. It is // only valid once the Rule.Available.PruneCols bit has been set. PruneCols opt.ColSet // RejectNullCols is the subset of nullable output columns that can // potentially be made not-null by one of the RejectNull normalization // rules. Those rules work in concert with the predicate pushdown rules // to synthesize a "col IS NOT NULL" filter and push it down the tree. // See the header comments for the reject_nulls.opt file for more // information and an example. // // RejectNullCols is built bottom-up by rulePropsBuilder, and only contains // nullable outer join columns that can be simplified. The columns can be // propagated up through multiple operators, giving higher levels of the // tree a window into the structure of the tree several layers down. In // particular, the null rejection rules use this property to determine when // it's advantageous to synthesize a new "IS NOT NULL" filter. Without this // information, the rules can clutter the tree with extraneous and // marginally useful null filters. // // RejectNullCols is lazily populated by rules in reject_nulls.opt. It is // only valid once the Rule.Available.RejectNullCols bit has been set. RejectNullCols opt.ColSet // InterestingOrderings is a list of orderings that potentially could be // provided by the operator without sorting. Interesting orderings normally // come from scans (index orders) and are bubbled up through some operators. // // Note that all prefixes of an interesting order are "interesting"; the // list doesn't need to contain orderings that are prefixes of some other // ordering in the list. // // InterestingOrderings is lazily populated by interesting_orderings.go. // It is only valid once the Rule.Available.InterestingOrderings bit has // been set. InterestingOrderings opt.OrderingSet // UnfilteredCols is the set of all columns for which rows from their base // table are guaranteed not to have been filtered. Rows may be duplicated, // but no rows can be missing. Even columns which are not output columns are // included as long as table rows are guaranteed not filtered. For example, // an unconstrained, unlimited Scan operator can add all columns from its // table to this property, but a Select operator cannot add any columns, as // it may have filtered rows. // // UnfilteredCols is lazily populated by GetJoinMultiplicityFromInputs. It // is only valid once the Rule.Available.UnfilteredCols bit has been set. UnfilteredCols opt.ColSet // JoinSize is the number of relations being *inner* joined underneath // this node. It is used to only reorder joins via AssociateJoin up to // a certain limit. JoinSize int } }

Relational properties describe the content and characteristics of relational data returned by all expression variants within a memo group. While each expression in the group may return rows or columns in a different order, or compute the result using different algorithms, the same set of data is returned and can then be transformed into whatever layout or presentation format that is desired, according to the required physical properties.

❖

func (r *Relational) IsAvailable(p AvailableRuleProps) bool

IsAvailable returns true if the specified rule property has been populated on this relational properties instance.

❖

func (r *Relational) SetAvailable(p AvailableRuleProps)

SetAvailable sets the available bits for the given properties, in order to mark them as populated on this relational properties instance.

❖

type Scalar struct { Shared // Constraints is the set of constraints deduced from a boolean expression. // For the expression to be true, all constraints in the set must be // satisfied. Constraints *constraint.Set // TightConstraints is true if the expression is exactly equivalent to the // constraints. If it is false, the constraints are weaker than the // expression. TightConstraints bool // FuncDeps is a set of functional dependencies (FDs) inferred from a // boolean expression. This field is only populated for Filters expressions. // // - Constant column FDs such as ()-->(1,2) from conjuncts such as // x = 5 AND y = 10. // - Equivalent column FDs such as (1)==(2), (2)==(1) from conjuncts such // as x = y. // // It is useful to calculate FDs on Filters expressions, because it allows // additional filters to be inferred for push-down. For example, consider // the query: // // SELECT * FROM a, b WHERE a.x = b.x AND a.x > 5; // // By adding the equivalency FD for a.x = b.x, we can infer an additional // filter, b.x > 5. This allows us to rewrite the query as: // // SELECT * FROM (SELECT * FROM a WHERE a.x > 5) AS a, // (SELECT * FROM b WHERE b.x > 5) AS b WHERE a.x = b.x; // // For more details, see the header comment for FuncDepSet. FuncDeps FuncDepSet // Rule encapsulates the set of properties that are maintained to assist // with specific sets of transformation rules. See the Relational.Rule // comment for more details. Rule struct { // Available contains bits that indicate whether lazily-populated Rule // properties have been initialized. For example, if the // HasHoistableSubquery bit is set, then the Rule.HasHoistableSubquery // field has been initialized and is ready for use. Available AvailableRuleProps // HasHoistableSubquery is true if the scalar expression tree contains a // subquery having one or more outer columns, and if the subquery needs // to be hoisted up into its parent query as part of query decorrelation. // The subquery can be a Subquery, Exists, or Any operator. These operators // need to be hoisted out of scalar expression trees and turned into top- // level apply joins. This property makes detection fast and easy so that // the hoister doesn't waste time searching subtrees that don't contain // subqueries. // // HasHoistableSubquery is lazily populated by rules in decorrelate.opt. // It is only valid once the Rule.Available.HasHoistableSubquery bit has // been set. HasHoistableSubquery bool } }

Scalar properties are logical properties that are computed for scalar expressions that return primitive-valued types. Scalar properties are lazily populated on request.

❖

func (s *Scalar) IsAvailable(p AvailableRuleProps) bool

IsAvailable returns true if the specified rule property has been populated on this scalar properties instance.

❖

func (s *Scalar) SetAvailable(p AvailableRuleProps)

SetAvailable sets the available bits for the given properties, in order to mark them as populated on this scalar properties instance.

❖

type Shared struct { // Populated is set to true once the properties have been built for the // operator. bool // OuterCols is the set of columns that are referenced by variables within // this sub-expression, but are not bound within the scope of the expression. // For example: // // SELECT * // FROM a // WHERE EXISTS(SELECT * FROM b WHERE b.x = a.x AND b.y = 5) // // For the EXISTS expression, a.x is an outer column, meaning that it is // defined "outside" the EXISTS expression (hence the name "outer"). The // SELECT expression binds the b.x and b.y references, so they are not part // of the outer column set. The outer SELECT binds the a.x column, and so // its outer column set is empty. // // Note that what constitutes an "outer column" is dependent on an // expression's location in the query. For example, while the b.x and b.y // columns are not outer columns on the EXISTS expression, they *are* outer // columns on the inner WHERE condition. opt.ColSet // HasSubquery is true if the subtree rooted at this node contains a subquery. // The subquery can be a Subquery, Exists, Any, or ArrayFlatten expression. // Subqueries are the only place where a relational node can be nested within a // scalar expression. bool // HasCorrelatedSubquery is true if the scalar expression tree contains a // subquery having one or more outer columns. The subquery can be a Subquery, // Exists, or Any operator. These operators usually need to be hoisted out of // scalar expression trees and turned into top-level apply joins. This // property makes detection fast and easy so that the hoister doesn't waste // time searching subtrees that don't contain subqueries. bool // VolatilitySet contains the set of volatilities contained in the expression. VolatilitySet // CanMutate is true if the subtree rooted at this expression contains at // least one operator that modifies schema (like CreateTable) or writes or // deletes rows (like Insert). bool // HasPlaceholder is true if the subtree rooted at this expression contains // at least one Placeholder operator. bool // Rule props are lazily calculated and typically only apply to a single // rule. See the comment above Relational.Rule for more details. struct { // WithUses tracks information about the WithScans inside the given // expression which reference WithIDs outside of that expression. WithUses WithUsesMap } }

Shared are properties that are shared by both relational and scalar expressions.

❖

type Statistics struct { // Available indicates whether the underlying table statistics for this // expression were available. If true, RowCount contains a real estimate. // If false, RowCount does not represent reality, and should only be used // for relative cost comparison. Available bool // RowCount is the estimated number of rows returned by the expression. // Note that - especially when there are no stats available - the scaling of // the row counts can be unpredictable; thus, a row count of 0.001 should be // considered 1000 times better than a row count of 1, even though if this was // a true row count they would be pretty much the same thing. RowCount float64 // ColStats is a collection of statistics that pertain to columns in an // expression or table. It is keyed by a set of one or more columns over which // the statistic is defined. ColStats ColStatsMap // Selectivity is a value between 0 and 1 representing the estimated // reduction in number of rows for the top-level operator in this // expression. Selectivity float64 }

Statistics is a collection of measurements and statistics that is used by the coster to estimate the cost of expressions. Statistics are collected for tables and indexes and are exposed to the optimizer via cat.Catalog interfaces.

As logical properties are derived bottom-up for each expression, the estimated row count is derived bottom-up for each relational expression. The column statistics (stored in ColStats and MultiColStats) are derived lazily, and only as needed to determine the row count for the current expression or a parent expression. For example:

SELECT y FROM a WHERE x=1

The only column that affects the row count of this query is x, since the distribution of values in x is what determines the selectivity of the predicate. As a result, column statistics will be derived for column x but not for column y.

See memo/statistics_builder.go for more information about how statistics are calculated.

❖

func (s *Statistics) ApplySelectivity(selectivity float64)

ApplySelectivity applies a given selectivity to the statistics. RowCount and Selectivity are updated. Note that DistinctCounts, NullCounts, and Histograms are not updated. See ColumnStatistic.ApplySelectivity for updating distinct counts, null counts, and histograms.

❖

func (s *Statistics) CopyFrom(other *Statistics)

CopyFrom copies a Statistics object which can then be modified independently.

❖

func (s *Statistics) Init(relProps *Relational) (zeroCardinality bool)

Init initializes the data members of Statistics.

❖

func (s *Statistics) String() string

❖

func (s *Statistics) UnionWith(other *Statistics)

UnionWith unions this Statistics object with another Statistics object. It updates the RowCount and Selectivity, and represents the result of unioning two relational expressions with the given statistics. Note that DistinctCounts, NullCounts, and Histograms are not updated.

VolatilitySet tracks the set of operator volatilities contained inside an expression. See tree.Volatility for more info on volatility values.

The reason why we use a set (rather than the "maximum" volatility) is that for plan caching purposes, we want to distinguish the case when a stable operator is used - regardless of whether a volatile operator is used. For example, consider these two statements:

(1) INSERT INTO t VALUES (gen_random_uuid(), '2020-10-09') (2) INSERT INTO t VALUES (gen_random_uuid(), now())

For (1) we can cache the final optimized plan. For (2), we can only cache the memo if we don't constant fold stable operators, and subsequently fold them each time we try to execute an instance of the query.

The optimizer makes *only* the following side-effect related guarantees:

1. CASE/IF branches are only evaluated if the branch condition is true or if all operators are LeakProof. Therefore, the following is guaranteed to never raise a divide by zero error, regardless of how cleverly the optimizer rewrites the expression: CASE WHEN divisor<>0 THEN dividend / divisor ELSE NULL END While this example is trivial, a more complex example might have correlated subqueries that cannot be hoisted outside the CASE expression in the usual way, since that would trigger premature evaluation. 2. Volatile expressions are never treated as constant expressions, even though they do not depend on other columns in the query: SELECT * FROM xy ORDER BY random() If the random() expression were treated as a constant, then the ORDER BY could be dropped by the optimizer, since ordering by a constant is a no-op. Instead, the optimizer treats it like it would an expression that depends upon a column. 3. A common table expression (CTE) containing Volatile operators will only be evaluated one time. This will typically prevent inlining of the CTE into the query body. For example: WITH a AS (INSERT ... RETURNING ...) SELECT * FROM a, a Although the "a" CTE is referenced twice, it must be evaluated only one time (and its results cached to satisfy the second reference).

As long as the optimizer provides these guarantees, it is free to rewrite, reorder, duplicate, and eliminate as if no side effects were present. As an example, the optimizer is free to eliminate the unused "nextval" column in this query:

SELECT x FROM (SELECT nextval(seq), x FROM xy) => SELECT x FROM xy

It's also allowed to duplicate side-effecting expressions during predicate pushdown:

SELECT * FROM xy INNER JOIN xz ON xy.x=xz.x WHERE xy.x=random() => SELECT * FROM (SELECT * FROM xy WHERE xy.x=random()) INNER JOIN (SELECT * FROM xz WHERE xz.x=random()) ON xy.x=xz.x

❖

func (vs *VolatilitySet) Add(v tree.Volatility)

Add a volatility to the set.

❖

func (vs *VolatilitySet) AddImmutable()

AddImmutable is a convenience shorthand for adding VolatilityImmutable.

❖

func (vs *VolatilitySet) AddStable()

AddStable is a convenience shorthand for adding VolatilityStable.

❖

func (vs *VolatilitySet) AddVolatile()

AddVolatile is a convenience shorthand for adding VolatilityVolatile.

❖

func (vs VolatilitySet) HasStable() bool

HasStable returns true if the set contains VolatilityStable.

❖

func (vs VolatilitySet) HasVolatile() bool

HasVolatile returns true if the set contains VolatilityVolatile.

❖

func (vs VolatilitySet) IsLeakProof() bool

IsLeakProof returns true if the set is empty or only contains VolatilityLeakProof.

❖

func (vs VolatilitySet) String() string

❖

func (vs *VolatilitySet) UnionWith(other VolatilitySet)

UnionWith sets the receiver to the union of the two volatility sets.

❖

type WithUseInfo struct { // Count is the number of WithScan operators which reference this WithID. Count int // UsedCols is the union of columns used by all WithScan operators which // reference this WithID. UsedCols opt.ColSet }

WithUseInfo contains information about the usage of a specific WithID.

❖

type WithUsesMap map[opt.WithID]WithUseInfo

WithUsesMap stores information about each WithScan referencing an outside WithID, grouped by each WithID.

Path | Synopsis |
---|---|

physical |

Package props imports 14 packages (graph) and is imported by 16 packages. Updated 2020-07-11. Refresh now. Tools for package owners.