cockroach: github.com/cockroachdb/cockroach/pkg/storage

package storage

import "github.com/cockroachdb/cockroach/pkg/storage"

Package storage provides access to the Store and Range abstractions. Each Cockroach node handles one or more stores, each of which multiplexes to one or more ranges, identified by [start, end) keys. Ranges are contiguous regions of the keyspace. Each range implements an instance of the Raft consensus algorithm to synchronize participating range replicas.

Each store is represented by a single engine.Engine instance. The ranges hosted by a store all have access to the same engine, but write to only a range-limited keyspace within it. Ranges access the underlying engine via the MVCC interface, which provides historical versioned values.

Code:

stopper := stop.NewStopper()
defer stopper.Stop(context.TODO())

st := cluster.MakeTestingClusterSettings()
clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)

// Model a set of stores in a cluster,
// adding / rebalancing ranges of random sizes.
rpcContext := rpc.NewContext(
    log.AmbientContext{Tracer: st.Tracer},
    &base.Config{Insecure: true},
    clock,
    stopper,
    st,
)
server := rpc.NewServer(rpcContext) // never started
g := gossip.NewTest(1, rpcContext, server, stopper, metric.NewRegistry(), config.DefaultZoneConfigRef())

TimeUntilStoreDead.Override(&st.SV, TestTimeUntilStoreDeadOff)

const generations = 100
const nodes = 20
const printGenerations = generations / 2

// Deterministic must be set as this test is comparing the exact output
// after each rebalance.
sp := NewStorePool(
    log.AmbientContext{Tracer: st.Tracer},
    st,
    g,
    clock,
    func() int {
        return nodes
    },
    newMockNodeLiveness(storagepb.NodeLivenessStatus_LIVE).nodeLivenessFunc,
    /* deterministic */ true,
)
alloc := MakeAllocator(sp, func(string) (time.Duration, bool) {
    return 0, false
})

var wg sync.WaitGroup
g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix),
    func(_ string, _ roachpb.Value) { wg.Done() },
    // Redundant callbacks are required by this test.
    gossip.Redundant)

// Initialize testStores.
var testStores [nodes]testStore
for i := 0; i < len(testStores); i++ {
    testStores[i].immediateCompaction = true
    testStores[i].StoreID = roachpb.StoreID(i)
    testStores[i].Node = roachpb.NodeDescriptor{NodeID: roachpb.NodeID(i)}
    testStores[i].Capacity = roachpb.StoreCapacity{Capacity: 1 << 30, Available: 1 << 30}
}
// Initialize the cluster with a single range.
testStores[0].add(alloc.randGen.Int63n(1 << 20))

table := tablewriter.NewWriter(os.Stdout)
table.SetAutoFormatHeaders(false)
table.SetAlignment(tablewriter.ALIGN_RIGHT)

header := make([]string, len(testStores)+1)
header[0] = "gen"
for i := 0; i < len(testStores); i++ {
    header[i+1] = fmt.Sprintf("store %d", i)
}
table.SetHeader(header)

for i := 0; i < generations; i++ {
    // First loop through test stores and add data.
    wg.Add(len(testStores))
    for j := 0; j < len(testStores); j++ {
        // Add a pretend range to the testStore if there's already one.
        if testStores[j].Capacity.RangeCount > 0 {
            testStores[j].add(alloc.randGen.Int63n(1 << 20))
        }
        if err := g.AddInfoProto(gossip.MakeStoreKey(roachpb.StoreID(j)), &testStores[j].StoreDescriptor, 0); err != nil {
            panic(err)
        }
    }
    wg.Wait()

    // Next loop through test stores and maybe rebalance.
    for j := 0; j < len(testStores); j++ {
        ts := &testStores[j]
        var rangeUsageInfo RangeUsageInfo
        target, _, details, ok := alloc.RebalanceTarget(
            context.Background(),
            config.EmptyCompleteZoneConfig(),
            nil,
            firstRangeID,
            []roachpb.ReplicaDescriptor{{NodeID: ts.Node.NodeID, StoreID: ts.StoreID}},
            rangeUsageInfo,
            storeFilterThrottled,
        )
        if ok {
            log.Infof(context.TODO(), "rebalancing to %v; details: %s", target, details)
            testStores[j].rebalance(&testStores[int(target.StoreID)], alloc.randGen.Int63n(1<<20))
        }
    }

    if i%(generations/printGenerations) == 0 {
        var totalBytes int64
        for j := 0; j < len(testStores); j++ {
            totalBytes += testStores[j].Capacity.Capacity - testStores[j].Capacity.Available
        }
        row := make([]string, len(testStores)+1)
        row[0] = fmt.Sprintf("%d", i)
        for j := 0; j < len(testStores); j++ {
            ts := testStores[j]
            bytes := ts.Capacity.Capacity - ts.Capacity.Available
            row[j+1] = fmt.Sprintf("%3d %3d%%", ts.Capacity.RangeCount, (100*bytes)/totalBytes)
        }
        table.Append(row)
    }
}

var totBytes int64
var totRanges int32
for i := 0; i < len(testStores); i++ {
    totBytes += testStores[i].Capacity.Capacity - testStores[i].Capacity.Available
    totRanges += testStores[i].Capacity.RangeCount
}
table.Render()
fmt.Printf("Total bytes=%d, ranges=%d\n", totBytes, totRanges)

Output:

+-----+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
| gen | store 0  | store 1  | store 2  | store 3  | store 4  | store 5  | store 6  | store 7  | store 8  | store 9  | store 10 | store 11 | store 12 | store 13 | store 14 | store 15 | store 16 | store 17 | store 18 | store 19 |
+-----+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
|   0 |   2 100% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |
|   2 |   4 100% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |
|   4 |   6 100% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |
|   6 |   8 100% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |
|   8 |  10 100% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |
|  10 |  10  68% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   1  15% |   0   0% |   0   0% |   1  11% |   0   0% |   1   5% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |   0   0% |
|  12 |  10  30% |   2   3% |   2   9% |   2   3% |   0   0% |   0   0% |   0   0% |   1   7% |   1   5% |   0   0% |   0   0% |   1   8% |   0   0% |   1  11% |   0   0% |   1   9% |   2   6% |   2   4% |   0   0% |   0   0% |
|  14 |  10  15% |   2   0% |   2   6% |   3   2% |   2   2% |   4   8% |   2   3% |   4   7% |   2   4% |   2   0% |   2   5% |   3   7% |   3   4% |   2   7% |   2   4% |   2   6% |   2   0% |   2   1% |   2   2% |   2   7% |
|  16 |  10   9% |   4   2% |   4   5% |   5   1% |   5   5% |   5   5% |   4   4% |   5   2% |   4   5% |   4   1% |   5   8% |   5   6% |   5   5% |   4   7% |   4   3% |   4   6% |   4   2% |   5   3% |   4   2% |   5   9% |
|  18 |  10   5% |   6   3% |   6   4% |   7   3% |   7   5% |   7   5% |   6   3% |   7   2% |   7   7% |   7   3% |   7   6% |   7   6% |   7   6% |   6   6% |   6   3% |   6   5% |   6   3% |   7   3% |   6   3% |   7   8% |
|  20 |  10   4% |   9   3% |   8   4% |   9   3% |   9   5% |   9   5% |   8   4% |   9   3% |   9   6% |   9   3% |   9   6% |   9   6% |   9   6% |   8   6% |   8   4% |   8   4% |   9   5% |   9   3% |   8   4% |   9   6% |
|  22 |  12   5% |  11   3% |  10   4% |  11   3% |  11   5% |  11   6% |  10   4% |  11   3% |  11   6% |  11   3% |  11   6% |  11   6% |  11   5% |  10   5% |  10   4% |  10   4% |  11   5% |  11   2% |  10   4% |  11   6% |
|  24 |  14   5% |  13   4% |  12   4% |  13   3% |  13   5% |  13   6% |  12   4% |  13   3% |  13   7% |  13   4% |  13   6% |  13   5% |  13   5% |  12   5% |  12   4% |  12   4% |  13   5% |  13   2% |  12   4% |  13   6% |
|  26 |  16   5% |  15   4% |  14   3% |  15   3% |  15   5% |  15   5% |  14   4% |  15   3% |  15   7% |  15   4% |  15   6% |  15   5% |  15   5% |  14   5% |  14   4% |  14   4% |  15   4% |  15   3% |  14   4% |  15   6% |
|  28 |  18   5% |  17   4% |  16   4% |  17   3% |  17   6% |  17   5% |  16   4% |  17   3% |  17   6% |  17   4% |  17   6% |  17   5% |  17   5% |  16   5% |  16   5% |  16   4% |  17   4% |  17   3% |  16   4% |  17   5% |
|  30 |  20   5% |  19   4% |  18   4% |  19   3% |  19   6% |  19   5% |  18   4% |  19   4% |  19   6% |  19   4% |  19   6% |  19   4% |  19   5% |  18   5% |  18   5% |  18   4% |  19   4% |  19   3% |  18   4% |  19   5% |
|  32 |  22   5% |  21   4% |  20   4% |  21   3% |  21   5% |  21   5% |  20   4% |  21   4% |  21   6% |  21   5% |  21   6% |  21   4% |  21   5% |  20   5% |  20   5% |  20   4% |  21   4% |  21   3% |  20   4% |  21   5% |
|  34 |  24   5% |  23   4% |  22   3% |  23   3% |  23   5% |  23   5% |  22   4% |  23   4% |  23   6% |  23   5% |  23   6% |  23   5% |  23   5% |  22   5% |  22   5% |  22   4% |  23   4% |  23   3% |  22   4% |  23   5% |
|  36 |  26   4% |  25   4% |  24   4% |  25   3% |  25   5% |  25   5% |  24   4% |  25   4% |  25   6% |  25   5% |  25   5% |  25   5% |  25   5% |  24   5% |  24   5% |  24   4% |  25   4% |  25   3% |  24   4% |  25   5% |
|  38 |  28   4% |  27   4% |  26   4% |  27   3% |  27   5% |  27   5% |  26   4% |  27   4% |  27   5% |  27   5% |  27   5% |  27   5% |  27   5% |  26   5% |  26   4% |  26   4% |  27   5% |  27   3% |  26   4% |  27   5% |
|  40 |  30   4% |  29   4% |  28   4% |  29   4% |  29   5% |  29   5% |  28   4% |  29   4% |  29   5% |  29   5% |  29   5% |  29   5% |  29   6% |  28   5% |  28   4% |  28   4% |  29   5% |  29   3% |  28   4% |  29   5% |
|  42 |  32   4% |  31   4% |  30   4% |  31   3% |  31   5% |  31   5% |  30   4% |  31   4% |  31   5% |  31   5% |  31   5% |  31   5% |  31   6% |  30   5% |  30   4% |  30   4% |  31   5% |  31   3% |  30   4% |  31   5% |
|  44 |  34   4% |  33   4% |  32   4% |  33   4% |  33   5% |  33   5% |  32   4% |  33   4% |  33   5% |  33   5% |  33   5% |  33   5% |  33   5% |  32   4% |  32   4% |  32   4% |  33   5% |  33   3% |  32   5% |  33   5% |
|  46 |  36   4% |  35   4% |  34   4% |  35   4% |  35   5% |  35   5% |  34   4% |  35   4% |  35   5% |  35   5% |  35   5% |  35   5% |  35   5% |  34   4% |  34   4% |  34   4% |  35   5% |  35   3% |  34   4% |  35   5% |
|  48 |  38   4% |  37   4% |  36   4% |  37   4% |  37   5% |  37   5% |  36   4% |  37   4% |  37   5% |  37   5% |  37   5% |  37   5% |  37   5% |  36   4% |  36   4% |  36   4% |  37   5% |  37   3% |  36   4% |  37   5% |
|  50 |  40   4% |  39   4% |  38   4% |  39   4% |  39   5% |  39   5% |  38   4% |  39   4% |  39   5% |  39   5% |  39   5% |  39   4% |  39   5% |  38   4% |  38   4% |  38   4% |  39   5% |  39   3% |  38   4% |  39   5% |
|  52 |  42   4% |  41   4% |  40   4% |  41   4% |  41   5% |  41   5% |  40   4% |  41   4% |  41   5% |  41   4% |  41   5% |  41   4% |  41   5% |  40   4% |  40   5% |  40   4% |  41   5% |  41   3% |  40   4% |  41   5% |
|  54 |  44   4% |  43   4% |  42   4% |  43   4% |  43   5% |  43   5% |  42   4% |  43   4% |  43   5% |  43   4% |  43   5% |  43   4% |  43   5% |  42   4% |  42   5% |  42   4% |  43   5% |  43   3% |  42   4% |  43   5% |
|  56 |  46   4% |  45   4% |  44   4% |  45   4% |  45   5% |  45   5% |  44   4% |  45   4% |  45   5% |  45   4% |  45   5% |  45   4% |  45   5% |  44   4% |  44   5% |  44   4% |  45   5% |  45   3% |  44   5% |  45   5% |
|  58 |  48   4% |  47   4% |  46   4% |  47   4% |  47   5% |  47   5% |  46   4% |  47   4% |  47   5% |  47   4% |  47   5% |  47   4% |  47   5% |  46   4% |  46   5% |  46   4% |  47   5% |  47   3% |  46   5% |  47   5% |
|  60 |  50   4% |  49   4% |  48   4% |  49   4% |  49   5% |  49   5% |  48   4% |  49   4% |  49   5% |  49   4% |  49   5% |  49   4% |  49   5% |  48   4% |  48   5% |  48   4% |  49   5% |  49   3% |  48   5% |  49   5% |
|  62 |  52   4% |  51   4% |  50   4% |  51   4% |  51   5% |  51   5% |  50   4% |  51   4% |  51   5% |  51   4% |  51   5% |  51   4% |  51   5% |  50   4% |  50   5% |  50   4% |  51   5% |  51   3% |  50   5% |  51   5% |
|  64 |  54   5% |  53   4% |  52   4% |  53   4% |  53   5% |  53   5% |  52   4% |  53   4% |  53   5% |  53   4% |  53   5% |  53   4% |  53   5% |  52   5% |  52   5% |  52   4% |  53   5% |  53   3% |  52   5% |  53   5% |
|  66 |  56   5% |  55   4% |  54   4% |  55   4% |  55   5% |  55   5% |  54   4% |  55   4% |  55   5% |  55   4% |  55   5% |  55   4% |  55   5% |  54   5% |  54   5% |  54   4% |  55   4% |  55   3% |  54   5% |  55   5% |
|  68 |  58   5% |  57   4% |  56   4% |  57   4% |  57   5% |  57   4% |  56   4% |  57   4% |  57   5% |  57   4% |  57   5% |  57   4% |  57   5% |  56   5% |  56   5% |  56   4% |  57   4% |  57   4% |  56   5% |  57   5% |
|  70 |  60   5% |  59   4% |  58   4% |  59   4% |  59   5% |  59   5% |  58   4% |  59   4% |  59   5% |  59   5% |  59   5% |  59   4% |  59   5% |  58   5% |  58   5% |  58   4% |  59   4% |  59   4% |  58   5% |  59   5% |
|  72 |  62   5% |  61   4% |  60   4% |  61   4% |  61   5% |  61   5% |  60   5% |  61   4% |  61   5% |  61   5% |  61   5% |  61   4% |  61   5% |  60   5% |  60   5% |  60   4% |  61   4% |  61   4% |  60   5% |  61   5% |
|  74 |  64   5% |  63   4% |  62   4% |  63   4% |  63   5% |  63   4% |  62   5% |  63   4% |  63   5% |  63   4% |  63   5% |  63   5% |  63   5% |  62   5% |  62   5% |  62   4% |  63   4% |  63   4% |  62   5% |  63   5% |
|  76 |  66   4% |  65   4% |  64   4% |  65   4% |  65   5% |  65   5% |  64   4% |  65   4% |  65   5% |  65   5% |  65   5% |  65   4% |  65   5% |  64   5% |  64   5% |  64   4% |  65   4% |  65   4% |  64   5% |  65   5% |
|  78 |  68   4% |  67   4% |  66   4% |  67   4% |  67   5% |  67   5% |  66   4% |  67   4% |  67   5% |  67   5% |  67   5% |  67   5% |  67   5% |  66   5% |  66   5% |  66   4% |  67   4% |  67   4% |  66   5% |  67   5% |
|  80 |  70   4% |  69   4% |  68   4% |  69   4% |  69   5% |  69   5% |  68   4% |  69   4% |  69   5% |  69   5% |  69   5% |  69   4% |  69   5% |  68   4% |  68   5% |  68   4% |  69   4% |  69   4% |  68   5% |  69   5% |
|  82 |  72   4% |  71   4% |  70   4% |  71   4% |  71   5% |  71   4% |  70   4% |  71   4% |  71   5% |  71   5% |  71   5% |  71   4% |  71   5% |  70   4% |  70   5% |  70   4% |  71   4% |  71   4% |  70   5% |  71   5% |
|  84 |  74   4% |  73   5% |  72   4% |  73   4% |  73   5% |  73   4% |  72   4% |  73   4% |  73   5% |  73   5% |  73   4% |  73   4% |  73   5% |  72   4% |  72   5% |  72   4% |  73   4% |  73   4% |  72   5% |  73   5% |
|  86 |  76   4% |  75   5% |  74   4% |  75   4% |  75   5% |  75   5% |  74   4% |  75   4% |  75   5% |  75   5% |  75   4% |  75   4% |  75   5% |  74   4% |  74   5% |  74   4% |  75   4% |  75   4% |  74   5% |  75   5% |
|  88 |  78   4% |  77   5% |  76   4% |  77   4% |  77   5% |  77   5% |  76   5% |  77   4% |  77   5% |  77   5% |  77   4% |  77   4% |  77   5% |  76   4% |  76   5% |  76   4% |  77   4% |  77   4% |  76   5% |  77   5% |
|  90 |  80   4% |  79   5% |  78   5% |  79   4% |  79   5% |  79   5% |  78   5% |  79   4% |  79   5% |  79   4% |  79   5% |  79   4% |  79   5% |  78   4% |  78   5% |  78   4% |  79   4% |  79   4% |  78   5% |  79   5% |
|  92 |  82   4% |  81   5% |  80   5% |  81   4% |  81   5% |  81   5% |  80   4% |  81   4% |  81   5% |  81   4% |  81   4% |  81   5% |  81   5% |  80   4% |  80   5% |  80   4% |  81   4% |  81   4% |  80   5% |  81   5% |
|  94 |  84   4% |  83   5% |  82   4% |  83   4% |  83   5% |  83   5% |  82   5% |  83   4% |  83   5% |  83   4% |  83   5% |  83   5% |  83   5% |  82   4% |  82   5% |  82   4% |  83   4% |  83   4% |  82   5% |  83   5% |
|  96 |  86   4% |  85   5% |  84   4% |  85   4% |  85   5% |  85   5% |  84   5% |  85   4% |  85   5% |  85   4% |  85   4% |  85   5% |  85   5% |  84   4% |  84   5% |  84   4% |  85   4% |  85   4% |  84   5% |  85   5% |
|  98 |  88   4% |  87   5% |  86   4% |  87   4% |  87   5% |  87   5% |  86   5% |  87   4% |  87   5% |  87   4% |  87   4% |  87   5% |  87   5% |  86   4% |  86   5% |  86   4% |  87   4% |  87   4% |  86   5% |  87   5% |
+-----+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
Total bytes=913070194, ranges=1755

Package Files

addressing.go allocator.go allocator_scorer.go api.pb.go cclglue.go consistency_queue.go debug_print.go doc.go gc_queue.go lease_history.go log.go merge_queue.go metrics.go node_liveness.go queue.go queue_helpers_testutil.go raft.go raft.pb.go raft_log_queue.go raft_snapshot_queue.go raft_transport.go refreshraftreason_string.go replica.go replica_application_cmd.go replica_application_cmd_buf.go replica_application_decoder.go replica_application_result.go replica_application_state_machine.go replica_backpressure.go replica_closedts.go replica_command.go replica_consistency.go replica_consistency_diff.go replica_corruption.go replica_destroy.go replica_eval_context.go replica_eval_context_span.go replica_evaluate.go replica_follower_read.go replica_gc_queue.go replica_gossip.go replica_init.go replica_metrics.go replica_placeholder.go replica_proposal.go replica_proposal_buf.go replica_proposal_quota.go replica_raft.go replica_raft_quiesce.go replica_raftstorage.go replica_range_lease.go replica_rangefeed.go replica_rankings.go replica_read.go replica_sideload.go replica_sideload_disk.go replica_sideload_inmem.go replica_split_load.go replica_sst_snapshot_storage.go replica_stats.go replica_tscache.go replica_write.go replicate_queue.go scanner.go scheduler.go split_delay_helper.go split_queue.go split_trigger_helper.go store.go store_bootstrap.go store_create_replica.go store_merge.go store_pool.go store_raft.go store_rebalancer.go store_remove_replica.go store_send.go store_snapshot.go store_snapshot_preemptive.go store_split.go stores.go stores_server.go syncing_write.go testing_knobs.go track_raft_protos.go ts_maintenance_queue.go

Constants

const (

    // RaftLogQueueStaleThreshold is the minimum threshold for stale raft log
    // entries. A stale entry is one which all replicas of the range have
    // progressed past and thus is no longer needed and can be truncated.
    RaftLogQueueStaleThreshold = 100
    // RaftLogQueueStaleSize is the minimum size of the Raft log that we'll
    // truncate even if there are fewer than RaftLogQueueStaleThreshold entries
    // to truncate. The value of 64 KB was chosen experimentally by looking at
    // when Raft log truncation usually occurs when using the number of entries
    // as the sole criteria.
    RaftLogQueueStaleSize = 64 << 10
)
const (

    // ReplicaGCQueueInactivityThreshold is the inactivity duration after which
    // a range will be considered for garbage collection. Exported for testing.
    ReplicaGCQueueInactivityThreshold = 10 * 24 * time.Hour // 10 days
    // ReplicaGCQueueSuspectTimeout is the duration after which a Replica which
    // is suspected to be removed should be processed by the queue.
    // A Replica is suspected to have been removed if either it is in the
    // candidate Raft state (which is a typical sign of having been removed
    // from the group) or it is not in the VOTER_FULL state. Replicas which are
    // in the LEARNER state will never become candidates. It seems possible that
    // a range will quiesce and never tell a VOTER_OUTGOING that it was removed.
    // Cases where a replica gets stuck in VOTER_INCOMING seem farfetched and
    // would require the replica to be removed from the range before it ever
    // learned about its promotion but that state shouldn't last long so we
    // also treat idle replicas in that state as suspect.
    ReplicaGCQueueSuspectTimeout = 1 * time.Second
)
const (
    // TestTimeUntilStoreDead is the test value for TimeUntilStoreDead to
    // quickly mark stores as dead.
    TestTimeUntilStoreDead = 5 * time.Millisecond

    // TestTimeUntilStoreDeadOff is the test value for TimeUntilStoreDead that
    // prevents the store pool from marking stores as dead.
    TestTimeUntilStoreDeadOff = 24 * time.Hour
)
const (
    // TimeSeriesMaintenanceInterval is the minimum interval between two
    // time series maintenance runs on a replica.
    TimeSeriesMaintenanceInterval = 24 * time.Hour // daily

    // TimeSeriesMaintenanceMemoryBudget is the maximum amount of memory that
    // should be consumed by time series maintenance operations at any one time.
    TimeSeriesMaintenanceMemoryBudget = int64(8 * 1024 * 1024) // 8MB
)
const (

    // IntersectingSnapshotMsg is part of the error message returned from
    // canApplySnapshotLocked and is exposed here so testing can rely on it.
    IntersectingSnapshotMsg = "snapshot intersects existing range"
)
const MaxCommandSizeFloor = 4 << 20 // 4MB

MaxCommandSizeFloor is the minimum allowed value for the MaxCommandSize cluster setting.

const MaxQuotaReplicaLivenessDuration = 10 * time.Second

MaxQuotaReplicaLivenessDuration is the maximum duration that a replica can remain inactive while still being counted against the range's available proposal quota.

const (

    // MinStatsDuration defines a lower bound on how long users of replica stats
    // should wait before using those stats for anything. If the duration of a
    // measurement has been less than MinStatsDuration, these methods could easily
    // return outlier/anomalous data.
    MinStatsDuration = 5 * time.Second
)

Variables

var (
    ErrInvalidLengthApi = fmt.Errorf("proto: negative length found during unmarshaling")
    ErrIntOverflowApi   = fmt.Errorf("proto: integer overflow")
)
var (
    // ErrNoLivenessRecord is returned when asking for liveness information
    // about a node for which nothing is known.
    ErrNoLivenessRecord = errors.New("node not in the liveness table")

    // ErrEpochIncremented is returned when a heartbeat request fails because
    // the underlying liveness record has had its epoch incremented.
    ErrEpochIncremented = errors.New("heartbeat failed on epoch increment")

    // ErrEpochAlreadyIncremented is returned by IncrementEpoch when
    // someone else has already incremented the epoch to the desired
    // value.
    ErrEpochAlreadyIncremented = errors.New("epoch already incremented")
)
var (
    ErrInvalidLengthRaft = fmt.Errorf("proto: negative length found during unmarshaling")
    ErrIntOverflowRaft   = fmt.Errorf("proto: integer overflow")
)
var DeclinedReservationsTimeout = settings.RegisterNonNegativeDurationSetting(
    "server.declined_reservation_timeout",
    "the amount of time to consider the store throttled for up-replication after a reservation was declined",
    1*time.Second,
)

DeclinedReservationsTimeout specifies a duration during which the local replicate queue will not consider stores which have rejected a reservation a viable target.

var ExportRequestsLimit = settings.RegisterPositiveIntSetting(
    "kv.bulk_io_write.concurrent_export_requests",
    "number of export requests a store will handle concurrently before queuing",
    3,
)

ExportRequestsLimit is the number of Export requests that can run at once. Each extracts data from RocksDB to a temp file and then uploads it to cloud storage. In order to not exhaust the disk or memory, or saturate the network, limit the number of these that can be run in parallel. This number was chosen by guessing; it could be improved by more measured heuristics. Exported here since we check it in the caller to limit generated requests as well, to prevent excessive queuing.
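
As a hedged sketch (not part of the package API surface), such settings can be read and overridden through a cluster.Settings instance, in the same way the rebalancing example above overrides TimeUntilStoreDead; the override value here is illustrative only:

st := cluster.MakeTestingClusterSettings()

// Read the current limit (the registered default is 3).
limit := ExportRequestsLimit.Get(&st.SV)
fmt.Println(limit)

// Raise the limit for a test.
ExportRequestsLimit.Override(&st.SV, 10)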

var FailedReservationsTimeout = settings.RegisterNonNegativeDurationSetting(
    "server.failed_reservation_timeout",
    "the amount of time to consider the store throttled for up-replication after a failed reservation call",
    5*time.Second,
)

FailedReservationsTimeout specifies a duration during which the local replicate queue will not consider stores which have failed a reservation a viable target.

var FollowerReadsEnabled = settings.RegisterBoolSetting(
    "kv.closed_timestamp.follower_reads_enabled",
    "allow (all) replicas to serve consistent historical reads based on closed timestamp information",
    true,
)

FollowerReadsEnabled controls whether replicas attempt to serve follower reads. The closed timestamp machinery is unaffected by this, i.e. the same information is collected and passed around, regardless of the value of this setting.

var LoadBasedRebalancingMode = settings.RegisterEnumSetting(
    "kv.allocator.load_based_rebalancing",
    "whether to rebalance based on the distribution of QPS across stores",
    "leases and replicas",
    map[int64]string{
        int64(LBRebalancingOff):               "off",
        int64(LBRebalancingLeasesOnly):        "leases",
        int64(LBRebalancingLeasesAndReplicas): "leases and replicas",
    },
)

LoadBasedRebalancingMode controls whether range rebalancing takes additional variables such as write load and disk usage into account. If disabled, rebalancing is done purely based on replica count.
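
A minimal sketch of how a caller might consume this enum setting, assuming a cluster.Settings instance st as in the example above:

switch LBRebalancingMode(LoadBasedRebalancingMode.Get(&st.SV)) {
case LBRebalancingOff:
    // No store-level load-based rebalancing.
case LBRebalancingLeasesOnly:
    // Rebalance leases based on QPS imbalances.
case LBRebalancingLeasesAndReplicas:
    // Rebalance both leases and replicas based on QPS imbalances.
}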

var MaxCommandSize = settings.RegisterValidatedByteSizeSetting(
    "kv.raft.command.max_size",
    "maximum size of a raft command",
    64<<20,
    func(size int64) error {
        if size < MaxCommandSizeFloor {
            return fmt.Errorf("max_size must be greater than %s", humanizeutil.IBytes(MaxCommandSizeFloor))
        }
        return nil
    },
)

MaxCommandSize wraps "kv.raft.command.max_size".

var MergeQueueInterval = func() *settings.DurationSetting {
    s := settings.RegisterNonNegativeDurationSetting(
        "kv.range_merge.queue_interval",
        "how long the merge queue waits between processing replicas",
        time.Second,
    )
    s.SetSensitive()
    return s
}()

MergeQueueInterval is a setting that controls how long the merge queue waits between processing replicas.

var MinLeaseTransferStatsDuration = 30 * time.Second

MinLeaseTransferStatsDuration configures the minimum amount of time a replica must wait for stats about request counts to accumulate before making decisions based on them. The higher this is, the less likely thrashing is (up to a point). Made configurable for the sake of testing.

var RangefeedEnabled = settings.RegisterBoolSetting(
    "kv.rangefeed.enabled",
    "if set, rangefeed registration is enabled",
    false,
)

RangefeedEnabled is a cluster setting that enables rangefeed requests.

var SnapshotRequest_Priority_name = map[int32]string{
    0:  "UNKNOWN",
    1:  "RECOVERY",
    2:  "REBALANCE",
}
var SnapshotRequest_Priority_value = map[string]int32{
    "UNKNOWN":   0,
    "RECOVERY":  1,
    "REBALANCE": 2,
}
var SnapshotRequest_Strategy_name = map[int32]string{
    0: "KV_BATCH",
}
var SnapshotRequest_Strategy_value = map[string]int32{
    "KV_BATCH": 0,
}
var SnapshotRequest_Type_name = map[int32]string{
    0:  "RAFT",
    1:  "LEARNER",
    2:  "PREEMPTIVE",
}
var SnapshotRequest_Type_value = map[string]int32{
    "RAFT":       0,
    "LEARNER":    1,
    "PREEMPTIVE": 2,
}
var SnapshotResponse_Status_name = map[int32]string{
    0:  "UNKNOWN",
    1:  "ACCEPTED",
    2:  "APPLIED",
    3:  "ERROR",
    4:  "DECLINED",
}
var SnapshotResponse_Status_value = map[string]int32{
    "UNKNOWN":  0,
    "ACCEPTED": 1,
    "APPLIED":  2,
    "ERROR":    3,
    "DECLINED": 4,
}
var SplitByLoadEnabled = settings.RegisterBoolSetting(
    "kv.range_split.by_load_enabled",
    "allow automatic splits of ranges based on where load is concentrated",
    true,
)

SplitByLoadEnabled wraps "kv.range_split.by_load_enabled".

var SplitByLoadQPSThreshold = settings.RegisterIntSetting(
    "kv.range_split.load_qps_threshold",
    "the QPS over which, the range becomes a candidate for load based splitting",
    2500,
)

SplitByLoadQPSThreshold wraps "kv.range_split.load_qps_threshold".

var TimeUntilStoreDead = settings.RegisterValidatedDurationSetting(
    timeUntilStoreDeadSettingName,
    "the time after which if there is no new gossiped information about a store, it is considered dead",
    5*time.Minute,
    func(v time.Duration) error {
        // Setting this to less than the interval for gossiping stores is a big
        // no-no, since this value is compared to the age of the most recent gossip
        // from each store to determine whether that store is live. Put a buffer of
        // 15 seconds on top to allow time for gossip to propagate.
        const minTimeUntilStoreDead = gossip.StoresInterval + 15*time.Second
        if v < minTimeUntilStoreDead {
            return errors.Errorf("cannot set %s to less than %v: %v",
                timeUntilStoreDeadSettingName, minTimeUntilStoreDead, v)
        }
        return nil
    },
)

TimeUntilStoreDead wraps "server.time_until_store_dead".

var UseAtomicReplicationChanges = settings.RegisterBoolSetting(
    "kv.atomic_replication_changes.enabled",
    "use atomic replication changes",
    true,
)

UseAtomicReplicationChanges determines whether to issue atomic replication changes. This has no effect until the cluster version is 19.2 or higher.

func ComputeRaftLogSize Uses

func ComputeRaftLogSize(
    ctx context.Context, rangeID roachpb.RangeID, reader engine.Reader, sideloaded SideloadStorage,
) (int64, error)

ComputeRaftLogSize computes the size (in bytes) of the Raft log from the storage engine. This will iterate over the Raft log and sideloaded files, so depending on the size of these it can be mildly to extremely expensive and thus should not be called frequently.

The sideloaded storage may be nil, in which case it is treated as empty.
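
A minimal usage sketch, assuming ctx, an open engine.Engine eng, and a rangeID for a replica on the local store; passing nil sideloaded storage is allowed per the note above:

size, err := ComputeRaftLogSize(ctx, rangeID, eng, nil /* sideloaded */)
if err != nil {
    log.Fatalf(ctx, "computing raft log size: %v", err)
}
log.Infof(ctx, "raft log for r%d is %d bytes", rangeID, size)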

func DecodeRaftCommand Uses

func DecodeRaftCommand(data []byte) (storagebase.CmdIDKey, []byte)

DecodeRaftCommand splits a raftpb.Entry.Data into its commandID and command portions. The caller is responsible for checking that the data is not empty (which indicates a dummy entry generated by raft rather than a real command). Usage is mostly internal to the storage package but is exported for use by debugging tools.
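
A sketch of the intended calling pattern, assuming ent is a raftpb.Entry read from the Raft log; the empty-data check is the caller's responsibility, as noted above:

if len(ent.Data) > 0 {
    cmdID, encodedCmd := DecodeRaftCommand(ent.Data)
    _ = cmdID      // storagebase.CmdIDKey identifying the proposal
    _ = encodedCmd // marshaled storagepb.RaftCommand
} // empty data indicates a dummy entry generated by raft, not a real command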

func EnableLeaseHistory Uses

func EnableLeaseHistory(maxEntries int) func()

EnableLeaseHistory turns on the lease history for testing purposes. Returns a function to return it to its original state that can be deferred.
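
A typical test usage, relying on the returned restore function described above (the capacity of 100 entries is illustrative):

// Record up to 100 lease changes per replica; restore the default when done.
defer EnableLeaseHistory(100)()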

func GetNeededReplicas Uses

func GetNeededReplicas(zoneConfigReplicaCount int32, clusterNodes int) int

GetNeededReplicas calculates the number of replicas a range should have given its zone config and the number of nodes available for up-replication (i.e. not dead and not decommissioning).
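
A small sketch, assuming zone is a *config.ZoneConfig with NumReplicas set and clusterNodes is the count of live, non-decommissioning nodes:

needed := GetNeededReplicas(*zone.NumReplicas, clusterNodes)
log.Infof(ctx, "zone asks for %d replicas; with %d usable nodes the range needs %d",
    *zone.NumReplicas, clusterNodes, needed)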

func HasRaftLeader Uses

func HasRaftLeader(raftStatus *raft.Status) bool

HasRaftLeader returns true if the raft group has a raft leader currently.

func InitEngine Uses

func InitEngine(
    ctx context.Context, eng engine.Engine, ident roachpb.StoreIdent, cv cluster.ClusterVersion,
) error

InitEngine writes a new store ident to the underlying engine. To ensure that no crufty data already exists in the engine, it scans the engine contents before writing the new store ident. The engine should be completely empty. It returns an error if called on a non-empty engine.

func IsRangeDescriptorKey Uses

func IsRangeDescriptorKey(key engine.MVCCKey) error

IsRangeDescriptorKey returns nil if the key decodes as a RangeDescriptor.

func IsSnapshotError Uses

func IsSnapshotError(err error) bool

IsSnapshotError returns true iff the error indicates a preemptive snapshot failed.

func IterateIDPrefixKeys Uses

func IterateIDPrefixKeys(
    ctx context.Context,
    eng engine.Reader,
    keyFn func(roachpb.RangeID) roachpb.Key,
    msg protoutil.Message,
    f func(_ roachpb.RangeID) (more bool, _ error),
) error

IterateIDPrefixKeys helps visit system keys that use RangeID prefixing (such as RaftHardStateKey, RaftTombstoneKey, and many others). Such keys could in principle exist at any RangeID, and this helper efficiently discovers all the keys of the desired type (as specified by the supplied `keyFn`) and, for each key-value pair discovered, unmarshals it into `msg` and then invokes `f`.

Iteration stops on the first error (and will pass through that error).
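
A hedged sketch that visits the Raft HardState of every range present in an engine, assuming the keys and raftpb packages used elsewhere in CockroachDB; returning true from the callback continues iteration:

var hs raftpb.HardState
var n int
if err := IterateIDPrefixKeys(ctx, eng, keys.RaftHardStateKey, &hs,
    func(rangeID roachpb.RangeID) (bool, error) {
        log.Infof(ctx, "r%d: term=%d commit=%d", rangeID, hs.Term, hs.Commit)
        n++
        return true, nil // more: keep iterating
    },
); err != nil {
    log.Fatalf(ctx, "%v", err)
}
log.Infof(ctx, "visited %d ranges", n)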

func IterateRangeDescriptors Uses

func IterateRangeDescriptors(
    ctx context.Context, eng engine.Reader, fn func(desc roachpb.RangeDescriptor) (bool, error),
) error

IterateRangeDescriptors calls the provided function with each descriptor from the provided Engine. The return values of this method and fn have semantics similar to engine.MVCCIterate.
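
A sketch that collects every descriptor found in a bootstrapped engine; per the MVCCIterate-style semantics noted above, returning false from fn continues iteration:

var descs []roachpb.RangeDescriptor
if err := IterateRangeDescriptors(ctx, eng,
    func(desc roachpb.RangeDescriptor) (bool, error) {
        descs = append(descs, desc)
        return false, nil // not done: keep iterating
    },
); err != nil {
    log.Fatalf(ctx, "%v", err)
}
log.Infof(ctx, "found %d range descriptors", len(descs))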

func NewReplicaEvalContext Uses

func NewReplicaEvalContext(r *Replica, ss *spanset.SpanSet) batcheval.EvalContext

NewReplicaEvalContext returns a batcheval.EvalContext to use for command evaluation. The supplied SpanSet will be ignored except for race builds, in which case state access is asserted against it. A SpanSet must always be passed.

func PrintKeyValue Uses

func PrintKeyValue(kv engine.MVCCKeyValue)

PrintKeyValue attempts to pretty-print the specified MVCCKeyValue to os.Stdout, falling back to '%q' formatting.

func ReadClusterVersion Uses

func ReadClusterVersion(ctx context.Context, reader engine.Reader) (cluster.ClusterVersion, error)

ReadClusterVersion reads the cluster version from the store-local version key.

func ReadHLCUpperBound Uses

func ReadHLCUpperBound(ctx context.Context, e engine.Engine) (int64, error)

ReadHLCUpperBound returns the upper bound on the wall time of the HLC. If this value does not exist, 0 is returned.

func ReadMaxHLCUpperBound Uses

func ReadMaxHLCUpperBound(ctx context.Context, engines []engine.Engine) (int64, error)

ReadMaxHLCUpperBound returns the maximum of the stored HLC upper bounds among all the engines. This value is optionally persisted by the server and is guaranteed to be higher than any wall time used by the HLC. If this value is persisted, HLC wall clock monotonicity is guaranteed across server restarts.

func ReadStoreIdent Uses

func ReadStoreIdent(ctx context.Context, eng engine.Engine) (roachpb.StoreIdent, error)

ReadStoreIdent reads the StoreIdent from the store. It returns *NotBootstrappedError if the ident is missing (meaning that the store needs to be bootstrapped).
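
A sketch of the usual bootstrap check, assuming NotBootstrappedError is the error type referred to above:

ident, err := ReadStoreIdent(ctx, eng)
if _, notBootstrapped := err.(*NotBootstrappedError); notBootstrapped {
    // The engine has no store ident yet; bootstrap it (e.g. via InitEngine)
    // before handing it to a Store.
} else if err != nil {
    log.Fatalf(ctx, "%v", err)
} else {
    log.Infof(ctx, "engine belongs to store s%d on node n%d", ident.StoreID, ident.NodeID)
}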

func ReadVersionFromEngineOrZero Uses

func ReadVersionFromEngineOrZero(
    ctx context.Context, e engine.Engine,
) (cluster.ClusterVersion, error)

ReadVersionFromEngineOrZero reads the persisted cluster version from the engine, falling back to the zero value.

func RegisterMultiRaftServer Uses

func RegisterMultiRaftServer(s *grpc.Server, srv MultiRaftServer)

func RegisterPerReplicaServer Uses

func RegisterPerReplicaServer(s *grpc.Server, srv PerReplicaServer)

func SetImportCmd Uses

func SetImportCmd(fn ImportCmdFunc)

SetImportCmd allows setting the function that will be called as the implementation of the Import command. Only allowed to be called by Init.

func SprintKey Uses

func SprintKey(key engine.MVCCKey) string

SprintKey pretty-prints the specified MVCCKey.

func SprintKeyValue Uses

func SprintKeyValue(kv engine.MVCCKeyValue, printKey bool) string

SprintKeyValue is like PrintKeyValue, but returns a string. If printKey is true, prints the key and the value together; otherwise, prints just the value.

func SynthesizeClusterVersionFromEngines Uses

func SynthesizeClusterVersionFromEngines(
    ctx context.Context, engines []engine.Engine, minSupportedVersion, serverVersion roachpb.Version,
) (cluster.ClusterVersion, error)

SynthesizeClusterVersionFromEngines implements the core of (*Stores).SynthesizeClusterVersion.

Returns the cluster version that was read from the engines or, if there's no bootstrapped engines, returns minSupportedVersion.

Args:

minSupportedVersion: The minimum version supported by this binary. An error
  is returned if any engine has a version lower than this. This version is
  written to the engines if no store has a version in it.
serverVersion: The maximum version supported by this binary. An error is
  returned if any engine has a higher version.

func TrackRaftProtos Uses

func TrackRaftProtos() func() []reflect.Type

TrackRaftProtos instruments proto marshaling to track protos which are marshaled downstream of raft. It returns a function that removes the instrumentation and returns the list of downstream-of-raft protos.
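
A sketch of the intended pattern: install the instrumentation, exercise the system, then collect the tracked proto types:

collect := TrackRaftProtos()

// ... run a workload that proposes and applies Raft commands ...

for _, typ := range collect() {
    fmt.Println(typ) // each reflect.Type marshaled downstream of Raft
}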

func WriteClusterVersion Uses

func WriteClusterVersion(
    ctx context.Context, writer engine.ReadWriter, cv cluster.ClusterVersion,
) error

WriteClusterVersion writes the given cluster version to the store-local cluster version key.

func WriteClusterVersionToEngines Uses

func WriteClusterVersionToEngines(
    ctx context.Context, engines []engine.Engine, cv cluster.ClusterVersion,
) error

WriteClusterVersionToEngines writes the given version to the given engines, without any sanity checks.

func WriteInitialClusterData Uses

func WriteInitialClusterData(
    ctx context.Context,
    eng engine.Engine,
    initialValues []roachpb.KeyValue,
    bootstrapVersion roachpb.Version,
    numStores int,
    splits []roachpb.RKey,
    nowNanos int64,
) error

WriteInitialClusterData writes bootstrapping data to an engine. It creates system ranges (filling in meta1 and meta2) and the default zone config.

Args:

eng: the engine to which data is to be written.
initialValues: an optional list of k/v to be written as well after each
  value's checksum is initialized.
bootstrapVersion: the version at which the cluster is bootstrapped.
numStores: the number of stores this node will have.
splits: an optional list of split points. Range addressing will be created
  for all the splits. The list needs to be sorted.
nowNanos: the timestamp at which to write the initial engine data.

type Allocator Uses

type Allocator struct {
    // contains filtered or unexported fields
}

Allocator tries to spread replicas as evenly as possible across the stores in the cluster.

func MakeAllocator Uses

func MakeAllocator(
    storePool *StorePool, nodeLatencyFn func(addr string) (time.Duration, bool),
) Allocator

MakeAllocator creates a new allocator using the specified StorePool.

func (*Allocator) AllocateTarget Uses

func (a *Allocator) AllocateTarget(
    ctx context.Context,
    zone *config.ZoneConfig,
    rangeID roachpb.RangeID,
    existingReplicas []roachpb.ReplicaDescriptor,
) (*roachpb.StoreDescriptor, string, error)

AllocateTarget returns a suitable store for a new allocation with the required attributes. Nodes already accommodating existing replicas are ruled out as targets. The range ID of the replica being allocated for is also passed in to ensure that we don't try to replace an existing dead replica on a store.

TODO(tbg): AllocateReplacement?

func (*Allocator) ComputeAction Uses

func (a *Allocator) ComputeAction(
    ctx context.Context, zone *config.ZoneConfig, desc *roachpb.RangeDescriptor,
) (AllocatorAction, float64)

ComputeAction determines the exact operation needed to repair the supplied range, as governed by the supplied zone configuration. It returns the required action that should be taken and a priority.

func (Allocator) RebalanceTarget Uses

func (a Allocator) RebalanceTarget(
    ctx context.Context,
    zone *config.ZoneConfig,
    raftStatus *raft.Status,
    rangeID roachpb.RangeID,
    existingReplicas []roachpb.ReplicaDescriptor,
    rangeUsageInfo RangeUsageInfo,
    filter storeFilter,
) (add roachpb.ReplicationTarget, remove roachpb.ReplicationTarget, details string, ok bool)

RebalanceTarget returns a suitable store for a rebalance target with required attributes. Rebalance targets are selected via the same mechanism as AllocateTarget(), except the chosen target must follow some additional criteria. Namely, if chosen, it must further the goal of balancing the cluster.

The supplied parameters are the required attributes for the range and information about the range being considered for rebalancing.

The existing replicas modulo any store with dead replicas are candidates for rebalancing. Note that rebalancing is accomplished by first adding a new replica to the range, then removing the most undesirable replica.

Simply ignoring a rebalance opportunity in the event that the target chosen by AllocateTarget() doesn't fit balancing criteria is perfectly fine, as other stores in the cluster will also be doing their probabilistic best to rebalance. This helps prevent a stampeding herd targeting an abnormally under-utilized store.

The return values are, in order:

1. The target on which to add a new replica,
2. An existing replica to remove,
3. A JSON string for use in the range log, and
4. A boolean indicating whether 1-3 were populated (i.e. whether a rebalance
   opportunity was found).

func (Allocator) RemoveTarget Uses

func (a Allocator) RemoveTarget(
    ctx context.Context,
    zone *config.ZoneConfig,
    candidates []roachpb.ReplicaDescriptor,
    existingReplicas []roachpb.ReplicaDescriptor,
) (roachpb.ReplicaDescriptor, string, error)

RemoveTarget returns a suitable replica to remove from the provided replica set. It first attempts to randomly select a target from the set of stores that have greater than the average number of replicas. Failing that, it falls back to selecting a random target from any of the existing replicas.

func (*Allocator) ShouldTransferLease Uses

func (a *Allocator) ShouldTransferLease(
    ctx context.Context,
    zone *config.ZoneConfig,
    existing []roachpb.ReplicaDescriptor,
    leaseStoreID roachpb.StoreID,
    rangeID roachpb.RangeID,
    stats *replicaStats,
) bool

ShouldTransferLease returns true if the specified store is overfull in terms of leases with respect to the other stores matching the specified attributes.

func (*Allocator) TransferLeaseTarget Uses

func (a *Allocator) TransferLeaseTarget(
    ctx context.Context,
    zone *config.ZoneConfig,
    existing []roachpb.ReplicaDescriptor,
    leaseStoreID roachpb.StoreID,
    rangeID roachpb.RangeID,
    stats *replicaStats,
    checkTransferLeaseSource bool,
    checkCandidateFullness bool,
    alwaysAllowDecisionWithoutStats bool,
) roachpb.ReplicaDescriptor

TransferLeaseTarget returns a suitable replica to transfer the range lease to from the provided list. It excludes the current lease holder replica unless asked to do otherwise by the checkTransferLeaseSource parameter.

type AllocatorAction Uses

type AllocatorAction int

AllocatorAction enumerates the various replication adjustments that may be recommended by the allocator.

const (
    AllocatorNoop AllocatorAction
    AllocatorRemove
    AllocatorAdd
    AllocatorReplaceDead
    AllocatorRemoveDead
    AllocatorReplaceDecommissioning
    AllocatorRemoveDecommissioning
    AllocatorRemoveLearner
    AllocatorConsiderRebalance
    AllocatorRangeUnavailable
    AllocatorFinalizeAtomicReplicationChange
)

These are the possible allocator actions.

func (AllocatorAction) String Uses

func (a AllocatorAction) String() string

type CollectChecksumRequest Uses

type CollectChecksumRequest struct {
    StoreRequestHeader `protobuf:"bytes,1,opt,name=header,proto3,embedded=header" json:"header"`
    RangeID            github_com_cockroachdb_cockroach_pkg_roachpb.RangeID `protobuf:"varint,2,opt,name=range_id,json=rangeId,proto3,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.RangeID" json:"range_id,omitempty"`
    // checksum_id identifies the corresponding roachpb.ComputeChecksumRequest.
    ChecksumID github_com_cockroachdb_cockroach_pkg_util_uuid.UUID `protobuf:"bytes,3,opt,name=checksum_id,json=checksumId,proto3,customtype=github.com/cockroachdb/cockroach/pkg/util/uuid.UUID" json:"checksum_id"`
    Checksum   []byte                                              `protobuf:"bytes,4,opt,name=checksum,proto3" json:"checksum,omitempty"`
}

A CollectChecksumRequest asks the addressed replica for the result of a roachpb.ComputeChecksumRequest.

func (*CollectChecksumRequest) Descriptor Uses

func (*CollectChecksumRequest) Descriptor() ([]byte, []int)

func (*CollectChecksumRequest) Marshal Uses

func (m *CollectChecksumRequest) Marshal() (dAtA []byte, err error)

func (*CollectChecksumRequest) MarshalTo Uses

func (m *CollectChecksumRequest) MarshalTo(dAtA []byte) (int, error)

func (*CollectChecksumRequest) ProtoMessage Uses

func (*CollectChecksumRequest) ProtoMessage()

func (*CollectChecksumRequest) Reset Uses

func (m *CollectChecksumRequest) Reset()

func (*CollectChecksumRequest) Size Uses

func (m *CollectChecksumRequest) Size() (n int)

func (*CollectChecksumRequest) String Uses

func (m *CollectChecksumRequest) String() string

func (*CollectChecksumRequest) Unmarshal Uses

func (m *CollectChecksumRequest) Unmarshal(dAtA []byte) error

func (*CollectChecksumRequest) XXX_DiscardUnknown Uses

func (m *CollectChecksumRequest) XXX_DiscardUnknown()

func (*CollectChecksumRequest) XXX_Marshal Uses

func (m *CollectChecksumRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*CollectChecksumRequest) XXX_Merge Uses

func (dst *CollectChecksumRequest) XXX_Merge(src proto.Message)

func (*CollectChecksumRequest) XXX_Size Uses

func (m *CollectChecksumRequest) XXX_Size() int

func (*CollectChecksumRequest) XXX_Unmarshal Uses

func (m *CollectChecksumRequest) XXX_Unmarshal(b []byte) error

type CollectChecksumResponse Uses

type CollectChecksumResponse struct {
    // The checksum is the sha512 hash of the requested computation. It is empty
    // if the computation failed.
    Checksum []byte `protobuf:"bytes,1,opt,name=checksum,proto3" json:"checksum,omitempty"`
    // snapshot is set if the roachpb.ComputeChecksumRequest had snapshot = true
    // and the response checksum is different from the request checksum.
    //
    // TODO(tschottdorf): with larger ranges, this is no longer tenable.
    // See https://github.com/cockroachdb/cockroach/issues/21128.
    Snapshot *roachpb.RaftSnapshotData `protobuf:"bytes,2,opt,name=snapshot,proto3" json:"snapshot,omitempty"`
    // delta carries the stats of the range minus the recomputed stats.
    Delta enginepb.MVCCStatsDelta `protobuf:"bytes,3,opt,name=delta,proto3" json:"delta"`
    // persisted carries the persisted stats of the replica.
    Persisted enginepb.MVCCStats `protobuf:"bytes,4,opt,name=persisted,proto3" json:"persisted"`
}

func (*CollectChecksumResponse) Descriptor Uses

func (*CollectChecksumResponse) Descriptor() ([]byte, []int)

func (*CollectChecksumResponse) Marshal Uses

func (m *CollectChecksumResponse) Marshal() (dAtA []byte, err error)

func (*CollectChecksumResponse) MarshalTo Uses

func (m *CollectChecksumResponse) MarshalTo(dAtA []byte) (int, error)

func (*CollectChecksumResponse) ProtoMessage Uses

func (*CollectChecksumResponse) ProtoMessage()

func (*CollectChecksumResponse) Reset Uses

func (m *CollectChecksumResponse) Reset()

func (*CollectChecksumResponse) Size Uses

func (m *CollectChecksumResponse) Size() (n int)

func (*CollectChecksumResponse) String Uses

func (m *CollectChecksumResponse) String() string

func (*CollectChecksumResponse) Unmarshal Uses

func (m *CollectChecksumResponse) Unmarshal(dAtA []byte) error

func (*CollectChecksumResponse) XXX_DiscardUnknown Uses

func (m *CollectChecksumResponse) XXX_DiscardUnknown()

func (*CollectChecksumResponse) XXX_Marshal Uses

func (m *CollectChecksumResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*CollectChecksumResponse) XXX_Merge Uses

func (dst *CollectChecksumResponse) XXX_Merge(src proto.Message)

func (*CollectChecksumResponse) XXX_Size Uses

func (m *CollectChecksumResponse) XXX_Size() int

func (*CollectChecksumResponse) XXX_Unmarshal Uses

func (m *CollectChecksumResponse) XXX_Unmarshal(b []byte) error

type ConfChangeContext Uses

type ConfChangeContext struct {
    CommandID string `protobuf:"bytes,1,opt,name=command_id,json=commandId" json:"command_id"`
    // Payload is the application-level command (i.e. an encoded
    // storagepb.RaftCommand).
    Payload []byte `protobuf:"bytes,2,opt,name=payload" json:"payload,omitempty"`
}

ConfChangeContext is encoded in the raftpb.ConfChange.Context field.

func (*ConfChangeContext) Descriptor Uses

func (*ConfChangeContext) Descriptor() ([]byte, []int)

func (*ConfChangeContext) Marshal Uses

func (m *ConfChangeContext) Marshal() (dAtA []byte, err error)

func (*ConfChangeContext) MarshalTo Uses

func (m *ConfChangeContext) MarshalTo(dAtA []byte) (int, error)

func (*ConfChangeContext) ProtoMessage Uses

func (*ConfChangeContext) ProtoMessage()

func (*ConfChangeContext) Reset Uses

func (m *ConfChangeContext) Reset()

func (*ConfChangeContext) Size Uses

func (m *ConfChangeContext) Size() (n int)

func (*ConfChangeContext) String Uses

func (m *ConfChangeContext) String() string

func (*ConfChangeContext) Unmarshal Uses

func (m *ConfChangeContext) Unmarshal(dAtA []byte) error

func (*ConfChangeContext) XXX_DiscardUnknown Uses

func (m *ConfChangeContext) XXX_DiscardUnknown()

func (*ConfChangeContext) XXX_Marshal Uses

func (m *ConfChangeContext) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*ConfChangeContext) XXX_Merge Uses

func (dst *ConfChangeContext) XXX_Merge(src proto.Message)

func (*ConfChangeContext) XXX_Size Uses

func (m *ConfChangeContext) XXX_Size() int

func (*ConfChangeContext) XXX_Unmarshal Uses

func (m *ConfChangeContext) XXX_Unmarshal(b []byte) error

type ConsistencyCheckResult Uses

type ConsistencyCheckResult struct {
    Replica  roachpb.ReplicaDescriptor
    Response CollectChecksumResponse
    Err      error
}

A ConsistencyCheckResult contains the outcome of a CollectChecksum call.

type ConsistencyTestingKnobs Uses

type ConsistencyTestingKnobs struct {
    // If non-nil, OnBadChecksumFatal is called by CheckConsistency() (instead of
    // calling log.Fatal) on a checksum mismatch.
    OnBadChecksumFatal func(roachpb.StoreIdent)
    // If non-nil, BadChecksumReportDiff is called by CheckConsistency() on a
    // checksum mismatch to report the diff between snapshots.
    BadChecksumReportDiff      func(roachpb.StoreIdent, ReplicaSnapshotDiffSlice)
    ConsistencyQueueResultHook func(response roachpb.CheckConsistencyResponse)
}

ConsistencyTestingKnobs is a BatchEvalTestingKnobs struct used to control the behavior of the consistency checker for tests.

type DestroyReason Uses

type DestroyReason int

DestroyReason indicates if a replica is alive, destroyed, corrupted or pending destruction.

type GCInfo Uses

type GCInfo struct {
    // Now is the timestamp used for age computations.
    Now hlc.Timestamp
    // Policy is the policy used for this garbage collection cycle.
    Policy config.GCPolicy
    // Stats about the userspace key-values considered, namely the number of
    // keys with GC'able data, the number of "old" intents and the number of
    // associated distinct transactions.
    NumKeysAffected, IntentsConsidered, IntentTxns int
    // TransactionSpanTotal is the total number of entries in the transaction span.
    TransactionSpanTotal int
    // Summary of transactions which were found GCable (assuming that
    // potentially necessary intent resolutions did not fail).
    TransactionSpanGCAborted, TransactionSpanGCCommitted int
    TransactionSpanGCStaging, TransactionSpanGCPending   int
    // AbortSpanTotal is the total number of transactions present in the AbortSpan.
    AbortSpanTotal int
    // AbortSpanConsidered is the number of AbortSpan entries old enough to be
    // considered for removal. An "entry" corresponds to one transaction;
    // more than one key-value pair may be associated with it.
    AbortSpanConsidered int
    // AbortSpanGCNum is the number of AbortSpan entries fit for removal (due
    // to their transactions having terminated).
    AbortSpanGCNum int
    // PushTxn is the total number of pushes attempted in this cycle.
    PushTxn int
    // ResolveTotal is the total number of attempted intent resolutions in
    // this cycle.
    ResolveTotal int
    // Threshold is the computed expiration timestamp. Equal to `Now - Policy`.
    Threshold hlc.Timestamp
    // AffectedVersionsKeyBytes is the number of (fully encoded) bytes deleted from keys in the storage engine.
    // Note that this does not account for compression that the storage engine uses to store data on disk. Real
    // space savings tends to be smaller due to this compression, and space may be released only at a later point
    // in time.
    AffectedVersionsKeyBytes int64
    // AffectedVersionsValBytes is the number of (fully encoded) bytes deleted from values in the storage engine.
    // See AffectedVersionsKeyBytes for caveats.
    AffectedVersionsValBytes int64
}

GCInfo contains statistics and insights from a GC run.

func RunGC Uses

func RunGC(
    ctx context.Context,
    desc *roachpb.RangeDescriptor,
    snap engine.Reader,
    now hlc.Timestamp,
    policy config.GCPolicy,
    gcer GCer,
    cleanupIntentsFn cleanupIntentsFunc,
    cleanupTxnIntentsAsyncFn cleanupTxnIntentsAsyncFunc,
) (GCInfo, error)

RunGC runs garbage collection for the specified descriptor on the provided Engine (which is not mutated). It uses the provided GCer to run garbage collection once on all implicated spans, cleanupIntentsFn to resolve intents synchronously, and cleanupTxnIntentsAsyncFn to asynchronously clean up intents and associated transaction records on success.

type GCThreshold Uses

type GCThreshold struct {
    Key hlc.Timestamp
    Txn hlc.Timestamp
}

GCThreshold holds the key and txn span GC thresholds, respectively.

type GCer Uses

type GCer interface {
    SetGCThreshold(context.Context, GCThreshold) error
    GC(context.Context, []roachpb.GCRequest_GCKey) error
}

A GCer is an abstraction used by the GC queue to carry out chunked deletions.

type HeartbeatCallback Uses

type HeartbeatCallback func(context.Context)

HeartbeatCallback is invoked whenever this node updates its own liveness status, indicating that it is alive.

type HotReplicaInfo Uses

type HotReplicaInfo struct {
    Desc *roachpb.RangeDescriptor
    QPS  float64
}

HotReplicaInfo contains a range descriptor and its QPS.

type ImportCmdFunc Uses

type ImportCmdFunc func(context.Context, batcheval.CommandArgs) (*roachpb.ImportResponse, error)

ImportCmdFunc is the type of the function that will be called as the implementation of the Import command.

type IncomingSnapshot Uses

type IncomingSnapshot struct {
    SnapUUID uuid.UUID
    // The storage interface for the underlying SSTs.
    SSSS *SSTSnapshotStorageScratch
    // The Raft log entries for this snapshot.
    LogEntries [][]byte
    // The replica state at the time the snapshot was generated (never nil).
    State *storagepb.ReplicaState
    //
    // When true, this snapshot contains an unreplicated TruncatedState. When
    // false, the TruncatedState is replicated (see the reference below) and the
    // recipient must avoid also writing the unreplicated TruncatedState. The
    // migration to an unreplicated TruncatedState will be carried out during
    // the next log truncation (assuming cluster version is bumped at that
    // point).
    // See the comment on VersionUnreplicatedRaftTruncatedState for details.
    UsesUnreplicatedTruncatedState bool
    // contains filtered or unexported fields
}

IncomingSnapshot contains the data for an incoming streaming snapshot message.

type IsLiveCallback Uses

type IsLiveCallback func(nodeID roachpb.NodeID)

IsLiveCallback is invoked when a node's IsLive state changes to true. Callbacks can be registered via NodeLiveness.RegisterCallback().

type IsLiveMap Uses

type IsLiveMap map[roachpb.NodeID]IsLiveMapEntry

IsLiveMap is a type alias for a map from NodeID to IsLiveMapEntry.

type IsLiveMapEntry Uses

type IsLiveMapEntry struct {
    IsLive bool
    Epoch  int64
}

IsLiveMapEntry encapsulates data about current liveness for a node.

type KeyRange Uses

type KeyRange interface {
    Desc() *roachpb.RangeDescriptor

    btree.Item
    fmt.Stringer
    // contains filtered or unexported methods
}

KeyRange is an interface type for the replicasByKey BTree, to compare Replica and ReplicaPlaceholder.

type LBRebalancingMode Uses

type LBRebalancingMode int64

LBRebalancingMode controls if and when we do store-level rebalancing based on load.

const (
    // LBRebalancingOff means that we do not do store-level rebalancing
    // based on load statistics.
    LBRebalancingOff LBRebalancingMode = iota
    // LBRebalancingLeasesOnly means that we rebalance leases based on
    // store-level QPS imbalances.
    LBRebalancingLeasesOnly
    // LBRebalancingLeasesAndReplicas means that we rebalance both leases and
    // replicas based on store-level QPS imbalances.
    LBRebalancingLeasesAndReplicas
)

type LivenessMetrics Uses

type LivenessMetrics struct {
    LiveNodes          *metric.Gauge
    HeartbeatSuccesses *metric.Counter
    HeartbeatFailures  *metric.Counter
    EpochIncrements    *metric.Counter
    HeartbeatLatency   *metric.Histogram
}

LivenessMetrics holds metrics for use with node liveness activity.

type MultiRaftClient Uses

type MultiRaftClient interface {
    RaftMessageBatch(ctx context.Context, opts ...grpc.CallOption) (MultiRaft_RaftMessageBatchClient, error)
    RaftSnapshot(ctx context.Context, opts ...grpc.CallOption) (MultiRaft_RaftSnapshotClient, error)
}

MultiRaftClient is the client API for MultiRaft service.

For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream.

func NewMultiRaftClient Uses

func NewMultiRaftClient(cc *grpc.ClientConn) MultiRaftClient

type MultiRaftServer Uses

type MultiRaftServer interface {
    RaftMessageBatch(MultiRaft_RaftMessageBatchServer) error
    RaftSnapshot(MultiRaft_RaftSnapshotServer) error
}

MultiRaftServer is the server API for MultiRaft service.

type MultiRaft_RaftMessageBatchClient Uses

type MultiRaft_RaftMessageBatchClient interface {
    Send(*RaftMessageRequestBatch) error
    Recv() (*RaftMessageResponse, error)
    grpc.ClientStream
}

type MultiRaft_RaftMessageBatchServer Uses

type MultiRaft_RaftMessageBatchServer interface {
    Send(*RaftMessageResponse) error
    Recv() (*RaftMessageRequestBatch, error)
    grpc.ServerStream
}

type MultiRaft_RaftSnapshotClient Uses

type MultiRaft_RaftSnapshotClient interface {
    Send(*SnapshotRequest) error
    Recv() (*SnapshotResponse, error)
    grpc.ClientStream
}

type MultiRaft_RaftSnapshotServer Uses

type MultiRaft_RaftSnapshotServer interface {
    Send(*SnapshotResponse) error
    Recv() (*SnapshotRequest, error)
    grpc.ServerStream
}

type NodeCountFunc Uses

type NodeCountFunc func() int

The NodeCountFunc returns a count of the total number of nodes the user intends there to be in the cluster. The count includes dead nodes, but not decommissioned nodes.

type NodeLiveness Uses

type NodeLiveness struct {
    // contains filtered or unexported fields
}

NodeLiveness is a centralized failure detector that coordinates with the epoch-based range system to provide for leases of indefinite length (replacing frequent per-range lease renewals with heartbeats to the liveness system).

It is also used as a general-purpose failure detector, but it is not ideal for this purpose. It is inefficient due to the use of replicated durable writes, and is not very sensitive (it primarily tests connectivity from the node to the liveness range; a node with a failing disk could still be considered live by this system).

The persistent state of node liveness is stored in the KV layer, near the beginning of the keyspace. These are normal MVCC keys, written by CPut operations in 1PC transactions (the use of transactions and MVCC is regretted because it means that the liveness span depends on MVCC GC and can get overwhelmed if GC is not working. Transactions were used only to piggyback on the transaction commit trigger). The leaseholder of the liveness range gossips its contents whenever they change (only the changed portion); other nodes rarely read from this range directly.

The use of conditional puts is crucial to maintain the guarantees needed by epoch-based leases. Both the Heartbeat and IncrementEpoch on this type require an expected value to be passed in; see comments on those methods for more.

TODO(bdarnell): Also document interaction with draining and decommissioning.

func NewNodeLiveness Uses

func NewNodeLiveness(
    ambient log.AmbientContext,
    clock *hlc.Clock,
    db *client.DB,
    engines []engine.Engine,
    g *gossip.Gossip,
    livenessThreshold time.Duration,
    renewalDuration time.Duration,
    st *cluster.Settings,
    histogramWindow time.Duration,
) *NodeLiveness

NewNodeLiveness returns a new instance of NodeLiveness configured with the specified gossip instance.

func (*NodeLiveness) AsLiveClock Uses

func (nl *NodeLiveness) AsLiveClock() closedts.LiveClockFn

AsLiveClock returns a closedts.LiveClockFn that takes a current timestamp off the clock and returns it only if node liveness indicates that the node is live at that timestamp and the returned epoch.

func (*NodeLiveness) DisableAllHeartbeatsForTest Uses

func (nl *NodeLiveness) DisableAllHeartbeatsForTest() func()

DisableAllHeartbeatsForTest disables all node liveness heartbeats, including those triggered from outside the normal StartHeartbeat loop. Returns a closure to call to re-enable heartbeats. Only safe for use in tests.

func (*NodeLiveness) GetIsLiveMap Uses

func (nl *NodeLiveness) GetIsLiveMap() IsLiveMap

GetIsLiveMap returns a map of nodeID to boolean liveness status of each node. This excludes nodes that were removed completely (dead + decommissioning).
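
For example, a hedged sketch that counts the currently live nodes from the map (nl is an assumed, already-running *NodeLiveness):

    live := 0
    for _, entry := range nl.GetIsLiveMap() {
        // Each IsLiveMapEntry reports the node's liveness and current epoch.
        if entry.IsLive {
            live++
        }
    }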

func (*NodeLiveness) GetLiveness Uses

func (nl *NodeLiveness) GetLiveness(nodeID roachpb.NodeID) (storagepb.Liveness, error)

GetLiveness returns the liveness record for the specified nodeID. ErrNoLivenessRecord is returned in the event that nothing is yet known about nodeID via liveness gossip.

func (*NodeLiveness) GetLivenessStatusMap Uses

func (nl *NodeLiveness) GetLivenessStatusMap() map[roachpb.NodeID]storagepb.NodeLivenessStatus

GetLivenessStatusMap generates a map from NodeID to LivenessStatus. This includes only nodes known to gossip. To include all nodes, callers should consider calling (statusServer).NodesWithLiveness() instead where possible.

GetLivenessStatusMap() includes removed nodes (dead + decommissioned).

func (*NodeLiveness) GetLivenessThreshold Uses

func (nl *NodeLiveness) GetLivenessThreshold() time.Duration

GetLivenessThreshold returns the maximum duration between heartbeats before a node is considered not-live.

func (*NodeLiveness) GetLivenesses Uses

func (nl *NodeLiveness) GetLivenesses() []storagepb.Liveness

GetLivenesses returns a slice containing the liveness status of every node on the cluster known to gossip. Callers should consider calling (statusServer).NodesWithLiveness() instead where possible.

func (*NodeLiveness) GetNodeCount Uses

func (nl *NodeLiveness) GetNodeCount() int

GetNodeCount returns a count of the number of nodes in the cluster, including dead nodes, but excluding decommissioning or decommissioned nodes.

func (*NodeLiveness) Heartbeat Uses

func (nl *NodeLiveness) Heartbeat(ctx context.Context, liveness storagepb.Liveness) error

Heartbeat is called to update a node's expiration timestamp. This method does a conditional put on the node liveness record, and if successful, stores the updated liveness record in the nodes map.

The liveness argument is the expected previous value of this node's liveness.

If this method returns nil, the node's liveness has been extended, relative to the previous value. It may or may not still be alive when this method returns.

On failure, this method returns ErrEpochIncremented, although this may not necessarily mean that the epoch was actually incremented. TODO(bdarnell): Fix error semantics here.

This method is rarely called directly; heartbeats are normally sent by the StartHeartbeat loop. TODO(bdarnell): Should we just remove this synchronous heartbeat completely?
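
A hedged sketch of a one-off manual heartbeat, using the last known record as the expected value (nl and ctx are assumed to exist in the surrounding code):

    liveness, err := nl.Self()
    if err != nil {
        return err // e.g. no liveness record is known yet
    }
    if err := nl.Heartbeat(ctx, liveness); err != nil {
        // Per the semantics above, failure is reported as ErrEpochIncremented
        // even when the epoch was not actually incremented.
        return err
    }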

func (*NodeLiveness) IncrementEpoch Uses

func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness storagepb.Liveness) error

IncrementEpoch is called to attempt to revoke another node's current epoch, causing an expiration of all its leases. This method does a conditional put on the node liveness record, and if successful, stores the updated liveness record in the nodes map. If this method is called on a node ID which is considered live according to the most recent information gathered through gossip, an error is returned.

The liveness argument is used as the expected value on the conditional put. If this method returns nil, there was a match and the epoch has been incremented. This means that the expiration time in the supplied liveness accurately reflects the time at which the epoch ended.

If this method returns ErrEpochAlreadyIncremented, the epoch has already been incremented past the one in the liveness argument, but the conditional put did not find a match. This means that another node performed a successful IncrementEpoch, but we can't tell at what time the epoch actually ended. (Usually when multiple IncrementEpoch calls race, they're using the same expected value. But when there is a severe backlog, it's possible for one increment to get stuck in a queue long enough for the dead node to make another successful heartbeat, and a second increment to come in after that)
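
A hedged sketch of revoking a suspect node's epoch while tolerating the race described above (liveness is the last record observed for that node):

    switch err := nl.IncrementEpoch(ctx, liveness); err {
    case nil:
        // The conditional put matched; the supplied record's expiration is an
        // accurate bound on when the old epoch's leases became invalid.
    case ErrEpochAlreadyIncremented:
        // Someone else already incremented the epoch; the old leases are
        // invalid, but the exact end time of the epoch is unknown.
    default:
        return err
    }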

func (*NodeLiveness) IsHealthy Uses

func (nl *NodeLiveness) IsHealthy(nodeID roachpb.NodeID) (bool, error)

IsHealthy returns whether or not the specified node IsLive and is in a LIVE state, i.e. not draining, decommissioning, or otherwise unhealthy.

func (*NodeLiveness) IsLive Uses

func (nl *NodeLiveness) IsLive(nodeID roachpb.NodeID) (bool, error)

IsLive returns whether or not the specified node is considered live based on whether or not its liveness has expired regardless of the liveness status. It is an error if the specified node is not in the local liveness table.

func (*NodeLiveness) Metrics Uses

func (nl *NodeLiveness) Metrics() LivenessMetrics

Metrics returns a struct which contains metrics related to node liveness activity.

func (*NodeLiveness) PauseHeartbeat Uses

func (nl *NodeLiveness) PauseHeartbeat(pause bool)

PauseHeartbeat stops or restarts the periodic heartbeat depending on the pause parameter. When pause is true, it waits until it acquires the heartbeatToken (unless the heartbeat was already paused); this ensures that no heartbeats happen after this is called. This function is only safe for use in tests.

func (*NodeLiveness) RegisterCallback Uses

func (nl *NodeLiveness) RegisterCallback(cb IsLiveCallback)

RegisterCallback registers a callback to be invoked any time a node's IsLive() state changes to true.
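
For instance, a hedged sketch that logs each transition to live (using the util/log package):

    nl.RegisterCallback(func(nodeID roachpb.NodeID) {
        // Called whenever this node observes nodeID's IsLive state flip to true.
        log.Infof(ctx, "n%d became live", nodeID)
    })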

func (*NodeLiveness) Self Uses

func (nl *NodeLiveness) Self() (storagepb.Liveness, error)

Self returns the liveness record for this node. ErrNoLivenessRecord is returned in the event that the node has neither heartbeat its liveness record successfully, nor received a gossip message containing a former liveness update on restart.

func (*NodeLiveness) SetDecommissioning Uses

func (nl *NodeLiveness) SetDecommissioning(
    ctx context.Context, nodeID roachpb.NodeID, decommission bool,
) (changeCommitted bool, err error)

SetDecommissioning runs a best-effort attempt at marking the liveness record as decommissioning. It returns whether the function committed a transaction that updated the liveness record.

func (*NodeLiveness) SetDraining Uses

func (nl *NodeLiveness) SetDraining(ctx context.Context, drain bool)

SetDraining attempts to update this node's liveness record to put itself into the draining state.

func (*NodeLiveness) StartHeartbeat Uses

func (nl *NodeLiveness) StartHeartbeat(
    ctx context.Context, stopper *stop.Stopper, alive HeartbeatCallback,
)

StartHeartbeat starts a periodic heartbeat to refresh this node's last heartbeat in the node liveness table. The optionally provided HeartbeatCallback will be invoked whenever this node updates its own liveness.
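
A hedged sketch of wiring up the loop (stopper is an assumed *stop.Stopper owned by the server):

    nl.StartHeartbeat(ctx, stopper, func(ctx context.Context) {
        // HeartbeatCallback: runs after each successful liveness update, e.g.
        // to record locally that this node was alive at some recent time.
    })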

type NodeLivenessFunc Uses

type NodeLivenessFunc func(roachpb.NodeID, time.Time, time.Duration) storagepb.NodeLivenessStatus

A NodeLivenessFunc accepts a node ID, the current time, and the threshold after which a node is considered dead, and returns the node's liveness status.

func MakeStorePoolNodeLivenessFunc Uses

func MakeStorePoolNodeLivenessFunc(nodeLiveness *NodeLiveness) NodeLivenessFunc

MakeStorePoolNodeLivenessFunc returns a function which determines the status of a node based on information provided by the specified NodeLiveness.
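
For tests, a hedged stand-in that ignores its inputs and always reports LIVE can satisfy the signature:

    // alwaysLive is a hypothetical NodeLivenessFunc for testing; it never
    // consults the clock or the dead-node threshold.
    var alwaysLive NodeLivenessFunc = func(
        _ roachpb.NodeID, _ time.Time, _ time.Duration,
    ) storagepb.NodeLivenessStatus {
        return storagepb.NodeLivenessStatus_LIVE
    }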

type NoopGCer Uses

type NoopGCer struct{}

NoopGCer implements GCer by doing nothing.

func (NoopGCer) GC Uses

func (NoopGCer) GC(context.Context, []roachpb.GCRequest_GCKey) error

GC implements storage.GCer.

func (NoopGCer) SetGCThreshold Uses

func (NoopGCer) SetGCThreshold(context.Context, GCThreshold) error

SetGCThreshold implements storage.GCer.

type NotBootstrappedError Uses

type NotBootstrappedError struct{}

A NotBootstrappedError indicates that an engine has not yet been bootstrapped due to a store identifier not being present.

func (*NotBootstrappedError) Error Uses

func (e *NotBootstrappedError) Error() string

Error formats error.

type OutgoingSnapshot Uses

type OutgoingSnapshot struct {
    SnapUUID uuid.UUID
    // The Raft snapshot message to send. Contains SnapUUID as its data.
    RaftSnap raftpb.Snapshot
    // The RocksDB snapshot that will be streamed from.
    EngineSnap engine.Reader
    // The complete range iterator for the snapshot to stream.
    Iter *rditer.ReplicaDataIterator
    // The replica state within the snapshot.
    State storagepb.ReplicaState
    // Allows access to the original Replica's sideloaded storage. Note that
    // this isn't a snapshot of the sideloaded storage congruent with EngineSnap
    // or RaftSnap -- a log truncation could have removed files from the
    // sideloaded storage in the meantime.
    WithSideloaded func(func(SideloadStorage) error) error
    RaftEntryCache *raftentry.Cache
    // contains filtered or unexported fields
}

OutgoingSnapshot contains the data required to stream a snapshot to a recipient. Once one is created, it needs to be closed via Close() to prevent resource leakage.

func (*OutgoingSnapshot) Close Uses

func (s *OutgoingSnapshot) Close()

Close releases the resources associated with the snapshot.

func (*OutgoingSnapshot) String Uses

func (s *OutgoingSnapshot) String() string

type PerReplicaClient Uses

type PerReplicaClient interface {
    CollectChecksum(ctx context.Context, in *CollectChecksumRequest, opts ...grpc.CallOption) (*CollectChecksumResponse, error)
    WaitForApplication(ctx context.Context, in *WaitForApplicationRequest, opts ...grpc.CallOption) (*WaitForApplicationResponse, error)
    WaitForReplicaInit(ctx context.Context, in *WaitForReplicaInitRequest, opts ...grpc.CallOption) (*WaitForReplicaInitResponse, error)
}

PerReplicaClient is the client API for PerReplica service.

For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream.

func NewPerReplicaClient Uses

func NewPerReplicaClient(cc *grpc.ClientConn) PerReplicaClient

type PerReplicaServer Uses

type PerReplicaServer interface {
    CollectChecksum(context.Context, *CollectChecksumRequest) (*CollectChecksumResponse, error)
    WaitForApplication(context.Context, *WaitForApplicationRequest) (*WaitForApplicationResponse, error)
    WaitForReplicaInit(context.Context, *WaitForReplicaInitRequest) (*WaitForReplicaInitResponse, error)
}

PerReplicaServer is the server API for PerReplica service.

type ProposalData Uses

type ProposalData struct {

    // Local contains the results of evaluating the request tying the upstream
    // evaluation of the request to the downstream application of the command.
    // Nil when the proposal came from another node (i.e. the evaluation wasn't
    // done here).
    Local *result.LocalResult

    // Request is the client's original BatchRequest.
    // TODO(tschottdorf): tests which use TestingCommandFilter use this.
    // Decide how that will work in the future, presumably the
    // CommandFilter would run at proposal time or we allow an opaque
    // struct to be attached to a proposal which is then available as it
    // applies. Other than tests, we only need a few bits of the request
    // here; this could be replaced with isLease and isChangeReplicas
    // booleans.
    Request *roachpb.BatchRequest
    // contains filtered or unexported fields
}

ProposalData is data about a command which allows it to be evaluated, proposed to raft, and for the result of the command to be returned to the caller.

type RaftHeartbeat Uses

type RaftHeartbeat struct {
    RangeID       github_com_cockroachdb_cockroach_pkg_roachpb.RangeID   `protobuf:"varint,1,opt,name=range_id,json=rangeId,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.RangeID" json:"range_id"`
    FromReplicaID github_com_cockroachdb_cockroach_pkg_roachpb.ReplicaID `protobuf:"varint,2,opt,name=from_replica_id,json=fromReplicaId,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.ReplicaID" json:"from_replica_id"`
    ToReplicaID   github_com_cockroachdb_cockroach_pkg_roachpb.ReplicaID `protobuf:"varint,3,opt,name=to_replica_id,json=toReplicaId,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.ReplicaID" json:"to_replica_id"`
    Term          uint64                                                 `protobuf:"varint,4,opt,name=term" json:"term"`
    Commit        uint64                                                 `protobuf:"varint,5,opt,name=commit" json:"commit"`
    Quiesce       bool                                                   `protobuf:"varint,6,opt,name=quiesce" json:"quiesce"`
    // ToIsLearner was added in v19.2 to aid in the transition from preemptive
    // snapshots to learner replicas. If a Replica learns its ID from a message
    // which indicates that it is a learner and it is not currently a part of the
    // range (due to being from a preemptive snapshot) then it must delete all of
    // its data.
    //
    // TODO(ajwerner): remove in 20.2 once we ensure that preemptive snapshots can
    // no longer be present and that we're never talking to a 19.2 node.
    ToIsLearner bool `protobuf:"varint,7,opt,name=to_is_learner,json=toIsLearner" json:"to_is_learner"`
}

RaftHeartbeat is a request that contains the barebones information for a raftpb.MsgHeartbeat raftpb.Message. RaftHeartbeats are coalesced and sent in a RaftMessageRequest, and reconstructed by the receiver into individual raftpb.Message protos.

func (*RaftHeartbeat) Descriptor Uses

func (*RaftHeartbeat) Descriptor() ([]byte, []int)

func (*RaftHeartbeat) Marshal Uses

func (m *RaftHeartbeat) Marshal() (dAtA []byte, err error)

func (*RaftHeartbeat) MarshalTo Uses

func (m *RaftHeartbeat) MarshalTo(dAtA []byte) (int, error)

func (*RaftHeartbeat) ProtoMessage Uses

func (*RaftHeartbeat) ProtoMessage()

func (*RaftHeartbeat) Reset Uses

func (m *RaftHeartbeat) Reset()

func (*RaftHeartbeat) Size Uses

func (m *RaftHeartbeat) Size() (n int)

func (*RaftHeartbeat) String Uses

func (m *RaftHeartbeat) String() string

func (*RaftHeartbeat) Unmarshal Uses

func (m *RaftHeartbeat) Unmarshal(dAtA []byte) error

func (*RaftHeartbeat) XXX_DiscardUnknown Uses

func (m *RaftHeartbeat) XXX_DiscardUnknown()

func (*RaftHeartbeat) XXX_Marshal Uses

func (m *RaftHeartbeat) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*RaftHeartbeat) XXX_Merge Uses

func (dst *RaftHeartbeat) XXX_Merge(src proto.Message)

func (*RaftHeartbeat) XXX_Size Uses

func (m *RaftHeartbeat) XXX_Size() int

func (*RaftHeartbeat) XXX_Unmarshal Uses

func (m *RaftHeartbeat) XXX_Unmarshal(b []byte) error

type RaftMessageHandler Uses

type RaftMessageHandler interface {
    // HandleRaftRequest is called for each incoming Raft message. The request is
    // always processed asynchronously and the response is sent over respStream.
    // If an error is encountered during asynchronous processing, it will be
    // streamed back to the sender of the message as a RaftMessageResponse.
    HandleRaftRequest(ctx context.Context, req *RaftMessageRequest,
        respStream RaftMessageResponseStream) *roachpb.Error

    // HandleRaftResponse is called for each raft response. Note that
    // not all messages receive a response. An error is returned if and only if
    // the underlying Raft connection should be closed.
    HandleRaftResponse(context.Context, *RaftMessageResponse) error

    // HandleSnapshot is called for each new incoming snapshot stream, after
    // parsing the initial SnapshotRequest_Header on the stream.
    HandleSnapshot(header *SnapshotRequest_Header, respStream SnapshotResponseStream) error
}

RaftMessageHandler is the interface that must be implemented by arguments to RaftTransport.Listen.
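
As a hedged illustration of the interface shape (the type below is hypothetical), a handler that accepts and discards all traffic might look like:

    // dropHandler discards every Raft message it receives.
    type dropHandler struct{}

    func (dropHandler) HandleRaftRequest(
        ctx context.Context, req *RaftMessageRequest, respStream RaftMessageResponseStream,
    ) *roachpb.Error {
        return nil // accept and ignore the request
    }

    func (dropHandler) HandleRaftResponse(ctx context.Context, resp *RaftMessageResponse) error {
        return nil // never ask for the underlying connection to be closed
    }

    func (dropHandler) HandleSnapshot(
        header *SnapshotRequest_Header, respStream SnapshotResponseStream,
    ) error {
        return nil // pretend the snapshot stream was consumed
    }

Registering it would then be a matter of calling transport.Listen(storeID, dropHandler{}) on an assumed *RaftTransport.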

type RaftMessageRequest Uses

type RaftMessageRequest struct {
    RangeID github_com_cockroachdb_cockroach_pkg_roachpb.RangeID `protobuf:"varint,1,opt,name=range_id,json=rangeId,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.RangeID" json:"range_id"`
    // Optionally, the start key of the sending replica. This is only populated
    // as a "hint" under certain conditions.
    RangeStartKey github_com_cockroachdb_cockroach_pkg_roachpb.RKey `protobuf:"bytes,8,opt,name=range_start_key,json=rangeStartKey,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.RKey" json:"range_start_key,omitempty"`
    FromReplica   roachpb.ReplicaDescriptor                         `protobuf:"bytes,2,opt,name=from_replica,json=fromReplica" json:"from_replica"`
    ToReplica     roachpb.ReplicaDescriptor                         `protobuf:"bytes,3,opt,name=to_replica,json=toReplica" json:"to_replica"`
    Message       raftpb.Message                                    `protobuf:"bytes,4,opt,name=message" json:"message"`
    // Is this a quiesce request? A quiesce request is a MsgHeartbeat
    // which is requesting the recipient to stop ticking its local
    // replica as long as the current Raft state matches the heartbeat
    // Term/Commit. If the Term/Commit match, the recipient is marked as
    // quiescent. If they don't match, the message is passed along to
    // Raft which will generate a MsgHeartbeatResp that will unquiesce
    // the sender.
    Quiesce bool `protobuf:"varint,5,opt,name=quiesce" json:"quiesce"`
    // A coalesced heartbeat request is any RaftMessageRequest with a nonzero number of
    // heartbeats or heartbeat_resps.
    Heartbeats     []RaftHeartbeat `protobuf:"bytes,6,rep,name=heartbeats" json:"heartbeats"`
    HeartbeatResps []RaftHeartbeat `protobuf:"bytes,7,rep,name=heartbeat_resps,json=heartbeatResps" json:"heartbeat_resps"`
}

RaftMessageRequest is the request used to send raft messages using our protobuf-based RPC codec. If a RaftMessageRequest has a non-empty number of heartbeats or heartbeat_resps, the contents of the message field is treated as a dummy message and discarded. A coalesced heartbeat request's replica descriptor's range ID must be zero.
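
A hedged sketch of constructing such a coalesced heartbeat request (the numeric IDs are illustrative only):

    // Per the rules above: the top-level RangeID is zero and the Message
    // field is left as a dummy; each RaftHeartbeat names its own range.
    req := &RaftMessageRequest{
        RangeID:     0,
        FromReplica: roachpb.ReplicaDescriptor{NodeID: 1, StoreID: 1, ReplicaID: 1},
        ToReplica:   roachpb.ReplicaDescriptor{NodeID: 2, StoreID: 2, ReplicaID: 2},
        Heartbeats: []RaftHeartbeat{{
            RangeID:       7,
            FromReplicaID: 1,
            ToReplicaID:   2,
            Term:          5,
            Commit:        100,
        }},
    }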

func (*RaftMessageRequest) Descriptor Uses

func (*RaftMessageRequest) Descriptor() ([]byte, []int)

func (*RaftMessageRequest) Marshal Uses

func (m *RaftMessageRequest) Marshal() (dAtA []byte, err error)

func (*RaftMessageRequest) MarshalTo Uses

func (m *RaftMessageRequest) MarshalTo(dAtA []byte) (int, error)

func (*RaftMessageRequest) ProtoMessage Uses

func (*RaftMessageRequest) ProtoMessage()

func (*RaftMessageRequest) Reset Uses

func (m *RaftMessageRequest) Reset()

func (*RaftMessageRequest) Size Uses

func (m *RaftMessageRequest) Size() (n int)

func (*RaftMessageRequest) String Uses

func (m *RaftMessageRequest) String() string

func (*RaftMessageRequest) Unmarshal Uses

func (m *RaftMessageRequest) Unmarshal(dAtA []byte) error

func (*RaftMessageRequest) XXX_DiscardUnknown Uses

func (m *RaftMessageRequest) XXX_DiscardUnknown()

func (*RaftMessageRequest) XXX_Marshal Uses

func (m *RaftMessageRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*RaftMessageRequest) XXX_Merge Uses

func (dst *RaftMessageRequest) XXX_Merge(src proto.Message)

func (*RaftMessageRequest) XXX_Size Uses

func (m *RaftMessageRequest) XXX_Size() int

func (*RaftMessageRequest) XXX_Unmarshal Uses

func (m *RaftMessageRequest) XXX_Unmarshal(b []byte) error

type RaftMessageRequestBatch Uses

type RaftMessageRequestBatch struct {
    Requests []RaftMessageRequest `protobuf:"bytes,1,rep,name=requests" json:"requests"`
}

func (*RaftMessageRequestBatch) Descriptor Uses

func (*RaftMessageRequestBatch) Descriptor() ([]byte, []int)

func (*RaftMessageRequestBatch) Marshal Uses

func (m *RaftMessageRequestBatch) Marshal() (dAtA []byte, err error)

func (*RaftMessageRequestBatch) MarshalTo Uses

func (m *RaftMessageRequestBatch) MarshalTo(dAtA []byte) (int, error)

func (*RaftMessageRequestBatch) ProtoMessage Uses

func (*RaftMessageRequestBatch) ProtoMessage()

func (*RaftMessageRequestBatch) Reset Uses

func (m *RaftMessageRequestBatch) Reset()

func (*RaftMessageRequestBatch) Size Uses

func (m *RaftMessageRequestBatch) Size() (n int)

func (*RaftMessageRequestBatch) String Uses

func (m *RaftMessageRequestBatch) String() string

func (*RaftMessageRequestBatch) Unmarshal Uses

func (m *RaftMessageRequestBatch) Unmarshal(dAtA []byte) error

func (*RaftMessageRequestBatch) XXX_DiscardUnknown Uses

func (m *RaftMessageRequestBatch) XXX_DiscardUnknown()

func (*RaftMessageRequestBatch) XXX_Marshal Uses

func (m *RaftMessageRequestBatch) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*RaftMessageRequestBatch) XXX_Merge Uses

func (dst *RaftMessageRequestBatch) XXX_Merge(src proto.Message)

func (*RaftMessageRequestBatch) XXX_Size Uses

func (m *RaftMessageRequestBatch) XXX_Size() int

func (*RaftMessageRequestBatch) XXX_Unmarshal Uses

func (m *RaftMessageRequestBatch) XXX_Unmarshal(b []byte) error

type RaftMessageResponse Uses

type RaftMessageResponse struct {
    RangeID     github_com_cockroachdb_cockroach_pkg_roachpb.RangeID `protobuf:"varint,1,opt,name=range_id,json=rangeId,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.RangeID" json:"range_id"`
    FromReplica roachpb.ReplicaDescriptor                            `protobuf:"bytes,2,opt,name=from_replica,json=fromReplica" json:"from_replica"`
    ToReplica   roachpb.ReplicaDescriptor                            `protobuf:"bytes,3,opt,name=to_replica,json=toReplica" json:"to_replica"`
    Union       RaftMessageResponseUnion                             `protobuf:"bytes,4,opt,name=union" json:"union"`
}

RaftMessageResponse may be sent to the sender of a RaftMessageRequest. RaftMessage does not use the usual request/response pattern; it is primarily modeled as a one-way stream of requests. Normal 'responses' are usually sent as new requests on a separate stream in the other direction. RaftMessageResponse is not sent for every RaftMessageRequest, but may be used for certain error conditions.

func (*RaftMessageResponse) Descriptor Uses

func (*RaftMessageResponse) Descriptor() ([]byte, []int)

func (*RaftMessageResponse) Marshal Uses

func (m *RaftMessageResponse) Marshal() (dAtA []byte, err error)

func (*RaftMessageResponse) MarshalTo Uses

func (m *RaftMessageResponse) MarshalTo(dAtA []byte) (int, error)

func (*RaftMessageResponse) ProtoMessage Uses

func (*RaftMessageResponse) ProtoMessage()

func (*RaftMessageResponse) Reset Uses

func (m *RaftMessageResponse) Reset()

func (*RaftMessageResponse) Size Uses

func (m *RaftMessageResponse) Size() (n int)

func (*RaftMessageResponse) String Uses

func (m *RaftMessageResponse) String() string

func (*RaftMessageResponse) Unmarshal Uses

func (m *RaftMessageResponse) Unmarshal(dAtA []byte) error

func (*RaftMessageResponse) XXX_DiscardUnknown Uses

func (m *RaftMessageResponse) XXX_DiscardUnknown()

func (*RaftMessageResponse) XXX_Marshal Uses

func (m *RaftMessageResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*RaftMessageResponse) XXX_Merge Uses

func (dst *RaftMessageResponse) XXX_Merge(src proto.Message)

func (*RaftMessageResponse) XXX_Size Uses

func (m *RaftMessageResponse) XXX_Size() int

func (*RaftMessageResponse) XXX_Unmarshal Uses

func (m *RaftMessageResponse) XXX_Unmarshal(b []byte) error

type RaftMessageResponseStream Uses

type RaftMessageResponseStream interface {
    Context() context.Context
    Send(*RaftMessageResponse) error
}

RaftMessageResponseStream is the subset of the MultiRaft_RaftMessageServer interface that is needed for sending responses.

type RaftMessageResponseUnion Uses

type RaftMessageResponseUnion struct {
    Error *roachpb.Error `protobuf:"bytes,1,opt,name=error" json:"error,omitempty"`
}

func (*RaftMessageResponseUnion) Descriptor Uses

func (*RaftMessageResponseUnion) Descriptor() ([]byte, []int)

func (*RaftMessageResponseUnion) GetValue Uses

func (this *RaftMessageResponseUnion) GetValue() interface{}

func (*RaftMessageResponseUnion) Marshal Uses

func (m *RaftMessageResponseUnion) Marshal() (dAtA []byte, err error)

func (*RaftMessageResponseUnion) MarshalTo Uses

func (m *RaftMessageResponseUnion) MarshalTo(dAtA []byte) (int, error)

func (*RaftMessageResponseUnion) ProtoMessage Uses

func (*RaftMessageResponseUnion) ProtoMessage()

func (*RaftMessageResponseUnion) Reset Uses

func (m *RaftMessageResponseUnion) Reset()

func (*RaftMessageResponseUnion) SetValue Uses

func (this *RaftMessageResponseUnion) SetValue(value interface{}) bool

func (*RaftMessageResponseUnion) Size Uses

func (m *RaftMessageResponseUnion) Size() (n int)

func (*RaftMessageResponseUnion) String Uses

func (m *RaftMessageResponseUnion) String() string

func (*RaftMessageResponseUnion) Unmarshal Uses

func (m *RaftMessageResponseUnion) Unmarshal(dAtA []byte) error

func (*RaftMessageResponseUnion) XXX_DiscardUnknown Uses

func (m *RaftMessageResponseUnion) XXX_DiscardUnknown()

func (*RaftMessageResponseUnion) XXX_Marshal Uses

func (m *RaftMessageResponseUnion) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*RaftMessageResponseUnion) XXX_Merge Uses

func (dst *RaftMessageResponseUnion) XXX_Merge(src proto.Message)

func (*RaftMessageResponseUnion) XXX_Size Uses

func (m *RaftMessageResponseUnion) XXX_Size() int

func (*RaftMessageResponseUnion) XXX_Unmarshal Uses

func (m *RaftMessageResponseUnion) XXX_Unmarshal(b []byte) error

type RaftTransport Uses

type RaftTransport struct {
    log.AmbientContext
    // contains filtered or unexported fields
}

RaftTransport handles the rpc messages for raft.

The raft transport is asynchronous with respect to the caller, and internally multiplexes outbound messages. Internally, each message is queued on a per-destination queue before being asynchronously delivered.

Callers are required to construct a RaftSender before being able to dispatch messages, and must provide an error handler which will be invoked asynchronously in the event that the recipient of any message closes its inbound RPC stream. This callback is asynchronous with respect to the outbound message which caused the remote to hang up; all that is known is which remote hung up.

func NewDummyRaftTransport Uses

func NewDummyRaftTransport(st *cluster.Settings) *RaftTransport

NewDummyRaftTransport returns a dummy raft transport for use in tests which need a non-nil raft transport that need not function.

func NewRaftTransport Uses

func NewRaftTransport(
    ambient log.AmbientContext,
    st *cluster.Settings,
    dialer *nodedialer.Dialer,
    grpcServer *grpc.Server,
    stopper *stop.Stopper,
) *RaftTransport

NewRaftTransport creates a new RaftTransport.

func (*RaftTransport) Listen Uses

func (t *RaftTransport) Listen(storeID roachpb.StoreID, handler RaftMessageHandler)

Listen registers a raftMessageHandler to receive proxied messages.

func (*RaftTransport) RaftMessageBatch Uses

func (t *RaftTransport) RaftMessageBatch(stream MultiRaft_RaftMessageBatchServer) error

RaftMessageBatch proxies the incoming requests to the listening server interface.

func (*RaftTransport) RaftSnapshot Uses

func (t *RaftTransport) RaftSnapshot(stream MultiRaft_RaftSnapshotServer) error

RaftSnapshot handles incoming streaming snapshot requests.

func (*RaftTransport) SendAsync Uses

func (t *RaftTransport) SendAsync(req *RaftMessageRequest, class rpc.ConnectionClass) (sent bool)

SendAsync sends a message to the recipient specified in the request. It returns false if the outgoing queue is full. The returned bool may be a false positive but will never be a false negative; if sent is true the message may or may not actually be sent but if it's false the message definitely was not sent. If the method does return true, it is not safe to continue using the reference to the provided request.
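
A hedged sketch of handling that backpressure signal (t is an assumed *RaftTransport; rpc.DefaultClass is assumed to be the ordinary connection class):

    if sent := t.SendAsync(req, rpc.DefaultClass); !sent {
        // The per-destination queue was full: the message was dropped and it
        // is up to higher-level Raft machinery to retry (e.g. on a later tick).
    } else {
        // The request was handed off; per the contract above, req must not be
        // reused after this point.
    }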

func (*RaftTransport) SendSnapshot Uses

func (t *RaftTransport) SendSnapshot(
    ctx context.Context,
    raftCfg *base.RaftConfig,
    storePool *StorePool,
    header SnapshotRequest_Header,
    snap *OutgoingSnapshot,
    newBatch func() engine.Batch,
    sent func(),
) error

SendSnapshot streams the given outgoing snapshot. The caller is responsible for closing the OutgoingSnapshot.

func (*RaftTransport) Stop Uses

func (t *RaftTransport) Stop(storeID roachpb.StoreID)

Stop unregisters a raftMessageHandler.

type RangeUsageInfo Uses

type RangeUsageInfo struct {
    LogicalBytes     int64
    QueriesPerSecond float64
    WritesPerSecond  float64
}

RangeUsageInfo contains usage information (sizes and traffic) needed by the allocator to make rebalancing decisions for a given range.

type RemoveOptions Uses

type RemoveOptions struct {
    DestroyData bool
    // contains filtered or unexported fields
}

RemoveOptions bundles boolean parameters for Store.RemoveReplica.

type Replica Uses

type Replica struct {
    log.AmbientContext

    // TODO(tschottdorf): Duplicates r.mu.state.desc.RangeID; revisit that.
    RangeID roachpb.RangeID // Only set by the constructor
    // contains filtered or unexported fields
}

A Replica is a contiguous keyspace with writes managed via an instance of the Raft consensus algorithm. Many ranges may exist in a store and they are unlikely to be contiguous. Ranges are independent units and are responsible for maintaining their own integrity by replacing failed replicas, splitting and merging as appropriate.

func NewReplica Uses

func NewReplica(
    desc *roachpb.RangeDescriptor, store *Store, replicaID roachpb.ReplicaID,
) (*Replica, error)

NewReplica initializes the replica using the given metadata. If the replica is initialized (i.e. desc contains more than a RangeID), replicaID should be 0 and the replicaID will be discovered from the descriptor.

func (*Replica) AbortSpan Uses

func (r *Replica) AbortSpan() *abortspan.AbortSpan

AbortSpan returns the Replica's AbortSpan.

func (*Replica) AdminMerge Uses

func (r *Replica) AdminMerge(
    ctx context.Context, args roachpb.AdminMergeRequest, reason string,
) (roachpb.AdminMergeResponse, *roachpb.Error)

AdminMerge extends this range to subsume the range that comes next in the key space. The merge is performed inside of a distributed transaction which writes the left hand side range descriptor (the subsuming range) and deletes the range descriptor for the right hand side range (the subsumed range). It also updates the range addressing metadata. The handover of responsibility for the reassigned key range is carried out seamlessly through a merge trigger carried out as part of the commit of that transaction. A merge requires that the two ranges are collocated on the same set of replicas.

The supplied RangeDescriptor is used as a form of optimistic lock. See the comment of "AdminSplit" for more information on this pattern.

func (*Replica) AdminSplit Uses

func (r *Replica) AdminSplit(
    ctx context.Context, args roachpb.AdminSplitRequest, reason string,
) (reply roachpb.AdminSplitResponse, _ *roachpb.Error)

AdminSplit divides the range into two ranges using args.SplitKey.
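
A hedged usage sketch (the request field names are assumptions about the roachpb package; splitKey is an arbitrary key inside the range):

    args := roachpb.AdminSplitRequest{
        RequestHeader: roachpb.RequestHeader{Key: splitKey},
        SplitKey:      splitKey,
    }
    if _, pErr := r.AdminSplit(ctx, args, "example split"); pErr != nil {
        return pErr.GoError()
    }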

func (*Replica) AdminTransferLease Uses

func (r *Replica) AdminTransferLease(ctx context.Context, target roachpb.StoreID) error

AdminTransferLease transfers the LeaderLease to another replica. A valid LeaseStatus must be supplied. Only the current holder of the LeaderLease can do a transfer, because it needs to stop serving reads and proposing Raft commands (CPut is a read) after sending the transfer command. If it did not stop serving reads immediately, it would potentially serve reads with timestamps greater than the start timestamp of the new (transferred) lease. More subtly, the replica can't even serve reads or propose commands with timestamps lower than the start of the new lease because it could lead to read your own write violations (see comments on the stasis period in IsLeaseValid). We could, in principle, serve reads more than the maximum clock offset in the past.

The method waits for any in-progress lease extension to be done, and it also blocks until the transfer is done. If a transfer is already in progress, this method joins in waiting for it to complete if it's transferring to the same replica. Otherwise, a NotLeaseHolderError is returned.

func (*Replica) AdminUnsplit Uses

func (r *Replica) AdminUnsplit(
    ctx context.Context, args roachpb.AdminUnsplitRequest, reason string,
) (roachpb.AdminUnsplitResponse, *roachpb.Error)

AdminUnsplit removes the sticky bit of the range specified by the args.Key.

func (*Replica) CanCreateTxnRecord Uses

func (r *Replica) CanCreateTxnRecord(
    txnID uuid.UUID, txnKey []byte, txnMinTS hlc.Timestamp,
) (ok bool, minCommitTS hlc.Timestamp, reason roachpb.TransactionAbortedReason)

CanCreateTxnRecord determines whether a transaction record can be created for the provided transaction information. Callers must provide the transaction's minimum timestamp across all epochs, along with its ID and its key.

If the method returns true, it also returns the minimum provisional commit timestamp that the record can be created with. If the method returns false, it returns the reason that the transaction record was rejected. If the method ever determines that a transaction record must be rejected, it will continue to reject that transaction going forwards.

The method performs two critical roles:

1. It protects against replayed requests or new requests from a transaction's coordinator that could otherwise cause a transaction record to be created after the transaction has already been finalized and its record cleaned up.

2. It serves as the mechanism by which successful push requests convey information to transactions who have not yet written their transaction record. In doing so, it ensures that transaction records are created with a sufficiently high timestamp after a successful PushTxn(TIMESTAMP) and ensures that transaction records are never created at all after a successful PushTxn(ABORT). As a result of this mechanism, a transaction never needs to explicitly create the transaction record for contending transactions.

This is detailed in the transaction record state machine below:

+-----------------------------------+
| vars                              |
|-----------------------------------|
| v1 = rTSCache[txn.id] = timestamp |
| v2 = wTSCache[txn.id] = timestamp |
+-----------------------------------+
| operations                        |
|-----------------------------------|
| v -> t = forward v by timestamp t |
+-----------------------------------+

               PushTxn(TIMESTAMP)                                HeartbeatTxn
               then: v1 -> push.ts                             then: update record
                   +------+                                        +------+
 PushTxn(ABORT)    |      |        BeginTxn or HeartbeatTxn        |      |   PushTxn(TIMESTAMP)
then: v2 -> txn.ts |      v        if: v2 < txn.orig               |      v  then: update record
              +-----------------+  then: txn.ts -> v1      +--------------------+
         +----|                 |  else: fail              |                    |----+
         |    |                 |------------------------->|                    |    |
         |    |  no txn record  |                          | txn record written |    |
         +--->|                 |  EndTxn(STAGING)         |     [pending]      |<---+
              |                 |__  if: v2 < txn.orig     |                    |
              +-----------------+  \__ then: v2 -> txn.ts  +--------------------+
                 |            ^       \__ else: fail       _/   |            ^
                 |            |          \__             _/     |            |

 EndTxn(!STAGING) |            |             \__         _/      | EndTxn(STAGING)
if: v2 < txn.orig |  Eager GC  |                \____ _/______   |              |
then: v2 -> txn.ts|     or     |                   _/         \  |              | HeartbeatTxn
        else: fail| GC queue   | /----------------/            \ |              | if: epoch update

                 v            | v    EndTxn(!STAGING)        v  v            |
             +--------------------+  or PushTxn(ABORT)     +--------------------+
             |                    |  then: v2 -> txn.ts    |                    |
        +--->|                    |<-----------------------|                    |----+
        |    | txn record written |                        | txn record written |    |
        |    |     [finalized]    |                        |      [staging]     |    |
        +----|                    |                        |                    |<---+
PushTxn(*)   +--------------------+                        +--------------------+
then: no-op                    ^   PushTxn(*) + RecoverTxn    |              EndTxn(STAGING)
                               |     then: v2 -> txn.ts       |              or HeartbeatTxn
                               +------------------------------+            then: update record

In the diagram, CanCreateTxnRecord is consulted in all three of the state transitions that move away from the "no txn record" state. Updating v1 and v2 is performed in updateTimestampCache.

There are three separate simplifications to the transaction model that would allow us to simplify this state machine:

1. As discussed in the comment on txnHeartbeater, it is reasonable to expect that we will eventually move away from tracking transaction liveness on a per-transaction basis. This means that we would no longer need transaction heartbeats and would never need to write a transaction record until a transaction is ready to complete.

2. One of the two possibilities for the "txn record written [finalized]" state is that the transaction record is aborted. There used to be two reasons to persist transaction records with the ABORTED status. The first was because doing so was the only way for concurrent actors to prevent the record from being re-written by the transaction going forward. The concurrent actor would write an aborted transaction record and then wait for the GC to clean it up later. The other reason for writing transaction records with the ABORTED status was that these records could point at intents, which assisted the cleanup process for these intents. However, this only held for ABORTED records written on behalf of the transaction coordinator itself. If a transaction was aborted by a concurrent actor, its record would not immediately contain any of the transaction's intents.

The first reason here no longer holds. Concurrent actors now bump the write
timestamp cache when aborting a transaction, which has the same effect as
writing an ABORTED transaction record. The second reason still holds but is
fairly weak. A transaction coordinator can kick off intent resolution for an
aborted transaction without needing to write these intents into the record
itself. In the worst case, this intent resolution fails and each intent is
cleaned up individually as it is discovered. All in all, neither
justification for this state holds much weight anymore.

3. The other possibility for the "txn record written [finalized]" state is that the transaction record is committed. This state is currently critical for the transaction model because intent resolution cannot begin before a transaction record enters this state. However, this doesn't need to be the case forever. There are proposals to modify the state of committed key-value writes slightly such that intent resolution could be run for implicitly committed transactions while their transaction record remains in the "txn record written [staging]" state. For this to work, the recovery mechanism for indeterminate commit errors would need to be able to determine whether an intent or a **committed value** indicated the success of a write that was in-flight at the time the transaction record was staged. This poses challenges for migration and garbage collection, but it would have a number of performance benefits.

If we were to perform change #1, we could remove the "txn record written [pending]" state. If we were to perform changes #2 and #3, we could remove the "txn record written [finalized]" state. Altogether, this would leave us with only two states that the transaction record could be in, written or not written. At that point, it begins to closely resemble any other write in the system.

func (*Replica) ChangeReplicas Uses

func (r *Replica) ChangeReplicas(
    ctx context.Context,
    desc *roachpb.RangeDescriptor,
    priority SnapshotRequest_Priority,
    reason storagepb.RangeLogEventReason,
    details string,
    chgs roachpb.ReplicationChanges,
) (updatedDesc *roachpb.RangeDescriptor, _ error)

ChangeReplicas atomically changes the replicas that are members of a range. The change is performed in a distributed transaction and takes effect when that transaction is committed. This transaction confirms that the supplied RangeDescriptor is up to date and that the supplied slice of ReplicationChanges is a valid transition, meaning that replicas being added are not present, that replicas being removed are present, that no replica is altered more than once, and that no attempt is made at removing the leaseholder (which in particular implies that we can never remove all replicas).

The returned RangeDescriptor is the new value of the range's descriptor following the successful commit of the transaction.

In general, ChangeReplicas will carry out the following steps.

1. Run a distributed transaction that adds all new replicas as learner replicas.

Learner replicas receive the log, but do not have voting rights. They are
used to catch up these new replicas before turning them into voters, which
is important for the continued availability of the range throughout the
replication change. Learners are added (and removed) one by one due to a
technicality (see https://github.com/cockroachdb/cockroach/pull/40268).

The distributed transaction updates both copies of the range descriptor
(the one on the range and that in the meta ranges) to that effect, and
commits with a special trigger instructing Raft (via ProposeConfChange) to
tie a corresponding replication configuration change which goes into
effect (on each replica) when the transaction commit is applied to the
state. Applying the command also updates each replica's local view of
the state to reflect the new descriptor.

If no replicas are being added, this first step is elided.

2. Send Raft snapshots to all learner replicas. This would happen automatically by the existing recovery mechanisms (raft snapshot queue), but it is done explicitly as a convenient way to ensure learners are caught up before the next step is entered. (We ensure that work is not duplicated between the snapshot queue and the explicit snapshot via the snapshotLogTruncationConstraints map.) Snapshots are subject to both bandwidth rate limiting and throttling.

If no replicas are being added, this step is similarly elided.

3. Carry out a distributed transaction similar to that which added the learner replicas, except this time it (atomically) changes all learners to voters and removes any replicas for which this was requested; voters are demoted before actually being removed to avoid a bug in etcd/raft: see https://github.com/cockroachdb/cockroach/pull/40268.

If only one replica is being added, raft can choose the simple configuration change protocol; otherwise it has to use joint consensus. In this latter mechanism, a first configuration change is made which results in a configuration ("joint configuration") in which a quorum of both the old and the new replica sets is required for decision making. Transitioning into this joint configuration, the RangeDescriptor (which is the source of truth of the replication configuration) is updated with corresponding replicas of type VOTER_INCOMING and VOTER_OUTGOING. Immediately after committing this change, a second transition updates the descriptor with, and activates, the final configuration.

Concretely, if the initial members of the range are s1/1, s2/2, and s3/3, and an atomic membership change were to add s4/4 and s5/5 while removing s1/1 and s2/2, the following range descriptors would form the overall transition:

1. s1/1 s2/2 s3/3 (VOTER_FULL is implied)
2. s1/1 s2/2 s3/3 s4/4LEARNER
3. s1/1 s2/2 s3/3 s4/4LEARNER s5/5LEARNER
4. s1/1VOTER_DEMOTING s2/2VOTER_DEMOTING s3/3 s4/4VOTER_INCOMING s5/5VOTER_INCOMING
5. s1/1LEARNER s2/2LEARNER s3/3 s4/4 s5/5
6. s2/2LEARNER s3/3 s4/4 s5/5
7. s3/3 s4/4 s5/5

A replica that learns that it was removed will queue itself for replicaGC. Note that a removed replica may never apply the configuration change removing itself and thus this trigger may not fire. This is because said replica may not have been a part of the quorum that committed the configuration change; nodes that apply the change will stop sending messages to the removed replica. At that point, the removed replica will typically campaign (since it receives no more heartbeats from the leader) and its former peers respond via a RaftGroupDeletedError (from the Raft transport) as a signal to queue to replicaGC. This second mechanism fails if all peers have rapidly moved elsewhere as well; in that last and rare case, replica GC queue will eventually discover the replica on its own; it has optimizations that handle "abandoned-looking" replicas more eagerly than healthy ones.
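
A hedged sketch of adding a single replica follows; the change-type, priority, and reason constants shown are assumptions about this package and roachpb, not verified names:

    chgs := roachpb.ReplicationChanges{{
        ChangeType: roachpb.ADD_REPLICA,
        Target:     roachpb.ReplicationTarget{NodeID: 4, StoreID: 4},
    }}
    desc := r.Desc() // must be up to date; it acts as an optimistic lock
    newDesc, err := r.ChangeReplicas(
        ctx, desc, SnapshotRequest_REBALANCE, storagepb.ReasonAdminRequest, "example", chgs,
    )
    if err != nil {
        return err
    }
    _ = newDesc // the committed descriptor reflecting the new membership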

func (*Replica) CheckConsistency Uses

func (r *Replica) CheckConsistency(
    ctx context.Context, args roachpb.CheckConsistencyRequest,
) (roachpb.CheckConsistencyResponse, *roachpb.Error)

CheckConsistency runs a consistency check on the range. It first applies a ComputeChecksum through Raft and then issues CollectChecksum commands to the other replicas. These are inspected and a CheckConsistencyResponse is assembled.

When args.Mode is CHECK_VIA_QUEUE and an inconsistency is detected and no diff was requested, the consistency check will be re-run to collect a diff, which is then printed before calling `log.Fatal`. This behavior should be lifted to the consistency checker queue in the future.

func (*Replica) Clock Uses

func (r *Replica) Clock() *hlc.Clock

Clock returns the hlc clock shared by this replica.

func (*Replica) ClusterSettings Uses

func (r *Replica) ClusterSettings() *cluster.Settings

ClusterSettings returns the node's ClusterSettings.

func (*Replica) ContainsKey Uses

func (r *Replica) ContainsKey(key roachpb.Key) bool

ContainsKey returns whether this range contains the specified key.

TODO(bdarnell): This is not the same as RangeDescriptor.ContainsKey.

func (*Replica) ContainsKeyRange Uses

func (r *Replica) ContainsKeyRange(start, end roachpb.Key) bool

ContainsKeyRange returns whether this range contains the specified key range from start to end.

func (*Replica) DB Uses

func (r *Replica) DB() *client.DB

DB returns the Replica's client DB.

func (*Replica) Desc Uses

func (r *Replica) Desc() *roachpb.RangeDescriptor

Desc returns the authoritative range descriptor, acquiring a replica lock in the process.

func (*Replica) DescAndZone Uses

func (r *Replica) DescAndZone() (*roachpb.RangeDescriptor, *config.ZoneConfig)

DescAndZone returns the authoritative range descriptor as well as the zone config for the replica.

func (*Replica) EmitMLAI Uses

func (r *Replica) EmitMLAI()

EmitMLAI registers the replica's last assigned max lease index with the closed timestamp tracker. This is called to emit an update about this replica in the absence of write activity.

func (*Replica) Engine Uses

func (r *Replica) Engine() engine.Engine

Engine returns the Replica's underlying Engine. In most cases the evaluation Batch should be used instead.

func (*Replica) EvalKnobs Uses

func (r *Replica) EvalKnobs() storagebase.BatchEvalTestingKnobs

EvalKnobs returns the EvalContext's Knobs.

func (*Replica) GetExternalStorage Uses

func (r *Replica) GetExternalStorage(
    ctx context.Context, dest roachpb.ExternalStorage,
) (cloud.ExternalStorage, error)

GetExternalStorage returns an ExternalStorage object, based on information parsed from a URI, stored in `dest`.

func (*Replica) GetExternalStorageFromURI Uses

func (r *Replica) GetExternalStorageFromURI(
    ctx context.Context, uri string,
) (cloud.ExternalStorage, error)

GetExternalStorageFromURI returns an ExternalStorage object, based on the given URI.

func (*Replica) GetFirstIndex Uses

func (r *Replica) GetFirstIndex() (uint64, error)

GetFirstIndex is the same function as raftFirstIndexLocked but it requires that r.mu is not held.

func (*Replica) GetGCThreshold Uses

func (r *Replica) GetGCThreshold() hlc.Timestamp

GetGCThreshold returns the GC threshold.

func (*Replica) GetLastReplicaGCTimestamp Uses

func (r *Replica) GetLastReplicaGCTimestamp(ctx context.Context) (hlc.Timestamp, error)

GetLastReplicaGCTimestamp reads the timestamp at which the replica was last checked for removal by the replica gc queue.

func (*Replica) GetLease Uses

func (r *Replica) GetLease() (roachpb.Lease, roachpb.Lease)

GetLease returns the lease and, if available, the proposed next lease.

func (*Replica) GetLeaseAppliedIndex Uses

func (r *Replica) GetLeaseAppliedIndex() uint64

GetLeaseAppliedIndex returns the lease index of the last applied command.

func (*Replica) GetLeaseHistory Uses

func (r *Replica) GetLeaseHistory() []roachpb.Lease

GetLeaseHistory returns the lease history stored on this replica.

func (*Replica) GetLimiters Uses

func (r *Replica) GetLimiters() *batcheval.Limiters

GetLimiters returns the Replica's limiters.

func (*Replica) GetMVCCStats Uses

func (r *Replica) GetMVCCStats() enginepb.MVCCStats

GetMVCCStats returns a copy of the MVCC stats object for this range. This accessor is thread-safe, but provides no guarantees about its synchronization with any concurrent writes.

func (*Replica) GetMaxBytes Uses

func (r *Replica) GetMaxBytes() int64

GetMaxBytes gets the replica's maximum byte threshold.

func (*Replica) GetMinBytes Uses

func (r *Replica) GetMinBytes() int64

GetMinBytes gets the replica's minimum byte threshold.

func (*Replica) GetNodeLocality Uses

func (r *Replica) GetNodeLocality() roachpb.Locality

GetNodeLocality returns the locality of the node this replica belongs to.

func (*Replica) GetRangeID Uses

func (r *Replica) GetRangeID() roachpb.RangeID

GetRangeID returns the Range ID.

func (*Replica) GetReplicaDescriptor Uses

func (r *Replica) GetReplicaDescriptor() (roachpb.ReplicaDescriptor, error)

GetReplicaDescriptor returns the replica for this range from the range descriptor. Returns a *RangeNotFoundError if the replica is not found. No other errors are returned.

func (*Replica) GetSnapshot Uses

func (r *Replica) GetSnapshot(
    ctx context.Context, snapType SnapshotRequest_Type, recipientStore roachpb.StoreID,
) (_ *OutgoingSnapshot, err error)

GetSnapshot returns a snapshot of the replica appropriate for sending to a replica. If this method returns without error, callers must eventually call OutgoingSnapshot.Close.
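
A hedged sketch honoring the Close contract (the snapshot type constant is assumed; recipient is the destination store ID):

    snap, err := r.GetSnapshot(ctx, SnapshotRequest_RAFT, recipient)
    if err != nil {
        return err
    }
    // Close releases the engine snapshot and the range iterator once the
    // snapshot has been streamed (or abandoned).
    defer snap.Close()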

func (*Replica) GetSplitQPS Uses

func (r *Replica) GetSplitQPS() float64

GetSplitQPS returns the Replica's queries/s request rate.

NOTE: This should only be used for load-based splitting, and it only works when the load-based splitting cluster setting is enabled.

Use QueriesPerSecond() for current QPS stats for all other purposes.

func (*Replica) GetTerm Uses

func (r *Replica) GetTerm(i uint64) (uint64, error)

GetTerm returns the term of the given index in the raft log.

func (*Replica) GetTxnWaitQueue Uses

func (r *Replica) GetTxnWaitQueue() *txnwait.Queue

GetTxnWaitQueue returns the Replica's txnwait.Queue.

func (*Replica) IsDestroyed Uses

func (r *Replica) IsDestroyed() (DestroyReason, error)

IsDestroyed returns a non-nil error if the replica has been destroyed and the reason if it has.

func (*Replica) IsFirstRange Uses

func (r *Replica) IsFirstRange() bool

IsFirstRange returns true if this is the first range.

func (*Replica) IsInitialized Uses

func (r *Replica) IsInitialized() bool

IsInitialized is true if we know the metadata of this range, either because we created it or we have received an initial snapshot from another node. It is false when a range has been created in response to an incoming message but we are waiting for our initial snapshot.

func (*Replica) IsLeaseValid Uses

func (r *Replica) IsLeaseValid(lease roachpb.Lease, ts hlc.Timestamp) bool

IsLeaseValid returns true if the replica's lease is owned by this replica and is valid (not expired, not in stasis).

func (*Replica) LastReplicaAdded Uses

func (r *Replica) LastReplicaAdded() (roachpb.ReplicaID, time.Time)

LastReplicaAdded returns the ID of the most recently added replica and the time at which it was added.

func (*Replica) Less Uses

func (r *Replica) Less(i btree.Item) bool

Less implements the btree.Item interface.

func (*Replica) MaybeGossipNodeLiveness Uses

func (r *Replica) MaybeGossipNodeLiveness(ctx context.Context, span roachpb.Span) error

MaybeGossipNodeLiveness gossips information for all node liveness records stored on this range. To scan and gossip, this replica must hold the lease to a range which contains some or all of the node liveness records. After scanning the records, it checks against what's already in gossip and only gossips records which are out of date.

func (*Replica) MaybeGossipSystemConfig Uses

func (r *Replica) MaybeGossipSystemConfig(ctx context.Context) error

MaybeGossipSystemConfig scans the entire SystemConfig span and gossips it. Further calls come from the trigger on EndTransaction or range lease acquisition.

Note that MaybeGossipSystemConfig gossips information only when the lease is actually held. The method does not request a range lease here since RequestLease and applyRaftCommand call the method and we need to avoid deadlocking in redirectOnOrAcquireLease.

MaybeGossipSystemConfig must only be called from Raft commands (which provide the necessary serialization to avoid data races).

TODO(nvanbenschoten,bdarnell): even though this is best effort, we should log louder when we continually fail to gossip system config.

func (*Replica) Metrics Uses

func (r *Replica) Metrics(
    ctx context.Context, now hlc.Timestamp, livenessMap IsLiveMap, clusterNodes int,
) ReplicaMetrics

Metrics returns the current metrics for the replica.

func (*Replica) NodeID Uses

func (r *Replica) NodeID() roachpb.NodeID

NodeID returns the ID of the node this replica belongs to.

func (*Replica) OwnsValidLease Uses

func (r *Replica) OwnsValidLease(ts hlc.Timestamp) bool

OwnsValidLease returns whether this replica is the current valid leaseholder. Note that this method does not check to see if a transfer is pending, but returns the status of the current lease and ownership at the specified point in time.

func (*Replica) QueriesPerSecond Uses

func (r *Replica) QueriesPerSecond() float64

QueriesPerSecond returns the range's average QPS if it is the current leaseholder. If it isn't, this will return 0 because the replica does not know about the reads that the leaseholder is serving.

A "Query" is a BatchRequest (regardless of its contents) arriving at the leaseholder with a gateway node set in the header (i.e. excluding requests that weren't sent through a DistSender, which in practice should be practically none).

func (*Replica) RaftStatus Uses

func (r *Replica) RaftStatus() *raft.Status

RaftStatus returns the current raft status of the replica. It returns nil if the Raft group has not been initialized yet.

func (*Replica) RangeFeed Uses

func (r *Replica) RangeFeed(
    args *roachpb.RangeFeedRequest, stream roachpb.Internal_RangeFeedServer,
) *roachpb.Error

RangeFeed registers a rangefeed over the specified span. It sends updates to the provided stream and returns with an optional error when the rangefeed is complete. The provided ConcurrentRequestLimiter is used to limit the number of rangefeeds using catchup iterators at the same time.

func (*Replica) ReplicaID Uses

func (r *Replica) ReplicaID() roachpb.ReplicaID

ReplicaID returns the ID for the Replica. It may be zero if the replica does not know its ID. Once a Replica has a non-zero ReplicaID it will never change.

func (*Replica) RunConsistencyCheck Uses

func (r *Replica) RunConsistencyCheck(
    ctx context.Context, req roachpb.ComputeChecksumRequest,
) ([]ConsistencyCheckResult, error)

RunConsistencyCheck carries out a round of CheckConsistency/CollectChecksum for the members of this range, returning the results (which it does not act upon). The first result will belong to the local replica, and in particular there is a first result when no error is returned.

func (*Replica) Send Uses

func (r *Replica) Send(
    ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error)

Send executes a command on this range, dispatching it to the read-only, read-write, or admin execution path as appropriate. ctx should contain the log tags from the store (and up).

func (*Replica) SetZoneConfig Uses

func (r *Replica) SetZoneConfig(zone *config.ZoneConfig)

SetZoneConfig sets the replica's zone config.

func (*Replica) SplitByLoadEnabled Uses

func (r *Replica) SplitByLoadEnabled() bool

SplitByLoadEnabled returns whether load based splitting is enabled. Although this is a method of *Replica, the configuration is really global, shared across all stores.

func (*Replica) SplitByLoadQPSThreshold Uses

func (r *Replica) SplitByLoadQPSThreshold() float64

SplitByLoadQPSThreshold returns the QPS request rate for a given replica.

func (*Replica) State Uses

func (r *Replica) State() storagepb.RangeInfo

State returns a copy of the internal state of the Replica, along with some auxiliary information.

func (*Replica) StoreID Uses

func (r *Replica) StoreID() roachpb.StoreID

StoreID returns the Replica's StoreID.

func (*Replica) String Uses

func (r *Replica) String() string

String returns the string representation of the replica using an inconsistent copy of the range descriptor. Therefore, String does not require a lock and its output may not be atomic with other ongoing work in the replica. This is done to prevent deadlocks in logging sites.

func (*Replica) WritesPerSecond Uses

func (r *Replica) WritesPerSecond() float64

WritesPerSecond returns the range's average keys written per second. A "Write" is a mutation applied by Raft as measured by engine.RocksDBBatchCount(writeBatch). This corresponds roughly to the number of keys mutated by a write. For example, writing 12 intents would count as 24 writes (12 for the metadata, 12 for the versions). A DeleteRange that ultimately only removes one key counts as one (or two if it's transactional).

type ReplicaChecksum Uses

type ReplicaChecksum struct {
    CollectChecksumResponse
    // contains filtered or unexported fields
}

ReplicaChecksum contains progress on a replica checksum computation.

type ReplicaGCQueueMetrics Uses

type ReplicaGCQueueMetrics struct {
    RemoveReplicaCount *metric.Counter
}

ReplicaGCQueueMetrics is the set of metrics for the replica GC queue.

type ReplicaMetrics Uses

type ReplicaMetrics struct {
    Leader      bool
    LeaseValid  bool
    Leaseholder bool
    LeaseType   roachpb.LeaseType
    LeaseStatus storagepb.LeaseStatus

    // Quiescent indicates whether the replica believes itself to be quiesced.
    Quiescent bool
    // Ticking indicates whether the store is ticking the replica. It should be
    // the opposite of Quiescent.
    Ticking bool

    // Is this the replica which collects per-range metrics? This is done either
    // on the leader or, if there is no leader, on the largest live replica ID.
    RangeCounter    bool
    Unavailable     bool
    Underreplicated bool
    Overreplicated  bool
    BehindCount     int64
    LatchInfoLocal  storagepb.LatchManagerInfo
    LatchInfoGlobal storagepb.LatchManagerInfo
    RaftLogTooLarge bool
}

ReplicaMetrics contains details on the current status of the replica.

type ReplicaPlaceholder Uses

type ReplicaPlaceholder struct {
    // contains filtered or unexported fields
}

ReplicaPlaceholder represents a "lock" of a part of the keyspace on a given *Store for the application of a (preemptive or Raft) snapshot. Placeholders are kept synchronously in two places in (*Store).mu, namely the replicaPlaceholders and replicasByKey maps, and exist only while the Raft scheduler tries to apply raft.Ready containing a snapshot to some Replica.

To see why placeholders are necessary, consider the case in which two snapshots arrive at a Store, one for r1 and bounds [a,c) and the other for r2 and [b,c), and assume that the keyspace [a,c) is not associated to any Replica on the receiving Store. This situation can occur because even though "logically" the keyspace always shards cleanly into replicas, incoming snapshots don't always originate from a mutually consistent version of this sharding. For example, a range Q might split, creating a range R, but some Store might be receiving a snapshot of Q before the split as well as a replica of R (which postdates the split). Similar examples are possible with merges as well as with arbitrarily complicated combinations of multiple merges and splits.

Without placeholders, the following interleaving of two concurrent Raft scheduler goroutines g1 and g2 is possible for the above example:

- g1: new raft.Ready for r1 wants to apply snapshot
- g1: check for conflicts with existing replicas: none found; [a,c) is empty
- g2: new raft.Ready for r2 wants to apply snapshot
- g2: check for conflicts with existing replicas: none found; [b,c) is empty
- g2: apply snapshot: writes replica for r2 to [b,c)
- g2: done
- g1: apply snapshot: writes replica for r1 to [a,c)
- boom: we now have two replicas on this store that overlap

Placeholders avoid this problem because they provide a serialization point between g1 and g2: When g1 checks for conflicts, it also checks for an existing placeholder (inserting its own atomically when none found), so that g2 would later fail the overlap check on g1's placeholder.

Placeholders are removed by the goroutine that inserted them at the end of the respective Raft cycle, so they usually live only for as long as it takes to write the snapshot to disk. See (*Store).processRaftSnapshotRequest for details.
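
The serialization point can be illustrated with a toy version of the check-then-claim step. This is a simplified standalone sketch, not the Store's actual bookkeeping: both the overlap check and the placeholder insertion happen under one lock, so whichever of g1 and g2 claims first causes the other to fail its check.

package main

import (
    "fmt"
    "sync"
)

// span is a simplified [start, end) key range standing in for a snapshot's bounds.
type span struct{ start, end string }

func (a span) overlaps(b span) bool { return a.start < b.end && b.start < a.end }

// placeholderTable is a toy stand-in for the placeholder bookkeeping kept under
// (*Store).mu: the overlap check and the claim happen in one critical section.
type placeholderTable struct {
    mu     sync.Mutex
    claims []span
}

// tryClaim checks for conflicts and, only if none exist, inserts a placeholder
// in the same critical section. This is the serialization point between g1 and g2.
func (t *placeholderTable) tryClaim(s span) bool {
    t.mu.Lock()
    defer t.mu.Unlock()
    for _, c := range t.claims {
        if c.overlaps(s) {
            return false // another snapshot already claimed overlapping keys
        }
    }
    t.claims = append(t.claims, s)
    return true
}

func main() {
    var t placeholderTable
    var wg sync.WaitGroup
    for _, s := range []span{{"a", "c"}, {"b", "c"}} { // the r1 / r2 example above
        wg.Add(1)
        go func(s span) {
            defer wg.Done()
            fmt.Printf("snapshot %v claimed=%v\n", s, t.tryClaim(s))
        }(s)
    }
    wg.Wait()
}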

func (*ReplicaPlaceholder) Desc Uses

func (r *ReplicaPlaceholder) Desc() *roachpb.RangeDescriptor

Desc returns the range Placeholder's descriptor.

func (*ReplicaPlaceholder) Less Uses

func (r *ReplicaPlaceholder) Less(i btree.Item) bool

Less implements the btree.Item interface.

func (*ReplicaPlaceholder) String Uses

func (r *ReplicaPlaceholder) String() string

type ReplicaSnapshotDiff Uses

type ReplicaSnapshotDiff struct {
    // LeaseHolder is set to true if this kv pair is only present on the lease
    // holder.
    LeaseHolder bool
    Key         roachpb.Key
    Timestamp   hlc.Timestamp
    Value       []byte
}

ReplicaSnapshotDiff is a part of a []ReplicaSnapshotDiff which represents a diff between two replica snapshots. For now it's only a diff between their KV pairs.

type ReplicaSnapshotDiffSlice Uses

type ReplicaSnapshotDiffSlice []ReplicaSnapshotDiff

ReplicaSnapshotDiffSlice groups multiple ReplicaSnapshotDiff records and exposes a formatting helper.

func (ReplicaSnapshotDiffSlice) String Uses

func (rsds ReplicaSnapshotDiffSlice) String() string

func (ReplicaSnapshotDiffSlice) WriteTo Uses

func (rsds ReplicaSnapshotDiffSlice) WriteTo(w io.Writer) (int64, error)

WriteTo writes a string representation of itself to the given writer.

type ReplicateQueueMetrics Uses

type ReplicateQueueMetrics struct {
    AddReplicaCount           *metric.Counter
    RemoveReplicaCount        *metric.Counter
    RemoveDeadReplicaCount    *metric.Counter
    RemoveLearnerReplicaCount *metric.Counter
    RebalanceReplicaCount     *metric.Counter
    TransferLeaseCount        *metric.Counter
}

ReplicateQueueMetrics is the set of metrics for the replicate queue.

type SSTSnapshotStorage Uses

type SSTSnapshotStorage struct {
    // contains filtered or unexported fields
}

SSTSnapshotStorage provides an interface to create scratches and owns the directory of scratches created. A scratch manages the SSTs created during a specific snapshot.

func NewSSTSnapshotStorage Uses

func NewSSTSnapshotStorage(engine engine.Engine, limiter *rate.Limiter) SSTSnapshotStorage

NewSSTSnapshotStorage creates a new SST snapshot storage.

func (*SSTSnapshotStorage) Clear Uses

func (sss *SSTSnapshotStorage) Clear() error

Clear removes all created directories and SSTs.

func (*SSTSnapshotStorage) NewSSTSnapshotStorageScratch Uses

func (sss *SSTSnapshotStorage) NewSSTSnapshotStorageScratch(
    rangeID roachpb.RangeID, snapUUID uuid.UUID,
) *SSTSnapshotStorageScratch

NewSSTSnapshotStorageScratch creates a new SST snapshot storage scratch for a specific snapshot.

type SSTSnapshotStorageFile Uses

type SSTSnapshotStorageFile struct {
    // contains filtered or unexported fields
}

SSTSnapshotStorageFile is an SST file managed by a SSTSnapshotStorageScratch.

func (*SSTSnapshotStorageFile) Close Uses

func (sssf *SSTSnapshotStorageFile) Close() error

Close closes the file. Calling this function multiple times is idempotent. The file must have been written to before being closed.

func (*SSTSnapshotStorageFile) Sync Uses

func (sssf *SSTSnapshotStorageFile) Sync() error

Sync syncs the file to disk. Implements writeCloseSyncer in engine.

func (*SSTSnapshotStorageFile) Write Uses

func (sssf *SSTSnapshotStorageFile) Write(contents []byte) (int, error)

Write writes contents to the file while respecting the limiter passed into SSTSnapshotStorageScratch. Writing empty contents is okay and is treated as a noop. The file must not have been closed.

type SSTSnapshotStorageScratch Uses

type SSTSnapshotStorageScratch struct {
    // contains filtered or unexported fields
}

SSTSnapshotStorageScratch keeps track of the SST files incrementally created when receiving a snapshot. Each scratch is associated with a specific snapshot.

func (*SSTSnapshotStorageScratch) Clear Uses

func (ssss *SSTSnapshotStorageScratch) Clear() error

Clear removes the directory and SSTs created for a particular snapshot.

func (*SSTSnapshotStorageScratch) NewFile Uses

func (ssss *SSTSnapshotStorageScratch) NewFile(
    ctx context.Context, chunkSize int64,
) (*SSTSnapshotStorageFile, error)

NewFile adds another file to SSTSnapshotStorageScratch. This file is lazily created when the file is written to the first time. A nonzero value for chunkSize buffers up writes until the buffer is greater than chunkSize.

func (*SSTSnapshotStorageScratch) SSTs Uses

func (ssss *SSTSnapshotStorageScratch) SSTs() []string

SSTs returns the names of the files created.

func (*SSTSnapshotStorageScratch) WriteSST Uses

func (ssss *SSTSnapshotStorageScratch) WriteSST(ctx context.Context, data []byte) error

WriteSST writes SST data to a file. The method closes the provided SST when it is finished using it. If the provided SST is empty, then no file will be created and nothing will be written.
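
The types above compose into a simple lifecycle: create the storage once per engine, create one scratch per incoming snapshot, write SSTs into it, and collect or clear them when done. The following is a minimal sketch of that flow under stated assumptions: it is not part of the package, and eng, limiter, rangeID, snapUUID, and ssts are placeholders supplied by the caller.

// writeSnapshotSSTs is a minimal sketch, not part of the package: eng, limiter,
// rangeID, snapUUID, and ssts are assumed to be supplied by the caller.
func writeSnapshotSSTs(
    ctx context.Context,
    eng engine.Engine,
    limiter *rate.Limiter,
    rangeID roachpb.RangeID,
    snapUUID uuid.UUID,
    ssts [][]byte,
) ([]string, error) {
    storage := NewSSTSnapshotStorage(eng, limiter)
    scratch := storage.NewSSTSnapshotStorageScratch(rangeID, snapUUID)
    for _, data := range ssts {
        // Empty SSTs are skipped by WriteSST, so no special-casing is needed here.
        if err := scratch.WriteSST(ctx, data); err != nil {
            // Best effort: remove whatever was created for this snapshot.
            _ = scratch.Clear()
            return nil, err
        }
    }
    // SSTs returns the names of the files created for this snapshot.
    return scratch.SSTs(), nil
}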

type Server Uses

type Server struct {
    // contains filtered or unexported fields
}

Server implements PerReplicaServer.

func MakeServer Uses

func MakeServer(descriptor *roachpb.NodeDescriptor, stores *Stores) Server

MakeServer returns a new instance of Server.

func (Server) CollectChecksum Uses

func (is Server) CollectChecksum(
    ctx context.Context, req *CollectChecksumRequest,
) (*CollectChecksumResponse, error)

CollectChecksum implements PerReplicaServer.

func (Server) WaitForApplication Uses

func (is Server) WaitForApplication(
    ctx context.Context, req *WaitForApplicationRequest,
) (*WaitForApplicationResponse, error)

WaitForApplication implements PerReplicaServer.

It is the caller's responsibility to cancel or set a timeout on the context. If the context is never canceled, WaitForApplication will retry forever.

func (Server) WaitForReplicaInit Uses

func (is Server) WaitForReplicaInit(
    ctx context.Context, req *WaitForReplicaInitRequest,
) (*WaitForReplicaInitResponse, error)

WaitForReplicaInit implements PerReplicaServer.

It is the caller's responsibility to cancel or set a timeout on the context. If the context is never canceled, WaitForReplicaInit will retry forever.
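
Because both WaitForApplication and WaitForReplicaInit retry until their context ends, callers typically wrap them with a deadline. A minimal sketch, assuming srv and req are built elsewhere:

// waitForInit is a minimal sketch, not part of the package: it bounds
// WaitForReplicaInit with a timeout so the call cannot retry forever.
// srv and req are assumed to be built by the caller.
func waitForInit(srv Server, req *WaitForReplicaInitRequest) (*WaitForReplicaInitResponse, error) {
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel() // always release the timer
    return srv.WaitForReplicaInit(ctx, req)
}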

type SideloadStorage Uses

type SideloadStorage interface {
    // The directory in which the sideloaded files are stored. May or may not
    // exist.
    Dir() string
    // Writes the given contents to the file specified by the given index and
    // term. Overwrites the file if it already exists.
    Put(_ context.Context, index, term uint64, contents []byte) error
    // Load the file at the given index and term. Return errSideloadedFileNotFound when no
    // such file is present.
    Get(_ context.Context, index, term uint64) ([]byte, error)
    // Purge removes the file at the given index and term. It may also
    // remove any leftover files at the same index and earlier terms, but
    // is not required to do so. When no file at the given index and term
    // exists, returns errSideloadedFileNotFound.
    //
    // Returns the total size of the purged payloads.
    Purge(_ context.Context, index, term uint64) (int64, error)
    // Clear files that may have been written by this SideloadStorage.
    Clear(context.Context) error
    // TruncateTo removes all files belonging to an index strictly smaller than
    // the given one. Returns the number of bytes freed, the number of bytes in
    // files that remain, or an error.
    TruncateTo(_ context.Context, index uint64) (freed, retained int64, _ error)
    // Returns an absolute path to the file that Get() would return the contents
    // of. Does not check whether the file actually exists.
    Filename(_ context.Context, index, term uint64) (string, error)
}

SideloadStorage is the interface used for Raft SSTable sideloading. Implementations do not need to be thread safe.
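
To make the contract concrete, here is a toy in-memory sketch of an implementation. It is not the package's sideloaded storage; errNotFound stands in for the unexported errSideloadedFileNotFound, and sizes are the byte lengths of the stored payloads.

// errNotFound stands in for the package's unexported errSideloadedFileNotFound.
var errNotFound = errors.New("sideloaded file not found")

type entryKey struct{ index, term uint64 }

// memSideload is a toy in-memory illustration of the SideloadStorage contract;
// it is not the package's implementation.
type memSideload struct {
    dir   string
    files map[entryKey][]byte
}

var _ SideloadStorage = (*memSideload)(nil) // compile-time interface check

func (m *memSideload) Dir() string { return m.dir }

func (m *memSideload) Put(_ context.Context, index, term uint64, contents []byte) error {
    if m.files == nil {
        m.files = map[entryKey][]byte{}
    }
    m.files[entryKey{index, term}] = append([]byte(nil), contents...) // overwrites if present
    return nil
}

func (m *memSideload) Get(_ context.Context, index, term uint64) ([]byte, error) {
    b, ok := m.files[entryKey{index, term}]
    if !ok {
        return nil, errNotFound
    }
    return b, nil
}

func (m *memSideload) Purge(_ context.Context, index, term uint64) (int64, error) {
    k := entryKey{index, term}
    b, ok := m.files[k]
    if !ok {
        return 0, errNotFound
    }
    delete(m.files, k)
    return int64(len(b)), nil // size of the purged payload
}

func (m *memSideload) Clear(context.Context) error {
    m.files = nil
    return nil
}

func (m *memSideload) TruncateTo(_ context.Context, index uint64) (freed, retained int64, _ error) {
    for k, b := range m.files {
        if k.index < index { // strictly smaller than the given index
            freed += int64(len(b))
            delete(m.files, k)
        } else {
            retained += int64(len(b))
        }
    }
    return freed, retained, nil
}

func (m *memSideload) Filename(_ context.Context, index, term uint64) (string, error) {
    return fmt.Sprintf("%s/i%d.t%d.entry", m.dir, index, term), nil
}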

type SnapshotRequest Uses

type SnapshotRequest struct {
    Header *SnapshotRequest_Header `protobuf:"bytes,1,opt,name=header" json:"header,omitempty"`
    // A RocksDB BatchRepr. Multiple kv_batches may be sent across multiple request messages.
    KVBatch []byte `protobuf:"bytes,2,opt,name=kv_batch,json=kvBatch" json:"kv_batch,omitempty"`
    // These are really raftpb.Entry, but we model them as raw bytes to avoid
    // roundtripping through memory. They are separate from the kv_batch to
    // allow flexibility in log implementations.
    LogEntries [][]byte `protobuf:"bytes,3,rep,name=log_entries,json=logEntries" json:"log_entries,omitempty"`
    Final      bool     `protobuf:"varint,4,opt,name=final" json:"final"`
}

SnapshotRequest is the request used to send streaming snapshot requests.

func (*SnapshotRequest) Descriptor Uses

func (*SnapshotRequest) Descriptor() ([]byte, []int)

func (*SnapshotRequest) Marshal Uses

func (m *SnapshotRequest) Marshal() (dAtA []byte, err error)

func (*SnapshotRequest) MarshalTo Uses

func (m *SnapshotRequest) MarshalTo(dAtA []byte) (int, error)

func (*SnapshotRequest) ProtoMessage Uses

func (*SnapshotRequest) ProtoMessage()

func (*SnapshotRequest) Reset Uses

func (m *SnapshotRequest) Reset()

func (*SnapshotRequest) Size Uses

func (m *SnapshotRequest) Size() (n int)

func (*SnapshotRequest) String Uses

func (m *SnapshotRequest) String() string

func (*SnapshotRequest) Unmarshal Uses

func (m *SnapshotRequest) Unmarshal(dAtA []byte) error

func (*SnapshotRequest) XXX_DiscardUnknown Uses

func (m *SnapshotRequest) XXX_DiscardUnknown()

func (*SnapshotRequest) XXX_Marshal Uses

func (m *SnapshotRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*SnapshotRequest) XXX_Merge Uses

func (dst *SnapshotRequest) XXX_Merge(src proto.Message)

func (*SnapshotRequest) XXX_Size Uses

func (m *SnapshotRequest) XXX_Size() int

func (*SnapshotRequest) XXX_Unmarshal Uses

func (m *SnapshotRequest) XXX_Unmarshal(b []byte) error

type SnapshotRequest_Header Uses

type SnapshotRequest_Header struct {
    // The replica state at the time the snapshot was generated. Note
    // that ReplicaState.Desc differs from the above range_descriptor
    // field which holds the updated descriptor after the new replica
    // has been added while ReplicaState.Desc holds the descriptor
    // before the new replica has been added.
    State storagepb.ReplicaState `protobuf:"bytes,5,opt,name=state" json:"state"`
    // The inner raft message is of type MsgSnap, and its snapshot data contains a UUID.
    RaftMessageRequest RaftMessageRequest `protobuf:"bytes,2,opt,name=raft_message_request,json=raftMessageRequest" json:"raft_message_request"`
    // The estimated size of the range, to be used in reservation decisions.
    RangeSize int64 `protobuf:"varint,3,opt,name=range_size,json=rangeSize" json:"range_size"`
    // can_decline is set on preemptive snapshots, but not those generated
    // by raft because at that point it is better to queue up the stream
    // than to cancel it.
    CanDecline bool `protobuf:"varint,4,opt,name=can_decline,json=canDecline" json:"can_decline"`
    // The priority of the snapshot.
    Priority SnapshotRequest_Priority `protobuf:"varint,6,opt,name=priority,enum=cockroach.storage.SnapshotRequest_Priority" json:"priority"`
    // The strategy of the snapshot.
    Strategy SnapshotRequest_Strategy `protobuf:"varint,7,opt,name=strategy,enum=cockroach.storage.SnapshotRequest_Strategy" json:"strategy"`
    // The type of the snapshot.
    Type SnapshotRequest_Type `protobuf:"varint,9,opt,name=type,enum=cockroach.storage.SnapshotRequest_Type" json:"type"`
    // Whether the snapshot uses the unreplicated RaftTruncatedState or not.
    // This is generally always true at 2.2 and above outside of the migration
    // phase, though theoretically it could take a long time for all ranges
    // to update to the new mechanism. This bool is true iff the Raft log at
    // the snapshot's applied index is using the new key. In particular, it
    // is true if the index itself carries out the migration (in which case
    // the data in the snapshot contains neither key).
    //
    // See VersionUnreplicatedRaftTruncatedState.
    UnreplicatedTruncatedState bool `protobuf:"varint,8,opt,name=unreplicated_truncated_state,json=unreplicatedTruncatedState" json:"unreplicated_truncated_state"`
}

func (*SnapshotRequest_Header) Descriptor Uses

func (*SnapshotRequest_Header) Descriptor() ([]byte, []int)

func (*SnapshotRequest_Header) IsPreemptive Uses

func (h *SnapshotRequest_Header) IsPreemptive() bool

IsPreemptive returns whether this is a preemptive snapshot or a Raft snapshot.

func (*SnapshotRequest_Header) Marshal Uses

func (m *SnapshotRequest_Header) Marshal() (dAtA []byte, err error)

func (*SnapshotRequest_Header) MarshalTo Uses

func (m *SnapshotRequest_Header) MarshalTo(dAtA []byte) (int, error)

func (*SnapshotRequest_Header) ProtoMessage Uses

func (*SnapshotRequest_Header) ProtoMessage()

func (*SnapshotRequest_Header) Reset Uses

func (m *SnapshotRequest_Header) Reset()

func (*SnapshotRequest_Header) Size Uses

func (m *SnapshotRequest_Header) Size() (n int)

func (*SnapshotRequest_Header) String Uses

func (m *SnapshotRequest_Header) String() string

func (*SnapshotRequest_Header) Unmarshal Uses

func (m *SnapshotRequest_Header) Unmarshal(dAtA []byte) error

func (*SnapshotRequest_Header) XXX_DiscardUnknown Uses

func (m *SnapshotRequest_Header) XXX_DiscardUnknown()

func (*SnapshotRequest_Header) XXX_Marshal Uses

func (m *SnapshotRequest_Header) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*SnapshotRequest_Header) XXX_Merge Uses

func (dst *SnapshotRequest_Header) XXX_Merge(src proto.Message)

func (*SnapshotRequest_Header) XXX_Size Uses

func (m *SnapshotRequest_Header) XXX_Size() int

func (*SnapshotRequest_Header) XXX_Unmarshal Uses

func (m *SnapshotRequest_Header) XXX_Unmarshal(b []byte) error

type SnapshotRequest_Priority Uses

type SnapshotRequest_Priority int32
const (
    SnapshotRequest_UNKNOWN SnapshotRequest_Priority = 0
    // RECOVERY is used for Raft-initiated snapshots and for
    // up-replication snapshots (i.e. when a dead node has been
    // removed and the range needs to be up-replicated).
    SnapshotRequest_RECOVERY SnapshotRequest_Priority = 1
    // REBALANCE is used for snapshots involved in rebalancing.
    SnapshotRequest_REBALANCE SnapshotRequest_Priority = 2
)

func (SnapshotRequest_Priority) Enum Uses

func (x SnapshotRequest_Priority) Enum() *SnapshotRequest_Priority

func (SnapshotRequest_Priority) EnumDescriptor Uses

func (SnapshotRequest_Priority) EnumDescriptor() ([]byte, []int)

func (SnapshotRequest_Priority) String Uses

func (x SnapshotRequest_Priority) String() string

func (*SnapshotRequest_Priority) UnmarshalJSON Uses

func (x *SnapshotRequest_Priority) UnmarshalJSON(data []byte) error

type SnapshotRequest_Strategy Uses

type SnapshotRequest_Strategy int32
const (
    // KV_BATCH snapshots stream batches of KV pairs for all keys in a
    // range from the sender to the receiver. These KV pairs are then
    // combined into a large RocksDB WriteBatch that is atomically
    // applied.
    SnapshotRequest_KV_BATCH SnapshotRequest_Strategy = 0
)

func (SnapshotRequest_Strategy) Enum Uses

func (x SnapshotRequest_Strategy) Enum() *SnapshotRequest_Strategy

func (SnapshotRequest_Strategy) EnumDescriptor Uses

func (SnapshotRequest_Strategy) EnumDescriptor() ([]byte, []int)

func (SnapshotRequest_Strategy) String Uses

func (x SnapshotRequest_Strategy) String() string

func (*SnapshotRequest_Strategy) UnmarshalJSON Uses

func (x *SnapshotRequest_Strategy) UnmarshalJSON(data []byte) error

type SnapshotRequest_Type Uses

type SnapshotRequest_Type int32
const (
    SnapshotRequest_RAFT       SnapshotRequest_Type = 0
    SnapshotRequest_LEARNER    SnapshotRequest_Type = 1
    SnapshotRequest_PREEMPTIVE SnapshotRequest_Type = 2
)

func (SnapshotRequest_Type) Enum Uses

func (x SnapshotRequest_Type) Enum() *SnapshotRequest_Type

func (SnapshotRequest_Type) EnumDescriptor Uses

func (SnapshotRequest_Type) EnumDescriptor() ([]byte, []int)

func (SnapshotRequest_Type) String Uses

func (x SnapshotRequest_Type) String() string

func (*SnapshotRequest_Type) UnmarshalJSON Uses

func (x *SnapshotRequest_Type) UnmarshalJSON(data []byte) error

type SnapshotResponse Uses

type SnapshotResponse struct {
    Status  SnapshotResponse_Status `protobuf:"varint,1,opt,name=status,enum=cockroach.storage.SnapshotResponse_Status" json:"status"`
    Message string                  `protobuf:"bytes,2,opt,name=message" json:"message"`
}

func (*SnapshotResponse) Descriptor Uses

func (*SnapshotResponse) Descriptor() ([]byte, []int)

func (*SnapshotResponse) Marshal Uses

func (m *SnapshotResponse) Marshal() (dAtA []byte, err error)

func (*SnapshotResponse) MarshalTo Uses

func (m *SnapshotResponse) MarshalTo(dAtA []byte) (int, error)

func (*SnapshotResponse) ProtoMessage Uses

func (*SnapshotResponse) ProtoMessage()

func (*SnapshotResponse) Reset Uses

func (m *SnapshotResponse) Reset()

func (*SnapshotResponse) Size Uses

func (m *SnapshotResponse) Size() (n int)

func (*SnapshotResponse) String Uses

func (m *SnapshotResponse) String() string

func (*SnapshotResponse) Unmarshal Uses

func (m *SnapshotResponse) Unmarshal(dAtA []byte) error

func (*SnapshotResponse) XXX_DiscardUnknown Uses

func (m *SnapshotResponse) XXX_DiscardUnknown()

func (*SnapshotResponse) XXX_Marshal Uses

func (m *SnapshotResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*SnapshotResponse) XXX_Merge Uses

func (dst *SnapshotResponse) XXX_Merge(src proto.Message)

func (*SnapshotResponse) XXX_Size Uses

func (m *SnapshotResponse) XXX_Size() int

func (*SnapshotResponse) XXX_Unmarshal Uses

func (m *SnapshotResponse) XXX_Unmarshal(b []byte) error

type SnapshotResponseStream Uses

type SnapshotResponseStream interface {
    Context() context.Context
    Send(*SnapshotResponse) error
    Recv() (*SnapshotRequest, error)
}

SnapshotResponseStream is the subset of the MultiRaft_RaftSnapshotServer interface that is needed for sending responses.

type SnapshotResponse_Status Uses

type SnapshotResponse_Status int32
const (
    SnapshotResponse_UNKNOWN  SnapshotResponse_Status = 0
    SnapshotResponse_ACCEPTED SnapshotResponse_Status = 1
    SnapshotResponse_APPLIED  SnapshotResponse_Status = 2
    SnapshotResponse_ERROR    SnapshotResponse_Status = 3
    SnapshotResponse_DECLINED SnapshotResponse_Status = 4
)

func (SnapshotResponse_Status) Enum Uses

func (x SnapshotResponse_Status) Enum() *SnapshotResponse_Status

func (SnapshotResponse_Status) EnumDescriptor Uses

func (SnapshotResponse_Status) EnumDescriptor() ([]byte, []int)

func (SnapshotResponse_Status) String Uses

func (x SnapshotResponse_Status) String() string

func (*SnapshotResponse_Status) UnmarshalJSON Uses

func (x *SnapshotResponse_Status) UnmarshalJSON(data []byte) error

type SnapshotStorePool Uses

type SnapshotStorePool interface {
    // contains filtered or unexported methods
}

SnapshotStorePool narrows StorePool to make sendSnapshot easier to test.

type SpanSetReplicaEvalContext Uses

type SpanSetReplicaEvalContext struct {
    // contains filtered or unexported fields
}

SpanSetReplicaEvalContext is a testing-only implementation of ReplicaEvalContext which verifies that access to state is registered in the SpanSet if one is given.

func (*SpanSetReplicaEvalContext) AbortSpan Uses

func (rec *SpanSetReplicaEvalContext) AbortSpan() *abortspan.AbortSpan

AbortSpan returns the abort span.

func (SpanSetReplicaEvalContext) CanCreateTxnRecord Uses

func (rec SpanSetReplicaEvalContext) CanCreateTxnRecord(
    txnID uuid.UUID, txnKey []byte, txnMinTS hlc.Timestamp,
) (bool, hlc.Timestamp, roachpb.TransactionAbortedReason)

CanCreateTxnRecord determines whether a transaction record can be created for the provided transaction information. See Replica.CanCreateTxnRecord for details about its arguments, return values, and preconditions.

func (*SpanSetReplicaEvalContext) Clock Uses

func (rec *SpanSetReplicaEvalContext) Clock() *hlc.Clock

Clock returns the Replica's clock.

func (*SpanSetReplicaEvalContext) ClusterSettings Uses

func (rec *SpanSetReplicaEvalContext) ClusterSettings() *cluster.Settings

ClusterSettings returns the cluster settings.

func (SpanSetReplicaEvalContext) ContainsKey Uses

func (rec SpanSetReplicaEvalContext) ContainsKey(key roachpb.Key) bool

ContainsKey returns true if the given key is within the Replica's range.

TODO(bdarnell): Replace this method with one on Desc(). See comment on Replica.ContainsKey.

func (*SpanSetReplicaEvalContext) DB Uses

func (rec *SpanSetReplicaEvalContext) DB() *client.DB

DB returns the Replica's client DB.

func (SpanSetReplicaEvalContext) Desc Uses

func (rec SpanSetReplicaEvalContext) Desc() *roachpb.RangeDescriptor

Desc returns the Replica's RangeDescriptor.

func (*SpanSetReplicaEvalContext) Engine Uses

func (rec *SpanSetReplicaEvalContext) Engine() engine.Engine

Engine returns the engine.

func (*SpanSetReplicaEvalContext) EvalKnobs Uses

func (rec *SpanSetReplicaEvalContext) EvalKnobs() storagebase.BatchEvalTestingKnobs

EvalKnobs returns the batch evaluation Knobs.

func (*SpanSetReplicaEvalContext) GetExternalStorage Uses

func (rec *SpanSetReplicaEvalContext) GetExternalStorage(
    ctx context.Context, dest roachpb.ExternalStorage,
) (cloud.ExternalStorage, error)

GetExternalStorage returns an ExternalStorage object, based on information parsed from a URI, stored in `dest`.

func (*SpanSetReplicaEvalContext) GetExternalStorageFromURI Uses

func (rec *SpanSetReplicaEvalContext) GetExternalStorageFromURI(
    ctx context.Context, uri string,
) (cloud.ExternalStorage, error)

GetExternalStorageFromURI returns an ExternalStorage object, based on the given URI.

func (*SpanSetReplicaEvalContext) GetFirstIndex Uses

func (rec *SpanSetReplicaEvalContext) GetFirstIndex() (uint64, error)

GetFirstIndex returns the first index.

func (SpanSetReplicaEvalContext) GetGCThreshold Uses

func (rec SpanSetReplicaEvalContext) GetGCThreshold() hlc.Timestamp

GetGCThreshold returns the GC threshold of the Range, typically updated when keys are garbage collected. Reads and writes at timestamps <= this time will not be served.

func (SpanSetReplicaEvalContext) GetLastReplicaGCTimestamp Uses

func (rec SpanSetReplicaEvalContext) GetLastReplicaGCTimestamp(
    ctx context.Context,
) (hlc.Timestamp, error)

GetLastReplicaGCTimestamp returns the last time the Replica was considered for GC.

func (SpanSetReplicaEvalContext) GetLease Uses

func (rec SpanSetReplicaEvalContext) GetLease() (roachpb.Lease, roachpb.Lease)

GetLease returns the Replica's current and next lease (if any).

func (*SpanSetReplicaEvalContext) GetLeaseAppliedIndex Uses

func (rec *SpanSetReplicaEvalContext) GetLeaseAppliedIndex() uint64

GetLeaseAppliedIndex returns the lease index of the last applied command.

func (*SpanSetReplicaEvalContext) GetLimiters Uses

func (rec *SpanSetReplicaEvalContext) GetLimiters() *batcheval.Limiters

GetLimiters returns the per-store limiters.

func (SpanSetReplicaEvalContext) GetMVCCStats Uses

func (rec SpanSetReplicaEvalContext) GetMVCCStats() enginepb.MVCCStats

GetMVCCStats returns the Replica's MVCCStats.

func (*SpanSetReplicaEvalContext) GetNodeLocality Uses

func (rec *SpanSetReplicaEvalContext) GetNodeLocality() roachpb.Locality

GetNodeLocality returns the node locality.

func (*SpanSetReplicaEvalContext) GetRangeID Uses

func (rec *SpanSetReplicaEvalContext) GetRangeID() roachpb.RangeID

GetRangeID returns the RangeID.

func (SpanSetReplicaEvalContext) GetSplitQPS Uses

func (rec SpanSetReplicaEvalContext) GetSplitQPS() float64

GetSplitQPS returns the Replica's queries/s rate for splitting purposes.

func (*SpanSetReplicaEvalContext) GetTerm Uses

func (rec *SpanSetReplicaEvalContext) GetTerm(i uint64) (uint64, error)

GetTerm returns the term for the given index in the Raft log.

func (*SpanSetReplicaEvalContext) GetTxnWaitQueue Uses

func (rec *SpanSetReplicaEvalContext) GetTxnWaitQueue() *txnwait.Queue

GetTxnWaitQueue returns the txnwait.Queue.

func (*SpanSetReplicaEvalContext) IsFirstRange Uses

func (rec *SpanSetReplicaEvalContext) IsFirstRange() bool

IsFirstRange returns true iff the replica belongs to the first range.

func (*SpanSetReplicaEvalContext) NodeID Uses

func (rec *SpanSetReplicaEvalContext) NodeID() roachpb.NodeID

NodeID returns the NodeID.

func (*SpanSetReplicaEvalContext) StoreID Uses

func (rec *SpanSetReplicaEvalContext) StoreID() roachpb.StoreID

StoreID returns the StoreID.

func (SpanSetReplicaEvalContext) String Uses

func (rec SpanSetReplicaEvalContext) String() string

String implements Stringer.

type Store Uses

type Store struct {
    Ident *roachpb.StoreIdent // pointer to catch access before Start() is called
    // contains filtered or unexported fields
}

A Store maintains a map of ranges by start key. A Store corresponds to one physical device.

func NewStore Uses

func NewStore(
    ctx context.Context, cfg StoreConfig, eng engine.Engine, nodeDesc *roachpb.NodeDescriptor,
) *Store

NewStore returns a new instance of a store.

func (*Store) AdminRelocateRange Uses

func (s *Store) AdminRelocateRange(
    ctx context.Context, rangeDesc roachpb.RangeDescriptor, targets []roachpb.ReplicationTarget,
) error

AdminRelocateRange relocates a given range to a given set of stores. The first store in the slice becomes the new leaseholder.

This is best-effort; it's possible that the replicate queue on the leaseholder could take action at the same time, causing errors.
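
A minimal sketch of a call site, assuming ctx, store, and rangeDesc are in scope; the targets below are illustrative node/store pairs, and the first one ends up with the lease:

targets := []roachpb.ReplicationTarget{
    {NodeID: 1, StoreID: 1},
    {NodeID: 2, StoreID: 2},
    {NodeID: 3, StoreID: 3},
}
if err := store.AdminRelocateRange(ctx, rangeDesc, targets); err != nil {
    // The replicate queue may race with this call, so retrying is reasonable.
    log.Warningf(ctx, "relocate failed: %v", err)
}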

func (*Store) AllocateRangeID Uses

func (s *Store) AllocateRangeID(ctx context.Context) (roachpb.RangeID, error)

AllocateRangeID allocates a new RangeID from the cluster-wide RangeID allocator.

func (*Store) AllocatorDryRun Uses

func (s *Store) AllocatorDryRun(ctx context.Context, repl *Replica) (tracing.Recording, error)

AllocatorDryRun runs the given replica through the allocator without actually carrying out any changes, returning all trace messages collected along the way. Intended to help power a debug endpoint.

func (*Store) AnnotateCtx Uses

func (s *Store) AnnotateCtx(ctx context.Context) context.Context

AnnotateCtx is a convenience wrapper; see AmbientContext.

func (*Store) Attrs Uses

func (s *Store) Attrs() roachpb.Attributes

Attrs returns the attributes of the underlying store.

func (*Store) Capacity Uses

func (s *Store) Capacity(useCached bool) (roachpb.StoreCapacity, error)

Capacity returns the capacity of the underlying storage engine. Note that this does not include reservations. Note that Capacity() has the side effect of updating some of the store's internal statistics about its replicas.

func (*Store) Clock Uses

func (s *Store) Clock() *hlc.Clock

Clock accessor.

func (*Store) ClusterID Uses

func (s *Store) ClusterID() uuid.UUID

ClusterID accessor.

func (*Store) ClusterNodeCount Uses

func (s *Store) ClusterNodeCount() int

ClusterNodeCount returns this store's view of the number of nodes in the cluster. This is the metric used for adaptive zone configs; ranges will not be reported as underreplicated if it is low. Tests that wait for full replication by tracking the underreplicated metric must also check for the expected ClusterNodeCount to avoid catching the cluster while the first node is initialized but the other nodes are not.

func (*Store) ClusterSettings Uses

func (s *Store) ClusterSettings() *cluster.Settings

ClusterSettings returns the node's ClusterSettings.

func (*Store) Compactor Uses

func (s *Store) Compactor() *compactor.Compactor

Compactor accessor.

func (*Store) ComputeMetrics Uses

func (s *Store) ComputeMetrics(ctx context.Context, tick int) error

ComputeMetrics immediately computes the current value of store metrics which cannot be computed incrementally. This method should be invoked periodically by a higher-level system which records store metrics.

The tick argument should increment across repeated calls to this method. It is used to compute some metrics less frequently than others.
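
A minimal sketch of the periodic caller this expects, assuming s, ctx, and stopper are in scope and a 10-second cadence; the tick counter increments on every invocation so that some metrics can be refreshed less often than others:

go func() {
    ticker := time.NewTicker(10 * time.Second)
    defer ticker.Stop()
    for tick := 0; ; tick++ {
        select {
        case <-ticker.C:
            if err := s.ComputeMetrics(ctx, tick); err != nil {
                log.Warningf(ctx, "metrics computation failed: %v", err)
            }
        case <-stopper.ShouldQuiesce():
            return
        }
    }
}()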

func (*Store) ComputeStatsForKeySpan Uses

func (s *Store) ComputeStatsForKeySpan(startKey, endKey roachpb.RKey) (StoreKeySpanStats, error)

ComputeStatsForKeySpan computes the aggregated MVCCStats for all replicas on this store which contain any keys in the supplied range.

func (*Store) DB Uses

func (s *Store) DB() *client.DB

DB accessor.

func (*Store) Descriptor Uses

func (s *Store) Descriptor(useCached bool) (*roachpb.StoreDescriptor, error)

Descriptor returns a StoreDescriptor including current store capacity information.

func (*Store) Engine Uses

func (s *Store) Engine() engine.Engine

Engine accessor.

func (*Store) ForceConsistencyQueueProcess Uses

func (s *Store) ForceConsistencyQueueProcess() error

ForceConsistencyQueueProcess runs all the ranges through the consistency queue.

func (*Store) ForceRaftSnapshotQueueProcess Uses

func (s *Store) ForceRaftSnapshotQueueProcess() error

ForceRaftSnapshotQueueProcess iterates over all ranges, enqueuing any that need raft snapshots, then processes the raft snapshot queue.

func (*Store) ForceReplicationScanAndProcess Uses

func (s *Store) ForceReplicationScanAndProcess() error

ForceReplicationScanAndProcess iterates over all ranges and enqueues any that need to be replicated.

func (*Store) ForceSplitScanAndProcess Uses

func (s *Store) ForceSplitScanAndProcess() error

ForceSplitScanAndProcess iterates over all ranges and enqueues any that may need to be split.

func (*Store) ForceTimeSeriesMaintenanceQueueProcess Uses

func (s *Store) ForceTimeSeriesMaintenanceQueueProcess() error

ForceTimeSeriesMaintenanceQueueProcess iterates over all ranges, enqueuing any that need time series maintenance, then processes the time series maintenance queue.

func (*Store) GetClusterVersion Uses

func (s *Store) GetClusterVersion(ctx context.Context) (cluster.ClusterVersion, error)

GetClusterVersion reads the cluster version from the store-local version key. Returns an empty version if the key is not found.

func (*Store) GetReplica Uses

func (s *Store) GetReplica(rangeID roachpb.RangeID) (*Replica, error)

GetReplica fetches a replica by Range ID. Returns an error if no replica is found.

func (*Store) GetTxnWaitKnobs Uses

func (s *Store) GetTxnWaitKnobs() txnwait.TestingKnobs

GetTxnWaitKnobs is part of txnwait.StoreInterface.

func (*Store) GetTxnWaitMetrics Uses

func (s *Store) GetTxnWaitMetrics() *txnwait.Metrics

GetTxnWaitMetrics is called by txnwait.Queue instances to get a reference to the shared metrics instance.

func (*Store) Gossip Uses

func (s *Store) Gossip() *gossip.Gossip

Gossip accessor.

func (*Store) GossipStore Uses

func (s *Store) GossipStore(ctx context.Context, useCached bool) error

GossipStore broadcasts the store on the gossip network.

func (*Store) HandleRaftRequest Uses

func (s *Store) HandleRaftRequest(
    ctx context.Context, req *RaftMessageRequest, respStream RaftMessageResponseStream,
) *roachpb.Error

HandleRaftRequest dispatches a raft message to the appropriate Replica. It requires that s.mu is not held.

func (*Store) HandleRaftResponse Uses

func (s *Store) HandleRaftResponse(ctx context.Context, resp *RaftMessageResponse) error

HandleRaftResponse implements the RaftMessageHandler interface. Per the interface specification, an error is returned if and only if the underlying Raft connection should be closed. It requires that s.mu is not held.

func (*Store) HandleRaftUncoalescedRequest Uses

func (s *Store) HandleRaftUncoalescedRequest(
    ctx context.Context, req *RaftMessageRequest, respStream RaftMessageResponseStream,
) *roachpb.Error

HandleRaftUncoalescedRequest dispatches a raft message to the appropriate Replica. It requires that s.mu is not held.

func (*Store) HandleSnapshot Uses

func (s *Store) HandleSnapshot(
    header *SnapshotRequest_Header, stream SnapshotResponseStream,
) error

HandleSnapshot reads an incoming streaming snapshot and applies it if possible.

func (*Store) HottestReplicas Uses

func (s *Store) HottestReplicas() []HotReplicaInfo

HottestReplicas returns the hottest replicas on a store, sorted by their QPS. Only contains ranges for which this store is the leaseholder.

Note that this uses cached information, so it's cheap but may be slightly out of date.

func (*Store) IsDraining Uses

func (s *Store) IsDraining() bool

IsDraining accessor.

func (*Store) IsStarted Uses

func (s *Store) IsStarted() bool

IsStarted returns true if the Store has been started.

func (*Store) LookupReplica Uses

func (s *Store) LookupReplica(key roachpb.RKey) *Replica

LookupReplica looks up the replica that contains the specified key. It returns nil if no such replica exists.

func (*Store) ManuallyEnqueue Uses

func (s *Store) ManuallyEnqueue(
    ctx context.Context, queueName string, repl *Replica, skipShouldQueue bool,
) (tracing.Recording, string, error)

ManuallyEnqueue runs the given replica through the requested queue, returning all trace events collected along the way as well as the error message returned from the queue's process method, if any. Intended to help power an admin debug endpoint.
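
A minimal sketch of driving a single replica through a queue by hand, for example from a debug endpoint; the helper and the queue name "replicate" are illustrative, not part of the package:

// enqueueForDebug is a minimal sketch, not part of the package; the queue name
// "replicate" is illustrative.
func enqueueForDebug(ctx context.Context, store *Store, rangeID roachpb.RangeID) error {
    repl, err := store.GetReplica(rangeID)
    if err != nil {
        return err
    }
    // Run the replica through the queue, skipping the shouldQueue check.
    rec, processErr, err := store.ManuallyEnqueue(ctx, "replicate", repl, true /* skipShouldQueue */)
    if err != nil {
        return err
    }
    if processErr != "" {
        fmt.Println("queue processing error:", processErr)
    }
    fmt.Printf("%v\n", rec) // trace events collected along the way
    return nil
}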

func (*Store) MergeRange Uses

func (s *Store) MergeRange(
    ctx context.Context,
    leftRepl *Replica,
    newLeftDesc, rightDesc roachpb.RangeDescriptor,
    freezeStart hlc.Timestamp,
) error

MergeRange expands the left-hand replica, leftRepl, to absorb the right-hand replica, identified by rightDesc. freezeStart specifies the time at which the right-hand replica promised to stop serving traffic and is used to initialize the timestamp cache's low water mark for the right-hand keyspace. The right-hand replica must exist on this store and the raftMus for both the left-hand and right-hand replicas must be held.

func (*Store) Metrics Uses

func (s *Store) Metrics() *StoreMetrics

Metrics returns the store's metric struct.

func (*Store) MustForceMergeScanAndProcess Uses

func (s *Store) MustForceMergeScanAndProcess()

MustForceMergeScanAndProcess iterates over all ranges and enqueues any that may need to be merged.

func (*Store) MustForceRaftLogScanAndProcess Uses

func (s *Store) MustForceRaftLogScanAndProcess()

MustForceRaftLogScanAndProcess iterates over all ranges and enqueues any that need their raft logs truncated and then process each of them.

func (*Store) MustForceReplicaGCScanAndProcess Uses

func (s *Store) MustForceReplicaGCScanAndProcess()

MustForceReplicaGCScanAndProcess iterates over all ranges and enqueues any that may need to be GC'd.

func (*Store) RaftStatus Uses

func (s *Store) RaftStatus(rangeID roachpb.RangeID) *raft.Status

RaftStatus returns the current raft status of the local replica of the given range.

func (*Store) RangeFeed Uses

func (s *Store) RangeFeed(
    args *roachpb.RangeFeedRequest, stream roachpb.Internal_RangeFeedServer,
) *roachpb.Error

RangeFeed registers a rangefeed over the specified span. It sends updates to the provided stream and returns with an optional error when the rangefeed is complete.

func (*Store) ReadLastUpTimestamp Uses

func (s *Store) ReadLastUpTimestamp(ctx context.Context) (hlc.Timestamp, error)

ReadLastUpTimestamp returns the "last up" timestamp recorded in this store. This value can be used to approximate the last time the engine was in use as a store by a running node. If the store does not contain a "last up" timestamp (for example, on a newly bootstrapped store), the zero timestamp is returned instead.

func (*Store) Registry Uses

func (s *Store) Registry() *metric.Registry

Registry returns the store registry.

func (*Store) RemoveReplica Uses

func (s *Store) RemoveReplica(
    ctx context.Context, rep *Replica, nextReplicaID roachpb.ReplicaID, opts RemoveOptions,
) error

RemoveReplica removes the replica from the store's replica map and from the sorted replicasByKey btree.

The NextReplicaID from the replica descriptor that was used to make the removal decision is passed in. Removal is aborted if the replica ID has advanced to or beyond the NextReplicaID since the removal decision was made.

If opts.DestroyReplica is false, replica.destroyRaftMuLocked is not called.

The passed replica must be initialized.

func (*Store) ReplicaCount Uses

func (s *Store) ReplicaCount() int

ReplicaCount returns the number of replicas contained by this store. This method is O(n) in the number of replicas and should not be called from performance critical code.

func (*Store) Send Uses

func (s *Store) Send(
    ctx context.Context, ba roachpb.BatchRequest,
) (br *roachpb.BatchResponse, pErr *roachpb.Error)

Send fetches a range based on the header's replica, assembles method, args & reply into a Raft Cmd struct and executes the command using the fetched range.

An incoming request may be transactional or not. If it is not transactional, the timestamp at which it executes may be higher than that optionally specified through the incoming BatchRequest, and it is not guaranteed that all operations are written at the same timestamp. If it is transactional, a timestamp must not be set - it is deduced automatically from the transaction. In particular, the read (original) timestamp will be used for all reads and the write (provisional commit) timestamp will be used for all writes. See the comments on txn.TxnMeta.Timestamp and txn.OrigTimestamp for more details.

Should a transactional operation be forced to a higher timestamp (for instance due to the timestamp cache or finding a committed value in the path of one of its writes), the response will have a transaction set which should be used to update the client transaction object.

func (*Store) SetDraining Uses

func (s *Store) SetDraining(drain bool)

SetDraining (when called with 'true') causes incoming lease transfers to be rejected, prevents all of the Store's Replicas from acquiring or extending range leases, and attempts to transfer away any leases owned. When called with 'false', returns to the normal mode of operation.

func (*Store) SetReplicateQueueActive Uses

func (s *Store) SetReplicateQueueActive(active bool)

SetReplicateQueueActive controls the replication queue. Only intended for tests.

func (*Store) SplitRange Uses

func (s *Store) SplitRange(
    ctx context.Context, leftRepl, rightReplOrNil *Replica, split *roachpb.SplitTrigger,
) error

SplitRange shortens the original range to accommodate the new range. The new range is added to the ranges map and the replicasByKey btree. origRng.raftMu and newRng.raftMu must be held.

This is only called from the split trigger in the context of the execution of a Raft command. Note that rightRepl will be nil if the replica described by rightDesc is known to have been removed.

func (*Store) Start Uses

func (s *Store) Start(ctx context.Context, stopper *stop.Stopper) error

Start the engine, set the GC and read the StoreIdent.

func (*Store) Stopper Uses

func (s *Store) Stopper() *stop.Stopper

Stopper accessor.

func (*Store) StoreID Uses

func (s *Store) StoreID() roachpb.StoreID

StoreID accessor.

func (*Store) String Uses

func (s *Store) String() string

String formats a store for debug output.

func (*Store) TestingKnobs Uses

func (s *Store) TestingKnobs() *StoreTestingKnobs

TestingKnobs accessor.

func (*Store) VisitReplicas Uses

func (s *Store) VisitReplicas(visitor func(*Replica) bool)

VisitReplicas invokes the visitor on the Store's Replicas until the visitor returns false. Replicas which are added to the Store after iteration begins may or may not be observed.

func (*Store) WaitForInit Uses

func (s *Store) WaitForInit()

WaitForInit waits for any asynchronous processes begun in Start() to complete their initialization. In particular, this includes gossiping. In some cases this may block until the range GC queue has completed its scan. Only for testing.

func (*Store) WriteHLCUpperBound Uses

func (s *Store) WriteHLCUpperBound(ctx context.Context, time int64) error

WriteHLCUpperBound records an upper bound to the wall time of the HLC.

func (*Store) WriteLastUpTimestamp Uses

func (s *Store) WriteLastUpTimestamp(ctx context.Context, time hlc.Timestamp) error

WriteLastUpTimestamp records the supplied timestamp into the "last up" key on this store. This value should be refreshed whenever this store's node updates its own liveness record; it is used by a restarting store to determine the approximate time that it stopped.
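
A minimal sketch of the intended pairing, assuming s, ctx, and clock are in scope: record the timestamp while the node is live, and read it back after a restart.

// While the node is live (e.g. whenever its liveness record is refreshed):
if err := s.WriteLastUpTimestamp(ctx, clock.Now()); err != nil {
    log.Warningf(ctx, "could not persist last-up timestamp: %v", err)
}

// After a restart, estimate when the store was last serving:
lastUp, err := s.ReadLastUpTimestamp(ctx)
if err == nil && lastUp != (hlc.Timestamp{}) {
    log.Infof(ctx, "store was last up at %s", lastUp)
}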

type StoreConfig Uses

type StoreConfig struct {
    AmbientCtx log.AmbientContext
    base.RaftConfig

    DefaultZoneConfig       *config.ZoneConfig
    DefaultSystemZoneConfig *config.ZoneConfig
    Settings                *cluster.Settings
    Clock                   *hlc.Clock
    DB                      *client.DB
    Gossip                  *gossip.Gossip
    NodeLiveness            *NodeLiveness
    StorePool               *StorePool
    Transport               *RaftTransport
    NodeDialer              *nodedialer.Dialer
    RPCContext              *rpc.Context
    RangeDescriptorCache    kvbase.RangeDescriptorCache

    ClosedTimestamp *container.Container

    // SQLExecutor is used by the store to execute SQL statements.
    SQLExecutor sqlutil.InternalExecutor

    // TimeSeriesDataStore is an interface used by the store's time series
    // maintenance queue to dispatch individual maintenance tasks.
    TimeSeriesDataStore TimeSeriesDataStore

    // CoalescedHeartbeatsInterval is the interval for which heartbeat messages
    // are queued and then sent as a single coalesced heartbeat; it is a
    // fraction of the RaftTickInterval so that heartbeats don't get delayed by
    // an entire tick. Delaying coalescing heartbeat responses has a bad
    // interaction with quiescence because the coalesced (delayed) heartbeat
    // response can unquiesce the leader. Consider:
    //
    // T+0: leader queues MsgHeartbeat
    // T+1: leader sends MsgHeartbeat
    //                                        follower receives MsgHeartbeat
    //                                        follower queues MsgHeartbeatResp
    // T+2: leader queues quiesce message
    //                                        follower sends MsgHeartbeatResp
    //      leader receives MsgHeartbeatResp
    // T+3: leader sends quiesce message
    //
    // Thus we want to make sure that heartbeats are responded to faster than
    // the quiesce cadence.
    CoalescedHeartbeatsInterval time.Duration

    // RaftHeartbeatIntervalTicks is the number of ticks that pass between heartbeats.
    RaftHeartbeatIntervalTicks int

    // ScanInterval is the default value for the scan interval
    ScanInterval time.Duration

    // ScanMinIdleTime is the minimum time the scanner will be idle between ranges.
    // If enabled (> 0), the scanner may complete in more than ScanInterval for
    // stores with many ranges.
    ScanMinIdleTime time.Duration

    // ScanMaxIdleTime is the maximum time the scanner will be idle between ranges.
    // If enabled (> 0), the scanner may complete in less than ScanInterval for small
    // stores.
    ScanMaxIdleTime time.Duration

    // If LogRangeEvents is true, major changes to ranges will be logged into
    // the range event log.
    LogRangeEvents bool

    // RaftEntryCacheSize is the size in bytes of the Raft log entry cache
    // shared by all Raft groups managed by the store.
    RaftEntryCacheSize uint64

    // IntentResolverTaskLimit is the maximum number of asynchronous tasks that
    // may be started by the intent resolver. -1 indicates no asynchronous tasks
    // are allowed. 0 uses the default value (defaultIntentResolverTaskLimit)
    // which is non-zero.
    IntentResolverTaskLimit int

    TestingKnobs StoreTestingKnobs

    // TimestampCachePageSize is (server.Config).TimestampCachePageSize
    TimestampCachePageSize uint32

    // HistogramWindowInterval is (server.Config).HistogramWindowInterval
    HistogramWindowInterval time.Duration

    // EnableEpochRangeLeases controls whether epoch-based range leases are used.
    EnableEpochRangeLeases bool

    // GossipWhenCapacityDeltaExceedsFraction specifies the fraction from the last
    // gossiped store capacity values which need be exceeded before the store will
    // gossip immediately without waiting for the periodic gossip interval.
    GossipWhenCapacityDeltaExceedsFraction float64

    // ExternalStorage creates ExternalStorage objects which allow access to external files.
    ExternalStorage        cloud.ExternalStorageFactory
    ExternalStorageFromURI cloud.ExternalStorageFromURIFactory
    // contains filtered or unexported fields
}

A StoreConfig encompasses the auxiliary objects and configuration required to create a store. All fields holding a pointer or an interface are required to create a store; the rest will have sane defaults set if omitted.

func TestStoreConfig Uses

func TestStoreConfig(clock *hlc.Clock) StoreConfig

TestStoreConfig has some fields initialized with values relevant in tests.

func (*StoreConfig) LeaseExpiration Uses

func (sc *StoreConfig) LeaseExpiration() int64

LeaseExpiration returns an int64 to increment a manual clock with to make sure that all active range leases expire.
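
A minimal sketch of the intended use in tests, assuming hlc.NewManualClock and (*ManualClock).Increment from util/hlc:

manual := hlc.NewManualClock(123) // arbitrary start time in nanoseconds
clock := hlc.NewClock(manual.UnixNano, time.Nanosecond)
cfg := TestStoreConfig(clock)
// ... start a test store from cfg ...
// Later, force every active range lease to expire:
manual.Increment(cfg.LeaseExpiration())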

func (*StoreConfig) SetDefaults Uses

func (sc *StoreConfig) SetDefaults()

SetDefaults initializes unset fields in StoreConfig to values suitable for use on a local network. TODO(tschottdorf): see if this ought to be configurable via flags.

func (*StoreConfig) Valid Uses

func (sc *StoreConfig) Valid() bool

Valid returns true if the StoreConfig is populated correctly. We don't check for Gossip and DB since some of our tests pass that as nil.

type StoreKeySpanStats Uses

type StoreKeySpanStats struct {
    ReplicaCount         int
    MVCC                 enginepb.MVCCStats
    ApproximateDiskBytes uint64
}

StoreKeySpanStats carries the result of a stats computation over a key range.

type StoreList Uses

type StoreList struct {
    // contains filtered or unexported fields
}

StoreList holds a list of store descriptors and associated count and used stats for those stores.

func (StoreList) String Uses

func (sl StoreList) String() string

type StoreMetrics Uses

type StoreMetrics struct {

    // Replica metrics.
    ReplicaCount                  *metric.Gauge // Does not include uninitialized or reserved replicas.
    ReservedReplicaCount          *metric.Gauge
    RaftLeaderCount               *metric.Gauge
    RaftLeaderNotLeaseHolderCount *metric.Gauge
    LeaseHolderCount              *metric.Gauge
    QuiescentCount                *metric.Gauge

    // Range metrics.
    RangeCount                *metric.Gauge
    UnavailableRangeCount     *metric.Gauge
    UnderReplicatedRangeCount *metric.Gauge
    OverReplicatedRangeCount  *metric.Gauge

    // Lease request metrics for successful and failed lease requests. These
    // count proposals (i.e. it does not matter how many replicas apply the
    // lease).
    LeaseRequestSuccessCount  *metric.Counter
    LeaseRequestErrorCount    *metric.Counter
    LeaseTransferSuccessCount *metric.Counter
    LeaseTransferErrorCount   *metric.Counter
    LeaseExpirationCount      *metric.Gauge
    LeaseEpochCount           *metric.Gauge

    // Storage metrics.
    LiveBytes          *metric.Gauge
    KeyBytes           *metric.Gauge
    ValBytes           *metric.Gauge
    TotalBytes         *metric.Gauge
    IntentBytes        *metric.Gauge
    LiveCount          *metric.Gauge
    KeyCount           *metric.Gauge
    ValCount           *metric.Gauge
    IntentCount        *metric.Gauge
    IntentAge          *metric.Gauge
    GcBytesAge         *metric.Gauge
    LastUpdateNanos    *metric.Gauge
    ResolveCommitCount *metric.Counter
    ResolveAbortCount  *metric.Counter
    ResolvePoisonCount *metric.Counter
    Capacity           *metric.Gauge
    Available          *metric.Gauge
    Used               *metric.Gauge
    Reserved           *metric.Gauge
    SysBytes           *metric.Gauge
    SysCount           *metric.Gauge

    // Rebalancing metrics.
    AverageQueriesPerSecond *metric.GaugeFloat64
    AverageWritesPerSecond  *metric.GaugeFloat64

    // Follower read metrics.
    FollowerReadsCount *metric.Counter

    // RocksDB metrics.
    RdbBlockCacheHits           *metric.Gauge
    RdbBlockCacheMisses         *metric.Gauge
    RdbBlockCacheUsage          *metric.Gauge
    RdbBlockCachePinnedUsage    *metric.Gauge
    RdbBloomFilterPrefixChecked *metric.Gauge
    RdbBloomFilterPrefixUseful  *metric.Gauge
    RdbMemtableTotalSize        *metric.Gauge
    RdbFlushes                  *metric.Gauge
    RdbCompactions              *metric.Gauge
    RdbTableReadersMemEstimate  *metric.Gauge
    RdbReadAmplification        *metric.Gauge
    RdbNumSSTables              *metric.Gauge
    RdbPendingCompaction        *metric.Gauge

    // Range event metrics.
    RangeSplits                     *metric.Counter
    RangeMerges                     *metric.Counter
    RangeAdds                       *metric.Counter
    RangeRemoves                    *metric.Counter
    RangeSnapshotsGenerated         *metric.Counter
    RangeSnapshotsNormalApplied     *metric.Counter
    RangeSnapshotsLearnerApplied    *metric.Counter
    RangeSnapshotsPreemptiveApplied *metric.Counter
    RangeRaftLeaderTransfers        *metric.Counter

    // Raft processing metrics.
    RaftTicks                 *metric.Counter
    RaftWorkingDurationNanos  *metric.Counter
    RaftTickingDurationNanos  *metric.Counter
    RaftCommandsApplied       *metric.Counter
    RaftLogCommitLatency      *metric.Histogram
    RaftCommandCommitLatency  *metric.Histogram
    RaftHandleReadyLatency    *metric.Histogram
    RaftApplyCommittedLatency *metric.Histogram

    // Raft message metrics.
    RaftRcvdMsgProp           *metric.Counter
    RaftRcvdMsgApp            *metric.Counter
    RaftRcvdMsgAppResp        *metric.Counter
    RaftRcvdMsgVote           *metric.Counter
    RaftRcvdMsgVoteResp       *metric.Counter
    RaftRcvdMsgPreVote        *metric.Counter
    RaftRcvdMsgPreVoteResp    *metric.Counter
    RaftRcvdMsgSnap           *metric.Counter
    RaftRcvdMsgHeartbeat      *metric.Counter
    RaftRcvdMsgHeartbeatResp  *metric.Counter
    RaftRcvdMsgTransferLeader *metric.Counter
    RaftRcvdMsgTimeoutNow     *metric.Counter
    RaftRcvdMsgDropped        *metric.Counter

    // Raft log metrics.
    RaftLogFollowerBehindCount *metric.Gauge
    RaftLogTruncated           *metric.Counter

    RaftEnqueuedPending            *metric.Gauge
    RaftCoalescedHeartbeatsPending *metric.Gauge

    // Replica queue metrics.
    GCQueueSuccesses                          *metric.Counter
    GCQueueFailures                           *metric.Counter
    GCQueuePending                            *metric.Gauge
    GCQueueProcessingNanos                    *metric.Counter
    MergeQueueSuccesses                       *metric.Counter
    MergeQueueFailures                        *metric.Counter
    MergeQueuePending                         *metric.Gauge
    MergeQueueProcessingNanos                 *metric.Counter
    MergeQueuePurgatory                       *metric.Gauge
    RaftLogQueueSuccesses                     *metric.Counter
    RaftLogQueueFailures                      *metric.Counter
    RaftLogQueuePending                       *metric.Gauge
    RaftLogQueueProcessingNanos               *metric.Counter
    RaftSnapshotQueueSuccesses                *metric.Counter
    RaftSnapshotQueueFailures                 *metric.Counter
    RaftSnapshotQueuePending                  *metric.Gauge
    RaftSnapshotQueueProcessingNanos          *metric.Counter
    ConsistencyQueueSuccesses                 *metric.Counter
    ConsistencyQueueFailures                  *metric.Counter
    ConsistencyQueuePending                   *metric.Gauge
    ConsistencyQueueProcessingNanos           *metric.Counter
    ReplicaGCQueueSuccesses                   *metric.Counter
    ReplicaGCQueueFailures                    *metric.Counter
    ReplicaGCQueuePending                     *metric.Gauge
    ReplicaGCQueueProcessingNanos             *metric.Counter
    ReplicateQueueSuccesses                   *metric.Counter
    ReplicateQueueFailures                    *metric.Counter
    ReplicateQueuePending                     *metric.Gauge
    ReplicateQueueProcessingNanos             *metric.Counter
    ReplicateQueuePurgatory                   *metric.Gauge
    SplitQueueSuccesses                       *metric.Counter
    SplitQueueFailures                        *metric.Counter
    SplitQueuePending                         *metric.Gauge
    SplitQueueProcessingNanos                 *metric.Counter
    SplitQueuePurgatory                       *metric.Gauge
    TimeSeriesMaintenanceQueueSuccesses       *metric.Counter
    TimeSeriesMaintenanceQueueFailures        *metric.Counter
    TimeSeriesMaintenanceQueuePending         *metric.Gauge
    TimeSeriesMaintenanceQueueProcessingNanos *metric.Counter

    // GCInfo cumulative totals.
    GCNumKeysAffected            *metric.Counter
    GCIntentsConsidered          *metric.Counter
    GCIntentTxns                 *metric.Counter
    GCTransactionSpanScanned     *metric.Counter
    GCTransactionSpanGCAborted   *metric.Counter
    GCTransactionSpanGCCommitted *metric.Counter
    GCTransactionSpanGCStaging   *metric.Counter
    GCTransactionSpanGCPending   *metric.Counter
    GCAbortSpanScanned           *metric.Counter
    GCAbortSpanConsidered        *metric.Counter
    GCAbortSpanGCNum             *metric.Counter
    GCPushTxn                    *metric.Counter
    GCResolveTotal               *metric.Counter
    GCResolveSuccess             *metric.Counter

    // Slow request counts.
    SlowLatchRequests *metric.Gauge
    SlowLeaseRequests *metric.Gauge
    SlowRaftRequests  *metric.Gauge

    // Backpressure counts.
    BackpressuredOnSplitRequests *metric.Gauge

    // AddSSTable stats: how many AddSSTable commands were proposed and how many
    // were applied? How many applications required writing a copy?
    AddSSTableProposals           *metric.Counter
    AddSSTableApplications        *metric.Counter
    AddSSTableApplicationCopies   *metric.Counter
    AddSSTableProposalTotalDelay  *metric.Counter
    AddSSTableProposalEngineDelay *metric.Counter

    // Encryption-at-rest stats.
    // EncryptionAlgorithm is an enum representing the cipher in use, so we use a gauge.
    EncryptionAlgorithm *metric.Gauge

    // RangeFeed counts.
    RangeFeedMetrics *rangefeed.Metrics

    // Closed timestamp metrics.
    ClosedTimestampMaxBehindNanos *metric.Gauge
    // contains filtered or unexported fields
}

StoreMetrics is the set of metrics for a given store.

type StorePool Uses

type StorePool struct {
    log.AmbientContext
    // contains filtered or unexported fields
}

StorePool maintains a list of all known stores in the cluster and information on their health.

func NewStorePool Uses

func NewStorePool(
    ambient log.AmbientContext,
    st *cluster.Settings,
    g *gossip.Gossip,
    clock *hlc.Clock,
    nodeCountFn NodeCountFunc,
    nodeLivenessFn NodeLivenessFunc,
    deterministic bool,
) *StorePool

NewStorePool creates a StorePool and registers the store updating callback with gossip.

func (*StorePool) ClusterNodeCount Uses

func (sp *StorePool) ClusterNodeCount() int

ClusterNodeCount returns the number of nodes that are possible allocation targets. This includes dead nodes, but not decommissioning or decommissioned nodes.

func (*StorePool) GetStores Uses

func (sp *StorePool) GetStores() map[roachpb.StoreID]roachpb.StoreDescriptor

GetStores returns information on all the stores with a descriptor in the pool. Stores without a descriptor (e.g. on a node that hasn't come up yet after a cluster restart) will not be part of the returned set.
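
For illustration, a caller holding a *StorePool (sp below is assumed to exist) can enumerate the known descriptors like so:

    // Print capacity information for every store the pool knows about.
    for storeID, desc := range sp.GetStores() {
        fmt.Printf("store %d on node %d: %d/%d bytes available\n",
            storeID, desc.Node.NodeID,
            desc.Capacity.Available, desc.Capacity.Capacity)
    }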

func (*StorePool) String Uses

func (sp *StorePool) String() string

type StoreRebalancer Uses

type StoreRebalancer struct {
    log.AmbientContext
    // contains filtered or unexported fields
}

StoreRebalancer is responsible for examining how the associated store's load compares to the load on other stores in the cluster and transferring leases or replicas away if the local store is overloaded.

This isn't implemented as a Queue because the Queues all operate on one replica at a time, making a local decision about each replica. Queues don't really know how the replica they're looking at compares to other replicas on the store. Our goal is balancing stores, though, so it's preferable to make decisions about each store and then carefully pick replicas to move that will best accomplish the store-level goals.

func NewStoreRebalancer Uses

func NewStoreRebalancer(
    ambientCtx log.AmbientContext,
    st *cluster.Settings,
    rq *replicateQueue,
    replRankings *replicaRankings,
) *StoreRebalancer

NewStoreRebalancer creates a StoreRebalancer to work in tandem with the provided replicateQueue.

func (*StoreRebalancer) Start Uses

func (sr *StoreRebalancer) Start(ctx context.Context, stopper *stop.Stopper)

Start runs an infinite loop in a goroutine which regularly checks whether the store is overloaded along any important dimension (e.g. range count, QPS, disk usage), and if so attempts to correct that by moving leases or replicas elsewhere.

This worker acts on store-level imbalances, whereas the replicate queue makes decisions based on the zone config constraints and diversity of individual ranges. This means that there are two different workers that could potentially be making decisions about a given range, so they have to be careful to avoid stepping on each other's toes.

TODO(a-robinson): Expose metrics to make this understandable without having to dive into logspy.
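
Inside the package, the rebalancer is wired up next to the replicate queue roughly as follows (a sketch only; ambientCtx, st, rq, rankings, ctx, and stopper are assumed to already exist, and the queue and rankings types are unexported):

    // Create the store-level rebalancer and start its background loop.
    sr := NewStoreRebalancer(ambientCtx, st, rq, rankings)
    sr.Start(ctx, stopper)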

type StoreRebalancerMetrics Uses

type StoreRebalancerMetrics struct {
    LeaseTransferCount  *metric.Counter
    RangeRebalanceCount *metric.Counter
}

StoreRebalancerMetrics is the set of metrics for the store-level rebalancer.

type StoreRequestHeader Uses

type StoreRequestHeader struct {
    NodeID  github_com_cockroachdb_cockroach_pkg_roachpb.NodeID  `protobuf:"varint,1,opt,name=node_id,json=nodeId,proto3,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.NodeID" json:"node_id,omitempty"`
    StoreID github_com_cockroachdb_cockroach_pkg_roachpb.StoreID `protobuf:"varint,2,opt,name=store_id,json=storeId,proto3,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.StoreID" json:"store_id,omitempty"`
}

StoreRequestHeader locates a Store on a Node.

func (*StoreRequestHeader) Descriptor Uses

func (*StoreRequestHeader) Descriptor() ([]byte, []int)

func (*StoreRequestHeader) Marshal Uses

func (m *StoreRequestHeader) Marshal() (dAtA []byte, err error)

func (*StoreRequestHeader) MarshalTo Uses

func (m *StoreRequestHeader) MarshalTo(dAtA []byte) (int, error)

func (*StoreRequestHeader) ProtoMessage Uses

func (*StoreRequestHeader) ProtoMessage()

func (*StoreRequestHeader) Reset Uses

func (m *StoreRequestHeader) Reset()

func (*StoreRequestHeader) Size Uses

func (m *StoreRequestHeader) Size() (n int)

func (*StoreRequestHeader) String Uses

func (m *StoreRequestHeader) String() string

func (*StoreRequestHeader) Unmarshal Uses

func (m *StoreRequestHeader) Unmarshal(dAtA []byte) error

func (*StoreRequestHeader) XXX_DiscardUnknown Uses

func (m *StoreRequestHeader) XXX_DiscardUnknown()

func (*StoreRequestHeader) XXX_Marshal Uses

func (m *StoreRequestHeader) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*StoreRequestHeader) XXX_Merge Uses

func (dst *StoreRequestHeader) XXX_Merge(src proto.Message)

func (*StoreRequestHeader) XXX_Size Uses

func (m *StoreRequestHeader) XXX_Size() int

func (*StoreRequestHeader) XXX_Unmarshal Uses

func (m *StoreRequestHeader) XXX_Unmarshal(b []byte) error

type StoreTestingKnobs Uses

type StoreTestingKnobs struct {
    EvalKnobs               storagebase.BatchEvalTestingKnobs
    IntentResolverKnobs     storagebase.IntentResolverTestingKnobs
    TxnWaitKnobs            txnwait.TestingKnobs
    ConsistencyTestingKnobs ConsistencyTestingKnobs

    // TestingRequestFilter is called before evaluating each command on a
    // replica. The filter is run before the request acquires latches, so
    // blocking in the filter will not block interfering requests. If it
    // returns an error, the command will not be evaluated.
    TestingRequestFilter storagebase.ReplicaRequestFilter

    // TestingLatchFilter is called before evaluating each command on a replica
    // but after acquiring latches for the command. Blocking in the filter will
    // block interfering requests. If it returns an error, the command will not
    // be evaluated.
    TestingLatchFilter storagebase.ReplicaRequestFilter

    // TestingProposalFilter is called before proposing each command.
    TestingProposalFilter storagebase.ReplicaProposalFilter

    // TestingApplyFilter is called before applying the results of a
    // command on each replica. If it returns an error, the command will
    // not be applied. If it returns an error on some replicas but not
    // others, the behavior is poorly defined.
    TestingApplyFilter storagebase.ReplicaApplyFilter

    // TestingPostApplyFilter is called after a command is applied to
    // rocksdb but before in-memory side effects have been processed.
    // It is only called on the replica that proposed the command.
    TestingPostApplyFilter storagebase.ReplicaApplyFilter

    // TestingResponseFilter is called after the replica processes a
    // command in order for unittests to modify the batch response,
    // error returned to the client, or to simulate network failures.
    TestingResponseFilter storagebase.ReplicaResponseFilter

    // A hack to manipulate the clock before sending a batch request to a replica.
    // TODO(kaneda): Use of this hook is discouraged. Get rid of it once
    // we make TestServer take a ManualClock.
    ClockBeforeSend func(*hlc.Clock, roachpb.BatchRequest)
    // MaxOffset, if set, overrides the server clock's MaxOffset at server
    // creation time.
    // See also DisableMaxOffsetCheck.
    MaxOffset time.Duration
    // DisableMaxOffsetCheck disables the rejection (in Store.Send) of requests
    // with a timestamp too far in the future. Normally, this rejection is a
    // good sanity check, but certain tests unfortunately insert a "message from
    // the future" into the system to advance the clock of a TestServer. We
    // should get rid of such practices once we make TestServer take a
    // ManualClock.
    DisableMaxOffsetCheck bool
    // DontPreventUseOfOldLeaseOnStart disables the initialization of
    // replica.mu.minLeaseProposedTS on replica.Init(). This has the effect of
    // allowing the replica to use the lease that it had in a previous life (in
    // case the tests persisted the engine used in said previous life).
    DontPreventUseOfOldLeaseOnStart bool
    // DisableAutomaticLeaseRenewal turns off the background worker
    // that attempts to automatically renew expiration-based leases.
    DisableAutomaticLeaseRenewal bool
    // LeaseRequestEvent, if set, is called when replica.requestLeaseLocked() is
    // called to acquire a new lease. This can be used to assert that a request
    // triggers a lease acquisition.
    LeaseRequestEvent func(ts hlc.Timestamp)
    // LeaseTransferBlockedOnExtensionEvent, if set, is called when
    // replica.TransferLease() encounters an in-progress lease extension.
    // nextLeader is the replica that we're trying to transfer the lease to.
    LeaseTransferBlockedOnExtensionEvent func(nextLeader roachpb.ReplicaDescriptor)
    // DisableGCQueue disables the GC queue.
    DisableGCQueue bool
    // DisableMergeQueue disables the merge queue.
    DisableMergeQueue bool
    // DisableRaftLogQueue disables the Raft log queue.
    DisableRaftLogQueue bool
    // DisableReplicaGCQueue disables the replica GC queue.
    DisableReplicaGCQueue bool
    // DisableReplicateQueue disables the replication queue.
    DisableReplicateQueue bool
    // DisableReplicaRebalancing disables rebalancing of replicas but otherwise
    // leaves the replicate queue operational.
    DisableReplicaRebalancing bool
    // DisableLoadBasedSplitting turns off LBS so no splits happen because of load.
    DisableLoadBasedSplitting bool
    // DisableSplitQueue disables the split queue.
    DisableSplitQueue bool
    // DisableTimeSeriesMaintenanceQueue disables the time series maintenance
    // queue.
    DisableTimeSeriesMaintenanceQueue bool
    // DisableRaftSnapshotQueue disables the raft snapshot queue.
    DisableRaftSnapshotQueue bool
    // DisableConsistencyQueue disables the consistency checker.
    DisableConsistencyQueue bool
    // DisableScanner disables the replica scanner.
    DisableScanner bool
    // DisablePeriodicGossips disables periodic gossiping.
    DisablePeriodicGossips bool
    // DisableLeaderFollowsLeaseholder disables attempts to transfer raft
    // leadership when it diverges from the range's leaseholder.
    DisableLeaderFollowsLeaseholder bool
    // DisableRefreshReasonNewLeader disables refreshing pending commands when a new
    // leader is discovered.
    DisableRefreshReasonNewLeader bool
    // DisableRefreshReasonNewLeaderOrConfigChange disables refreshing pending
    // commands when a new leader is discovered or when a config change is
    // dropped.
    DisableRefreshReasonNewLeaderOrConfigChange bool
    // DisableRefreshReasonSnapshotApplied disables refreshing pending commands
    // when a snapshot is applied.
    DisableRefreshReasonSnapshotApplied bool
    // DisableRefreshReasonTicks disables refreshing pending commands
    // periodically.
    DisableRefreshReasonTicks bool
    // DisableEagerReplicaRemoval prevents the Replica from destroying itself
    // when it encounters a ChangeReplicasTrigger which would remove it or when
    // a ReplicaTooOldError in a RaftMessageResponse would lead to removal.
    // This option can lead to nasty cases during shutdown where a replica will
    // spin attempting to acquire a split or merge lock on a RHS which will
    // always fail and is generally not safe but is useful for testing.
    DisableEagerReplicaRemoval bool
    // RefreshReasonTicksPeriod overrides the default period over which
    // pending commands are refreshed. The period is specified as a multiple
    // of Raft group ticks.
    RefreshReasonTicksPeriod int
    // DisableProcessRaft disables the process raft loop.
    DisableProcessRaft bool
    // DisableLastProcessedCheck disables checking on replica queue last processed times.
    DisableLastProcessedCheck bool
    // ReplicateQueueAcceptsUnsplit allows the replication queue to
    // process ranges that need to be split, for use in tests that use
    // the replication queue but disable the split queue.
    ReplicateQueueAcceptsUnsplit bool
    // SplitQueuePurgatoryChan allows a test to control the channel used to
    // trigger split queue purgatory processing.
    SplitQueuePurgatoryChan <-chan time.Time
    // SkipMinSizeCheck, if set, makes the store creation process skip the check
    // for a minimum size.
    SkipMinSizeCheck bool
    // DisableLeaseCapacityGossip disables the ability of a changing number of
    // leases to trigger the store to gossip its capacity. With this enabled,
    // only changes in the number of replicas can cause the store to gossip its
    // capacity.
    DisableLeaseCapacityGossip bool
    // SystemLogsGCPeriod is used to override the period of GC of system logs.
    SystemLogsGCPeriod time.Duration
    // SystemLogsGCGCDone is used to notify when system logs GC is done.
    SystemLogsGCGCDone chan<- struct{}
    // DontRetryPushTxnFailures will propagate a push txn failure immediately
    // instead of utilizing the txn wait queue to wait for the transaction to
    // finish or be pushed by a higher priority contender.
    DontRetryPushTxnFailures bool
    // DontRecoverIndeterminateCommits will propagate indeterminate commit
    // errors from failed txn pushes immediately instead of utilizing the txn
    // recovery manager to recover from the indeterminate state.
    DontRecoverIndeterminateCommits bool
    // TraceAllRaftEvents enables raft event tracing even when the current
    // vmodule would not have enabled it.
    TraceAllRaftEvents bool
    // EnableUnconditionalRefreshesInRaftReady will always set the refresh reason
    // in handleRaftReady to refreshReasonNewLeaderOrConfigChange.
    EnableUnconditionalRefreshesInRaftReady bool

    // ReceiveSnapshot is run after receiving a snapshot header but before
    // acquiring snapshot quota or doing shouldAcceptSnapshotData checks. If an
    // error is returned from the hook, it's sent as an ERROR SnapshotResponse.
    ReceiveSnapshot func(*SnapshotRequest_Header) error
    // ReplicaAddSkipLearnerRollback causes replica addition to skip the learner
    // rollback that happens when promotion to a voter fails.
    ReplicaAddSkipLearnerRollback func() bool
    // ReplicaAddStopAfterLearnerSnapshot causes replica addition to return early
    // if the func returns true. Specifically, after the learner txn is successful
    // and after the LEARNER type snapshot, but before promoting it to a voter.
    // This ensures the `*Replica` will be materialized on the Store when it
    // returns.
    ReplicaAddStopAfterLearnerSnapshot func([]roachpb.ReplicationTarget) bool
    // ReplicaSkipLearnerSnapshot causes snapshots to never be sent to learners
    // if the func returns true. Adding replicas proceeds as usual, though if
    // the added replica has no prior state which can be caught up from the raft
    // log, the result will be a voter that is unable to participate in quorum.
    ReplicaSkipLearnerSnapshot func() bool
    // ReplicaAddStopAfterJointConfig causes replica addition to return early if
    // the func returns true. This happens before transitioning out of a joint
    // configuration, after the joint configuration has been entered by means
    // of a first ChangeReplicas transaction. If the replication change does
    // not use joint consensus, this early return is identical to the regular
    // return path.
    ReplicaAddStopAfterJointConfig func() bool
    // ReplicationAlwaysUseJointConfig causes replica addition to always go
    // through a joint configuration, even when this isn't necessary (because
    // the replication change affects only one replica).
    ReplicationAlwaysUseJointConfig func() bool
    // BeforeSnapshotSSTIngestion is run just before the SSTs are ingested when
    // applying a snapshot.
    BeforeSnapshotSSTIngestion func(IncomingSnapshot, SnapshotRequest_Type, []string) error
    // BeforeRelocateOne intercepts the return values of s.relocateOne before
    // they're put into effect.
    BeforeRelocateOne func(_ []roachpb.ReplicationChange, leaseTarget *roachpb.ReplicationTarget, _ error)

    // MaxApplicationBatchSize enforces a maximum size on application batches.
    // This can be useful for testing conditions which require commands to be
    // applied in separate batches.
    MaxApplicationBatchSize int
}

StoreTestingKnobs is a part of the context used to control parts of the system. The Testing*Filter functions are called at various points in the request pipeline if they are non-nil. These can be used either for synchronization (e.g. to write to a channel when a particular point is reached) or to change the behavior by returning an error (which aborts all further processing for the command).
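
For instance, a test that wants to drive splits and merges by hand can disable the corresponding queues through the knobs (a minimal sketch using only boolean knobs):

    // Test config with the split and merge queues turned off so the
    // test can trigger splits and merges explicitly.
    cfg := TestStoreConfig(hlc.NewClock(hlc.UnixNano, time.Nanosecond))
    cfg.TestingKnobs = StoreTestingKnobs{
        DisableSplitQueue: true,
        DisableMergeQueue: true,
    }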

func (*StoreTestingKnobs) ModuleTestingKnobs Uses

func (*StoreTestingKnobs) ModuleTestingKnobs()

ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface.

type Stores Uses

type Stores struct {
    log.AmbientContext
    // contains filtered or unexported fields
}

Stores provides methods to access a collection of stores. There's a visitor pattern and also an implementation of the client.Sender interface which directs a call to the appropriate store based on the call's key range. Stores also implements the gossip.Storage interface, which allows gossip bootstrap information to be persisted consistently to every store and the most recent bootstrap information to be read at node startup.

func NewStores Uses

func NewStores(
    ambient log.AmbientContext, clock *hlc.Clock, minVersion, serverVersion roachpb.Version,
) *Stores

NewStores returns a local-only sender which directly accesses a collection of stores.

func (*Stores) AddStore Uses

func (ls *Stores) AddStore(s *Store)

AddStore adds the specified store to the store map.
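
Putting these together, node startup conceptually builds the collection as follows (a sketch under assumed variable names; the real wiring lives in the server package):

    // Create the collection, register a bootstrapped store, and look it
    // up again by ID.
    ls := NewStores(ambientCtx, clock, minVersion, serverVersion)
    ls.AddStore(store) // *Store created elsewhere
    if _, err := ls.GetStore(store.StoreID()); err != nil {
        log.Fatalf(ctx, "%v", err)
    }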

func (*Stores) GetReplicaForRangeID Uses

func (ls *Stores) GetReplicaForRangeID(rangeID roachpb.RangeID) (*Replica, error)

GetReplicaForRangeID returns the replica which contains the specified range, or nil if it's not found.

func (*Stores) GetStore Uses

func (ls *Stores) GetStore(storeID roachpb.StoreID) (*Store, error)

GetStore looks up the store by store ID. Returns an error if not found.

func (*Stores) GetStoreCount Uses

func (ls *Stores) GetStoreCount() int

GetStoreCount returns the number of stores this node is exporting.

func (*Stores) HasStore Uses

func (ls *Stores) HasStore(storeID roachpb.StoreID) bool

HasStore returns true if the specified store is owned by this Stores.

func (*Stores) OnClusterVersionChange Uses

func (ls *Stores) OnClusterVersionChange(ctx context.Context, cv cluster.ClusterVersion) error

OnClusterVersionChange is invoked when the running node receives a notification indicating that the cluster version has changed. It checks the currently persisted version and updates if it is older than the provided update.

func (*Stores) RangeFeed Uses

func (ls *Stores) RangeFeed(
    args *roachpb.RangeFeedRequest, stream roachpb.Internal_RangeFeedServer,
) *roachpb.Error

RangeFeed registers a rangefeed over the specified span. It sends updates to the provided stream and returns with an optional error when the rangefeed is complete.

func (*Stores) ReadBootstrapInfo Uses

func (ls *Stores) ReadBootstrapInfo(bi *gossip.BootstrapInfo) error

ReadBootstrapInfo implements the gossip.Storage interface. Read attempts to read gossip bootstrap info from every known store and finds the most recent from all stores to initialize the bootstrap info argument. Returns an error on any issues reading data for the stores (but excluding the case in which no data has been persisted yet).

func (*Stores) RemoveStore Uses

func (ls *Stores) RemoveStore(s *Store)

RemoveStore removes the specified store from the store map.

func (*Stores) Send Uses

func (ls *Stores) Send(
    ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error)

Send implements the client.Sender interface. The store is looked up from the store map using the ID specified in the request.

func (*Stores) SynthesizeClusterVersion Uses

func (ls *Stores) SynthesizeClusterVersion(ctx context.Context) (cluster.ClusterVersion, error)

SynthesizeClusterVersion reads and returns the ClusterVersion protobuf written to any of the configured stores (all of which are bootstrapped). The returned value is also replicated to all stores for consistency, in case a new store was added or an old store re-configured. If the versions across the stores are not identical, a version carrying the smallest Version is returned.

If there aren't any stores, returns the minimum supported version of the binary.

func (*Stores) VisitStores Uses

func (ls *Stores) VisitStores(visitor func(s *Store) error) error

VisitStores implements a visitor pattern over stores in the storeMap. The specified function is invoked with each store in turn. Care is taken to invoke the visitor func without the lock held to avoid inconsistent lock orderings, as some visitor functions may call back into the Stores object. Stores are visited in random order.
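
A typical use is aggregating per-store information, for example (a minimal sketch; ls is an existing *Stores):

    // Collect the IDs of every store hosted by this node.
    var storeIDs []roachpb.StoreID
    if err := ls.VisitStores(func(s *Store) error {
        storeIDs = append(storeIDs, s.StoreID())
        return nil
    }); err != nil {
        log.Fatalf(ctx, "%v", err)
    }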

func (*Stores) WriteBootstrapInfo Uses

func (ls *Stores) WriteBootstrapInfo(bi *gossip.BootstrapInfo) error

WriteBootstrapInfo implements the gossip.Storage interface. Write persists the supplied bootstrap info to every known store. Returns nil on success; otherwise returns first error encountered writing to the stores.
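
Together with ReadBootstrapInfo, this gives gossip a durable copy of its bootstrap state across restarts; conceptually (a sketch with placeholder bootstrap info):

    // Persist bootstrap info to every store, then read the most recent
    // copy back (as would happen on the next startup).
    var bi gossip.BootstrapInfo
    if err := ls.WriteBootstrapInfo(&bi); err != nil {
        log.Fatalf(ctx, "%v", err)
    }
    var read gossip.BootstrapInfo
    if err := ls.ReadBootstrapInfo(&read); err != nil {
        log.Fatalf(ctx, "%v", err)
    }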

func (*Stores) WriteClusterVersion Uses

func (ls *Stores) WriteClusterVersion(ctx context.Context, cv cluster.ClusterVersion) error

WriteClusterVersion persists the supplied ClusterVersion to every configured store. Returns nil on success; otherwise returns first error encountered writing to the stores.

WriteClusterVersion makes no attempt to validate the supplied version.

type TimeSeriesDataStore Uses

type TimeSeriesDataStore interface {
    ContainsTimeSeries(roachpb.RKey, roachpb.RKey) bool
    MaintainTimeSeries(
        context.Context,
        engine.Reader,
        roachpb.RKey,
        roachpb.RKey,
        *client.DB,
        *mon.BytesMonitor,
        int64,
        hlc.Timestamp,
    ) error
}

TimeSeriesDataStore is an interface defined in the storage package that can be implemented by the higher-level time series system. This allows the storage queues to run periodic time series maintenance; importantly, this maintenance can then be informed by data from the local store.
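
A higher-level package satisfies this interface with its time series store; a no-op stub (purely illustrative) would look like:

    // noopTSDataStore is a do-nothing TimeSeriesDataStore, e.g. for
    // tests that do not exercise time series maintenance.
    type noopTSDataStore struct{}

    func (noopTSDataStore) ContainsTimeSeries(roachpb.RKey, roachpb.RKey) bool {
        return false
    }

    func (noopTSDataStore) MaintainTimeSeries(
        _ context.Context,
        _ engine.Reader,
        _, _ roachpb.RKey,
        _ *client.DB,
        _ *mon.BytesMonitor,
        _ int64,
        _ hlc.Timestamp,
    ) error {
        return nil
    }

    var _ TimeSeriesDataStore = noopTSDataStore{}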

type WaitForApplicationRequest Uses

type WaitForApplicationRequest struct {
    StoreRequestHeader `protobuf:"bytes,1,opt,name=header,proto3,embedded=header" json:"header"`
    RangeID            github_com_cockroachdb_cockroach_pkg_roachpb.RangeID `protobuf:"varint,2,opt,name=range_id,json=rangeId,proto3,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.RangeID" json:"range_id,omitempty"`
    LeaseIndex         uint64                                               `protobuf:"varint,3,opt,name=lease_index,json=leaseIndex,proto3" json:"lease_index,omitempty"`
}

WaitForApplicationRequest blocks until the addressed replica has applied the command with the specified lease index.
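
A client addressing a specific replica populates the request along these lines (a minimal sketch; the IDs are placeholders):

    // Ask node 1, store 1 to wait until range 42 has applied the
    // command at lease index 7.
    req := &WaitForApplicationRequest{
        StoreRequestHeader: StoreRequestHeader{
            NodeID:  1,
            StoreID: 1,
        },
        RangeID:    42,
        LeaseIndex: 7,
    }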

func (*WaitForApplicationRequest) Descriptor Uses

func (*WaitForApplicationRequest) Descriptor() ([]byte, []int)

func (*WaitForApplicationRequest) Marshal Uses

func (m *WaitForApplicationRequest) Marshal() (dAtA []byte, err error)

func (*WaitForApplicationRequest) MarshalTo Uses

func (m *WaitForApplicationRequest) MarshalTo(dAtA []byte) (int, error)

func (*WaitForApplicationRequest) ProtoMessage Uses

func (*WaitForApplicationRequest) ProtoMessage()

func (*WaitForApplicationRequest) Reset Uses

func (m *WaitForApplicationRequest) Reset()

func (*WaitForApplicationRequest) Size Uses

func (m *WaitForApplicationRequest) Size() (n int)

func (*WaitForApplicationRequest) String Uses

func (m *WaitForApplicationRequest) String() string

func (*WaitForApplicationRequest) Unmarshal Uses

func (m *WaitForApplicationRequest) Unmarshal(dAtA []byte) error

func (*WaitForApplicationRequest) XXX_DiscardUnknown Uses

func (m *WaitForApplicationRequest) XXX_DiscardUnknown()

func (*WaitForApplicationRequest) XXX_Marshal Uses

func (m *WaitForApplicationRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*WaitForApplicationRequest) XXX_Merge Uses

func (dst *WaitForApplicationRequest) XXX_Merge(src proto.Message)

func (*WaitForApplicationRequest) XXX_Size Uses

func (m *WaitForApplicationRequest) XXX_Size() int

func (*WaitForApplicationRequest) XXX_Unmarshal Uses

func (m *WaitForApplicationRequest) XXX_Unmarshal(b []byte) error

type WaitForApplicationResponse Uses

type WaitForApplicationResponse struct {
}

func (*WaitForApplicationResponse) Descriptor Uses

func (*WaitForApplicationResponse) Descriptor() ([]byte, []int)

func (*WaitForApplicationResponse) Marshal Uses

func (m *WaitForApplicationResponse) Marshal() (dAtA []byte, err error)

func (*WaitForApplicationResponse) MarshalTo Uses

func (m *WaitForApplicationResponse) MarshalTo(dAtA []byte) (int, error)

func (*WaitForApplicationResponse) ProtoMessage Uses

func (*WaitForApplicationResponse) ProtoMessage()

func (*WaitForApplicationResponse) Reset Uses

func (m *WaitForApplicationResponse) Reset()

func (*WaitForApplicationResponse) Size Uses

func (m *WaitForApplicationResponse) Size() (n int)

func (*WaitForApplicationResponse) String Uses

func (m *WaitForApplicationResponse) String() string

func (*WaitForApplicationResponse) Unmarshal Uses

func (m *WaitForApplicationResponse) Unmarshal(dAtA []byte) error

func (*WaitForApplicationResponse) XXX_DiscardUnknown Uses

func (m *WaitForApplicationResponse) XXX_DiscardUnknown()

func (*WaitForApplicationResponse) XXX_Marshal Uses

func (m *WaitForApplicationResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*WaitForApplicationResponse) XXX_Merge Uses

func (dst *WaitForApplicationResponse) XXX_Merge(src proto.Message)

func (*WaitForApplicationResponse) XXX_Size Uses

func (m *WaitForApplicationResponse) XXX_Size() int

func (*WaitForApplicationResponse) XXX_Unmarshal Uses

func (m *WaitForApplicationResponse) XXX_Unmarshal(b []byte) error

type WaitForReplicaInitRequest Uses

type WaitForReplicaInitRequest struct {
    StoreRequestHeader `protobuf:"bytes,1,opt,name=header,proto3,embedded=header" json:"header"`
    RangeID            github_com_cockroachdb_cockroach_pkg_roachpb.RangeID `protobuf:"varint,2,opt,name=range_id,json=rangeId,proto3,casttype=github.com/cockroachdb/cockroach/pkg/roachpb.RangeID" json:"range_id,omitempty"`
}

func (*WaitForReplicaInitRequest) Descriptor Uses

func (*WaitForReplicaInitRequest) Descriptor() ([]byte, []int)

func (*WaitForReplicaInitRequest) Marshal Uses

func (m *WaitForReplicaInitRequest) Marshal() (dAtA []byte, err error)

func (*WaitForReplicaInitRequest) MarshalTo Uses

func (m *WaitForReplicaInitRequest) MarshalTo(dAtA []byte) (int, error)

func (*WaitForReplicaInitRequest) ProtoMessage Uses

func (*WaitForReplicaInitRequest) ProtoMessage()

func (*WaitForReplicaInitRequest) Reset Uses

func (m *WaitForReplicaInitRequest) Reset()

func (*WaitForReplicaInitRequest) Size Uses

func (m *WaitForReplicaInitRequest) Size() (n int)

func (*WaitForReplicaInitRequest) String Uses

func (m *WaitForReplicaInitRequest) String() string

func (*WaitForReplicaInitRequest) Unmarshal Uses

func (m *WaitForReplicaInitRequest) Unmarshal(dAtA []byte) error

func (*WaitForReplicaInitRequest) XXX_DiscardUnknown Uses

func (m *WaitForReplicaInitRequest) XXX_DiscardUnknown()

func (*WaitForReplicaInitRequest) XXX_Marshal Uses

func (m *WaitForReplicaInitRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*WaitForReplicaInitRequest) XXX_Merge Uses

func (dst *WaitForReplicaInitRequest) XXX_Merge(src proto.Message)

func (*WaitForReplicaInitRequest) XXX_Size Uses

func (m *WaitForReplicaInitRequest) XXX_Size() int

func (*WaitForReplicaInitRequest) XXX_Unmarshal Uses

func (m *WaitForReplicaInitRequest) XXX_Unmarshal(b []byte) error

type WaitForReplicaInitResponse Uses

type WaitForReplicaInitResponse struct {
}

func (*WaitForReplicaInitResponse) Descriptor Uses

func (*WaitForReplicaInitResponse) Descriptor() ([]byte, []int)

func (*WaitForReplicaInitResponse) Marshal Uses

func (m *WaitForReplicaInitResponse) Marshal() (dAtA []byte, err error)

func (*WaitForReplicaInitResponse) MarshalTo Uses

func (m *WaitForReplicaInitResponse) MarshalTo(dAtA []byte) (int, error)

func (*WaitForReplicaInitResponse) ProtoMessage Uses

func (*WaitForReplicaInitResponse) ProtoMessage()

func (*WaitForReplicaInitResponse) Reset Uses

func (m *WaitForReplicaInitResponse) Reset()

func (*WaitForReplicaInitResponse) Size Uses

func (m *WaitForReplicaInitResponse) Size() (n int)

func (*WaitForReplicaInitResponse) String Uses

func (m *WaitForReplicaInitResponse) String() string

func (*WaitForReplicaInitResponse) Unmarshal Uses

func (m *WaitForReplicaInitResponse) Unmarshal(dAtA []byte) error

func (*WaitForReplicaInitResponse) XXX_DiscardUnknown Uses

func (m *WaitForReplicaInitResponse) XXX_DiscardUnknown()

func (*WaitForReplicaInitResponse) XXX_Marshal Uses

func (m *WaitForReplicaInitResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*WaitForReplicaInitResponse) XXX_Merge Uses

func (dst *WaitForReplicaInitResponse) XXX_Merge(src proto.Message)

func (*WaitForReplicaInitResponse) XXX_Size Uses

func (m *WaitForReplicaInitResponse) XXX_Size() int

func (*WaitForReplicaInitResponse) XXX_Unmarshal Uses

func (m *WaitForReplicaInitResponse) XXX_Unmarshal(b []byte) error

Directories

Path                           Synopsis
abortspan
apply                          Package apply provides abstractions and routines associated with the application of committed raft entries to a replicated state machine.
batcheval
batcheval/result
bulk
closedts                       Package closedts houses the interfaces and basic definitions used by the various components of the closed timestamp subsystems.
closedts/container
closedts/ctpb
closedts/minprop               Package minprop exports a main data structure, Tracker, which checkpoints closed timestamps and associated Raft Lease Applied indexes positions for which (under additional conditions) it is legal to serve follower reads.
closedts/provider
closedts/provider/testutils
closedts/storage
closedts/transport
closedts/transport/testutils
cloud
compactor
constraint
copysets                       Package copysets provides an implementation of copysets presented in https://web.stanford.edu/~skatti/pubs/usenix13-copysets.pdf.
diskmap
engine                         Package engine provides low-level storage.
engine/enginepb
idalloc
intentresolver
raftentry                      Package raftentry provides a cache for entries to avoid extra deserializations.
rangefeed
rditer
reports
spanlatch                      Package spanlatch provides a latch management structure for serializing access to keys and key ranges.
spanset
split
stateloader
storagebase
storagepb
tscache                        Package tscache provides a timestamp cache structure that records the maximum timestamp that key ranges were read from and written to.
txnrecovery
txnwait

Package storage imports 101 packages and is imported by 93 packages. Updated 2019-11-12.