slim: github.com/openacid/slim/trie Index | Examples | Files | Directories

package trie

import "github.com/openacid/slim/trie"

Package trie provides SlimTrie implementation.

A SlimTrie is a static, compressed Trie data structure. It removes unnecessary trie-node(single branch node etc). And it internally uses 3 compacted array to store a trie.

SlimTrie memory overhead is about 14 bits per key(without value), or less.

Key value map or key-range value map

SlimTrie is natively something like a key value map. Actually besides as a key value map, to index a map of key range to value with SlimTrie is also very simple:

Gives a set of key the same value, and use RangeGet() instead of Get(). SlimTrie does not store branches for adjacent leaves with the same value.

See SlimTrie.RangeGet .

False Positive

Just like Bloomfilter, SlimTrie does not contain full information of keys, thus there could be a false positive return: It returns some value and "true" but the key is not in there.

Code:

package main

import (
    "fmt"

    "github.com/openacid/low/size"
    "github.com/openacid/slim/encode"
)

var (
    m = encode.String16{}
)

func main() {

    data := make([]byte, 0)

    keys := getKeys("50kl10")
    offsets := []uint16{}

    for i, k := range keys {
        v := fmt.Sprintf("is the %d-th word", i)

        offsets = append(offsets, uint16(len(data)))

        data = append(data, m.Encode(k)...)
        data = append(data, m.Encode(v)...)

    }

    vsize := 2.0

    st, _ := NewSlimTrie(encode.U16{}, keys, offsets)
    ksize := size.Of(keys)

    sz := size.Of(st)

    avgIdxLen := float64(sz)/float64(len(keys)) - vsize
    avgKeyLen := float64(ksize) / float64(len(keys))

    ratio := avgIdxLen / avgKeyLen * 100

    fmt.Printf(
        "Orignal:: %.1f byte/key --> SlimTrie index: %.1f byte/index\n"+
            "Saved %.1f%%",
        avgKeyLen,
        avgIdxLen,
        100-ratio,
    )

}

Index

Examples

Package Files

bitmap.go errors.go gen_report.go nodes.pb.go prefix.go select.go slimtrie.go slimtrie_create.go slimtrie_getint.go slimtrie_marshal.go slimtrie_query.go slimtrie_str.go slimtrie_ver.go

Constants

const (
    // MaxNodeCnt is the max number of node. Node id in SlimTrie is int32.
    MaxNodeCnt = (1 << 31) - 1

    // NodeTypeInner represents an inner node.
    //
    // Since 0.5.10
    NodeTypeInner = int32(1)

    // NodeTypeLeaf represents a leaf node.
    //
    // Since 0.5.10
    NodeTypeLeaf = int32(0)
)

Variables

var (

    // ErrKeyOutOfOrder means keys to create Trie are not ascendingly ordered.
    ErrKeyOutOfOrder = errors.New("keys not ascending sorted")

    // ErrIncompatible means it is trying to unmarshal data from an incompatible
    // version.
    ErrIncompatible = errors.New("incompatible with marshaled data")
)

func Bool Uses

func Bool(v bool) *bool

type Bitmap Uses

type Bitmap struct {
    // Words contains bitmap
    //
    // Since 0.5.10
    Words []uint64 `protobuf:"varint,20,rep,packed,name=Words,proto3" json:"Words,omitempty"`
    // RankIndex speeds up rank() by pre-calculated it
    //
    // Since 0.5.10
    RankIndex []int32 `protobuf:"varint,30,rep,packed,name=RankIndex,proto3" json:"RankIndex,omitempty"`
    // SelectIndex speeds up select() by pre-calculated it
    //
    // Since 0.5.10
    SelectIndex          []int32  `protobuf:"varint,40,rep,packed,name=SelectIndex,proto3" json:"SelectIndex,omitempty"`
    XXX_NoUnkeyedLiteral struct{} `json:"-"`
    XXX_unrecognized     []byte   `json:"-"`
    XXX_sizecache        int32    `json:"-"`
}

Bitmap is an array of bits.

Since 0.5.10

func (*Bitmap) Descriptor Uses

func (*Bitmap) Descriptor() ([]byte, []int)

func (*Bitmap) GetRankIndex Uses

func (m *Bitmap) GetRankIndex() []int32

func (*Bitmap) GetSelectIndex Uses

func (m *Bitmap) GetSelectIndex() []int32

func (*Bitmap) GetWords Uses

func (m *Bitmap) GetWords() []uint64

func (*Bitmap) ProtoMessage Uses

func (*Bitmap) ProtoMessage()

func (*Bitmap) Reset Uses

func (m *Bitmap) Reset()

func (*Bitmap) String Uses

func (m *Bitmap) String() string

func (*Bitmap) XXX_DiscardUnknown Uses

func (m *Bitmap) XXX_DiscardUnknown()

func (*Bitmap) XXX_Marshal Uses

func (m *Bitmap) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*Bitmap) XXX_Merge Uses

func (dst *Bitmap) XXX_Merge(src proto.Message)

func (*Bitmap) XXX_Size Uses

func (m *Bitmap) XXX_Size() int

func (*Bitmap) XXX_Unmarshal Uses

func (m *Bitmap) XXX_Unmarshal(b []byte) error

type Nodes Uses

type Nodes struct {
    // BigInnerCnt is number of big (257 bit) inner node.
    //
    // Since 0.5.10
    BigInnerCnt int32 `protobuf:"varint,11,opt,name=BigInnerCnt,proto3" json:"BigInnerCnt,omitempty"`
    // BigInnerOffset is the offset caused by "BigInner" nodes:
    //
    // Supposing that the i-th inner node is the j-th short inner node(an inner
    // node can be a short).
    //
    // The offset of this node in "Inners" is
    //
    //     257 * BigInnerCnt +
    //     17 * (i-BigInnerCnt-j) +
    //     ShortSize * j
    //
    // Thus we could create 2 variables to reduce offset calculation time:
    //
    //     BigInnerOffset = (257 - 17) * BigInnerCnt
    //     ShortMinusInner = ShortSize - 17
    //
    // The the offset is:
    //
    //     BigInnerOffset + 17 * i + ShortMinusInner * j
    //
    // Since 0.5.10
    BigInnerOffset int32 `protobuf:"varint,12,opt,name=BigInnerOffset,proto3" json:"BigInnerOffset,omitempty"`
    // ShortMinusInner is ShortSize minus 17.
    // See BigInnerOffset.
    //
    // Since 0.5.10
    ShortMinusInner int32 `protobuf:"varint,13,opt,name=ShortMinusInner,proto3" json:"ShortMinusInner,omitempty"`
    // ShortSize is the number of bit of short bitmap that reduce most memory
    // cost.
    //
    // Since 0.5.10
    ShortSize int32 `protobuf:"varint,14,opt,name=ShortSize,proto3" json:"ShortSize,omitempty"`
    // ShortMask has the lower ShortSize bit set to 1.
    //
    // Since 0.5.10
    ShortMask uint64 `protobuf:"varint,15,opt,name=ShortMask,proto3" json:"ShortMask,omitempty"`
    // NodeTypeBM is a bitmap in which a "1" indicates the i-th node is an inner
    // node, otherwise it is a leaf.
    //
    // Since 0.5.10
    NodeTypeBM *Bitmap `protobuf:"bytes,20,opt,name=NodeTypeBM,proto3" json:"NodeTypeBM,omitempty"`
    // Inners is a array of var-length node label bitmaps.
    // The size of an element bitmap is aligned to 4.
    //
    // Since 0.5.10
    Inners *Bitmap `protobuf:"bytes,30,opt,name=Inners,proto3" json:"Inners,omitempty"`
    // ShortBM indicate most used inner node bitmaps.
    // These nodes takes 4 bits and the actual bitmaps are stored separate.
    //
    // Since 0.5.10
    ShortBM *Bitmap `protobuf:"bytes,31,opt,name=ShortBM,proto3" json:"ShortBM,omitempty"`
    // ShortTable is a mapping of short bitmap to full 17-bit bitmap.
    //
    // Since 0.5.10
    ShortTable []uint32 `protobuf:"varint,32,rep,packed,name=ShortTable,proto3" json:"ShortTable,omitempty"`
    // InnerPrefixes of inner nodes.
    // There are two usages with this field:
    // - If inner node prefix is stored, it is a var-len array of stored prefix string.
    // - If only inner node prefix length is stored, it is a array with fixed-size elts. An array elt is the length in 4-bit of a prefix.
    //
    // In full-prefix mode:
    // An array element is a control byte followed by several data bytes.
    //
    // The 0-th bit in the control byte indicates whether a prefix is
    // truncated(not aligned to 8-bit).
    //
    // An inner node may have a prefix, if the starting bit of the node > the end
    // of previous node.
    //
    // The end of a prefix may not be 8-bit aligned.
    // Thus we need a bitmap to indicated this.
    // If prefix length is not 8-bit aligned, the trailing bits a filled with a
    // "1" followed by "0"s.
    // To retrieve the accurate prefix, remove the bits from the last "1".
    // E.g.:
    //
    //   prefix:                  11001100 11000011
    //   stored prefix:  00000000 11001100 11010011;  control byte = 0
    //
    //   prefix:                  11001100 110
    //   stored prefix:  00000001 11001100 11010000;  control byte = 1
    //
    // Since 0.5.10
    InnerPrefixes *VLenArray `protobuf:"bytes,38,opt,name=InnerPrefixes,proto3" json:"InnerPrefixes,omitempty"`
    // LeafPrefixes stores prefix of every leaf if it is not nil.
    // A leaf prefix unlike inner node prefix, is just a byte sequence, without a control byte.
    //
    // Since 0.5.10
    LeafPrefixes *VLenArray `protobuf:"bytes,58,opt,name=LeafPrefixes,proto3" json:"LeafPrefixes,omitempty"`
    // Leaves stores serialized leaf values.
    //
    // Since 0.5.10
    Leaves               *VLenArray `protobuf:"bytes,60,opt,name=Leaves,proto3" json:"Leaves,omitempty"`
    XXX_NoUnkeyedLiteral struct{}   `json:"-"`
    XXX_unrecognized     []byte     `json:"-"`
    XXX_sizecache        int32      `json:"-"`
}

Nodes is array of all inner nodes in slim trie. It is NOT a public type and do not rely on it. Since protobuf just makes all message public.

Since 0.5.10

func (*Nodes) Descriptor Uses

func (*Nodes) Descriptor() ([]byte, []int)

func (*Nodes) GetBigInnerCnt Uses

func (m *Nodes) GetBigInnerCnt() int32

func (*Nodes) GetBigInnerOffset Uses

func (m *Nodes) GetBigInnerOffset() int32

func (*Nodes) GetInnerPrefixes Uses

func (m *Nodes) GetInnerPrefixes() *VLenArray

func (*Nodes) GetInners Uses

func (m *Nodes) GetInners() *Bitmap

func (*Nodes) GetLeafPrefixes Uses

func (m *Nodes) GetLeafPrefixes() *VLenArray

func (*Nodes) GetLeaves Uses

func (m *Nodes) GetLeaves() *VLenArray

func (*Nodes) GetNodeTypeBM Uses

func (m *Nodes) GetNodeTypeBM() *Bitmap

func (*Nodes) GetShortBM Uses

func (m *Nodes) GetShortBM() *Bitmap

func (*Nodes) GetShortMask Uses

func (m *Nodes) GetShortMask() uint64

func (*Nodes) GetShortMinusInner Uses

func (m *Nodes) GetShortMinusInner() int32

func (*Nodes) GetShortSize Uses

func (m *Nodes) GetShortSize() int32

func (*Nodes) GetShortTable Uses

func (m *Nodes) GetShortTable() []uint32

func (*Nodes) GetVersion Uses

func (va *Nodes) GetVersion() string

func (*Nodes) ProtoMessage Uses

func (*Nodes) ProtoMessage()

func (*Nodes) Reset Uses

func (m *Nodes) Reset()

func (*Nodes) String Uses

func (m *Nodes) String() string

func (*Nodes) XXX_DiscardUnknown Uses

func (m *Nodes) XXX_DiscardUnknown()

func (*Nodes) XXX_Marshal Uses

func (m *Nodes) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*Nodes) XXX_Merge Uses

func (dst *Nodes) XXX_Merge(src proto.Message)

func (*Nodes) XXX_Size Uses

func (m *Nodes) XXX_Size() int

func (*Nodes) XXX_Unmarshal Uses

func (m *Nodes) XXX_Unmarshal(b []byte) error

type Opt Uses

type Opt struct {

    // DedupValue remove branches that leads to a record with the same value as the previous one.
    // By default it is true.
    //
    // Reducing leaves with the same value is a effective way to optimize index.
    // E.g., in memory an application stores indexes of 3 keys:
    // a,b,c, pointing to disk offset 4096, 4098, 4099
    // In this case the gaps between a,b,c are very small and with the assumption that one disk IO reading less than 4KB costs the same time.
    // Thus the index does not need to store the exact offset. But instead, only the 4KB-aligned index.
    // Thus a,b,c have the same offset: offsetOf(x) & ~(4096-1)
    // With this assumption, the in memory index will be significantly reduced.
    // by only recording the index of a.
    // Because we know that a<b<c, and offsetOf(c) - offsetOf(a) < 4KB
    //
    // Since 0.5.10
    DedupValue *bool

    // InnerPrefix tells SlimTrie to store text on a trie branch to inner
    // node(not to leaf node), instead of storing only branch length.
    // With this option SlimTrie costs more space but reduces false positive
    // rate.
    //
    // Default false.
    //
    // Since 0.5.10
    InnerPrefix *bool

    // LeafPrefix tells SlimTrie to store text on a branch to leaf node.
    // With this option SlimTrie costs more space but reduces false positive
    // rate.
    //
    // Default false.
    //
    // Since 0.5.10
    LeafPrefix *bool

    // Complete tells SlimTrie to store complete keys content.
    // This option implies "InnerPrefix" and "LeafPrefix".
    // With this option there is no false positive and SlimTrie works just like
    // a static key-value map.
    //
    // Default false.
    //
    // Since 0.5.10
    Complete *bool
}

Opt specifies options for creating a SlimTrie.

By default SlimTrie remove unnecessary information for locating a PRESENT key, such as trie branch content. And it introduces false positives. In this case it is the responsibility of upper level to ensure whether a query result is absolutely correct.

But SlimTrie can also store complete key information thus let it always returns correct query result, without any false positive.

Since 0.5.10

type SlimTrie Uses

type SlimTrie struct {
    // contains filtered or unexported fields
}

SlimTrie is a space efficient Trie index.

The space overhead is about 14 bits per key and is irrelevant to key length.

It does not store full key information, but only just enough info for locating a record. That's why an end user must re-validate the record key after reading it from other storage.

It stores three parts of information in three SlimArray:

`Children` stores node branches and children position.

Since 0.2.0

func NewSlimTrie Uses

func NewSlimTrie(e encode.Encoder, keys []string, values interface{}, opts ...Opt) (*SlimTrie, error)

NewSlimTrie create an SlimTrie. Argument e implements a encode.Encoder to convert user data to serialized bytes and back. Leave it nil if element in values are size fixed type and you do not really care about performance.

int is not of fixed size.
struct { X int64; Y int32; } hax fixed size.

Since 0.2.0

func (*SlimTrie) Get Uses

func (st *SlimTrie) Get(key string) (interface{}, bool)

Get the value of the specified key from SlimTrie.

If the key exist in SlimTrie, it returns the correct value. If the key does NOT exist in SlimTrie, it could also return some value.

Because SlimTrie is a "index" but not a "kv-map", it does not stores complete info of all keys. SlimTrie tell you "WHERE IT POSSIBLY BE", rather than "IT IS JUST THERE".

Since 0.2.0

func (*SlimTrie) GetI16 Uses

func (st *SlimTrie) GetI16(key string) (int16, bool)

GetI16 is same as Get() except it is optimized for int16.

Since 0.5.10

func (*SlimTrie) GetI32 Uses

func (st *SlimTrie) GetI32(key string) (int32, bool)

GetI32 is same as Get() except it is optimized for int32.

Since 0.5.10

func (*SlimTrie) GetI64 Uses

func (st *SlimTrie) GetI64(key string) (int64, bool)

GetI64 is same as Get() except it is optimized for int64.

Since 0.5.10

func (*SlimTrie) GetI8 Uses

func (st *SlimTrie) GetI8(key string) (int8, bool)

GetI8 is same as Get() except it is optimized for int8.

Since 0.5.10

func (*SlimTrie) GetID Uses

func (st *SlimTrie) GetID(key string) int32

GetID looks up for key and return the node id. It should only be used to create a user-defined, type specific SlimTrie.

Since 0.5.10

func (*SlimTrie) GetVersion Uses

func (st *SlimTrie) GetVersion() string

func (*SlimTrie) Marshal Uses

func (st *SlimTrie) Marshal() ([]byte, error)

Marshal serializes it to byte stream.

Since 0.4.3

func (*SlimTrie) ProtoMessage Uses

func (st *SlimTrie) ProtoMessage()

ProtoMessage implements proto.Message

Since 0.4.3

func (*SlimTrie) RangeGet Uses

func (st *SlimTrie) RangeGet(key string) (interface{}, bool)

RangeGet look for a range that contains a key in SlimTrie.

A range that contains a key means range-start <= key <= range-end.

It returns the value the range maps to, and a bool indicate if a range is found.

A positive return value does not mean the range absolutely exists, which in this case, is a "false positive".

Since 0.4.3

Code:

// To index a map of key range to value with SlimTrie is very simple:
//
// Gives a set of key the same value, and use RangeGet() instead of Get().
// SlimTrie does not store branches for adjacent leaves with the same value.

keys := []string{
    "abc",
    "abcd",

    "bc",

    "bcd",
    "bce",
}
values := []int{
    1, 1,
    2,
    3, 3,
}
st, err := NewSlimTrie(encode.Int{}, keys, values)
if err != nil {
    panic(err)
}

cases := []struct {
    key string
    msg string
}{
    {"ab", "out of range"},

    {"abc", "in range"},
    {"abc1", "FALSE POSITIVE"},
    {"abc2", "FALSE POSITIVE"},
    {"abcd", "in range"},

    {"abcde", "FALSE POSITIVE: a suffix of abcd"},

    {"acc", "FALSE POSITIVE"},

    {"bc", "in single key range [bc]"},
    {"bc1", "FALSE POSITIVE"},

    {"bcd1", "FALSE POSITIVE"},

    // {"def", "FALSE POSITIVE"},
}

for _, c := range cases {
    v, found := st.RangeGet(c.key)
    fmt.Printf("%-10s %-5v %-5t: %s\n", c.key, v, found, c.msg)
}

func (*SlimTrie) Reset Uses

func (st *SlimTrie) Reset()

Reset implements proto.Message

Since 0.4.3

func (*SlimTrie) Search Uses

func (st *SlimTrie) Search(key string) (lVal, eqVal, rVal interface{})

Search for a key in SlimTrie.

It returns values of 3 values: The value of greatest key < `key`. It is nil if `key` is the smallest. The value of `key`. It is nil if there is not a matching. The value of smallest key > `key`. It is nil if `key` is the greatest.

A non-nil return value does not mean the `key` exists. An in-existent `key` also could matches partial info stored in SlimTrie.

Since 0.2.0

func (*SlimTrie) String Uses

func (st *SlimTrie) String() string

String implements proto.Message and output human readable multiline representation.

A node is in form of

<income-label>-><node-id>+<step>*<fanout-count>=<value>

E.g.:

000->#000+4*3
         001->#001+4*2
                  003->#004+8=0
                           006->#007+4=1
                  004->#005+1=2
                           006->#008+4=3
         002->#002+4=4
                  006->#006+4=5
                           006->#009+8=6
         003->#003+8=7`[1:]

Since 0.4.3

func (*SlimTrie) Unmarshal Uses

func (st *SlimTrie) Unmarshal(buf []byte) error

Unmarshal a SlimTrie from a byte stream.

Since 0.4.3

type VLenArray Uses

type VLenArray struct {
    // N is the max set bit index plus 1.
    //
    // Since 0.5.10
    N   int32 `protobuf:"varint,10,opt,name=N,proto3" json:"N,omitempty"`
    // EltCnt is the number of present elts.
    //
    // Since 0.5.10
    EltCnt int32 `protobuf:"varint,11,opt,name=EltCnt,proto3" json:"EltCnt,omitempty"`
    // PresenceBM set 1 at the i-th bit if the i-th elt presents.
    //
    // Since 0.5.10
    PresenceBM *Bitmap `protobuf:"bytes,61,opt,name=PresenceBM,proto3" json:"PresenceBM,omitempty"`
    // PositionBM is a bitmap of starting position of every present elt.
    //
    // Since 0.5.10
    PositionBM *Bitmap `protobuf:"bytes,20,opt,name=PositionBM,proto3" json:"PositionBM,omitempty"`
    // FixedSize is set to elt size in Bytes field, if all the elts have equal sizes.
    //
    // Since 0.5.10
    FixedSize int32 `protobuf:"varint,23,opt,name=FixedSize,proto3" json:"FixedSize,omitempty"`
    // Bytes is the content in bytes
    //
    // Since 0.5.10
    Bytes                []byte   `protobuf:"bytes,30,opt,name=Bytes,proto3" json:"Bytes,omitempty"`
    XXX_NoUnkeyedLiteral struct{} `json:"-"`
    XXX_unrecognized     []byte   `json:"-"`
    XXX_sizecache        int32    `json:"-"`
}

VLenArray stores var-length []byte elts.

Since 0.5.10

func (*VLenArray) Descriptor Uses

func (*VLenArray) Descriptor() ([]byte, []int)

func (*VLenArray) GetBytes Uses

func (m *VLenArray) GetBytes() []byte

func (*VLenArray) GetEltCnt Uses

func (m *VLenArray) GetEltCnt() int32

func (*VLenArray) GetFixedSize Uses

func (m *VLenArray) GetFixedSize() int32

func (*VLenArray) GetN Uses

func (m *VLenArray) GetN() int32

func (*VLenArray) GetPositionBM Uses

func (m *VLenArray) GetPositionBM() *Bitmap

func (*VLenArray) GetPresenceBM Uses

func (m *VLenArray) GetPresenceBM() *Bitmap

func (*VLenArray) ProtoMessage Uses

func (*VLenArray) ProtoMessage()

func (*VLenArray) Reset Uses

func (m *VLenArray) Reset()

func (*VLenArray) String Uses

func (m *VLenArray) String() string

func (*VLenArray) XXX_DiscardUnknown Uses

func (m *VLenArray) XXX_DiscardUnknown()

func (*VLenArray) XXX_Marshal Uses

func (m *VLenArray) XXX_Marshal(b []byte, deterministic bool) ([]byte, error)

func (*VLenArray) XXX_Merge Uses

func (dst *VLenArray) XXX_Merge(src proto.Message)

func (*VLenArray) XXX_Size Uses

func (m *VLenArray) XXX_Size() int

func (*VLenArray) XXX_Unmarshal Uses

func (m *VLenArray) XXX_Unmarshal(b []byte) error

Directories

PathSynopsis
benchmarkPackage benchmark provides internally used benchmark support
report

Package trie imports 20 packages (graph) and is imported by 2 packages. Updated 2020-11-30. Refresh now. Tools for package owners.