Documentation ¶
Index ¶
- Constants
- type BGERank
- type BGERanks
- type GPTEncoder
- func (encoder GPTEncoder) AlignAndSizeTokens(tokens *Tokens, desiredLength int) (alignedTokens Tokens, endAt int)
- func (encoder *GPTEncoder) Decode(encoded *Tokens) (text string)
- func (encoder *GPTEncoder) DecodeBuffer(encoded *[]byte) (text string)
- func (encoder *GPTEncoder) Encode(text *string) *Tokens
- func (encoder *GPTEncoder) EncodeBuffer(buffer *[]byte) *[]byte
- func (encoder *GPTEncoder) EncodeReader(reader io.RuneReader) *Tokens
- func (encoder *GPTEncoder) Get(text string) *Token
- func (encoder *GPTEncoder) SplitWords(text *string) *[]string
- func (encoder *GPTEncoder) StreamingEncode(reader io.RuneReader) func(int) *Tokens
- func (encoder *GPTEncoder) ToBPE(text string) Tokens
- func (encoder *GPTEncoder) TokensReady(tokens *Tokens) bool
- func (encoder GPTEncoder) TrimIncompleteSentence(tokens *Tokens) (*Tokens, error)
- func (encoder GPTEncoder) TrimNewlines(tokens *Tokens, direction TrimDirection, limit uint) (*Tokens, error)
- func (encoder GPTEncoder) TrimSentences(tokens *Tokens, direction TrimDirection, limit uint) (*Tokens, error)
- func (encoder *GPTEncoder) TrimTokens(tokens *Tokens) (trimmed *Tokens)
- func (encoder *GPTEncoder) UpdateSpecialsTree()
- func (encoder *GPTEncoder) WordSplitter(reader io.RuneReader) func() *string
- type GPTPair
- type NextRuneFunc
- type RuneNode
- type RuneNodes
- type Token
- type TokenPair
- type Tokens
- type TrimDirection
- type WordCallback
Constants ¶
const BPE_LRU_SZ = 65536
const PUNC_REGEX = "\\p{L}[.!?;]\\p{L}"
const REGEX_ERROR = "gpt_bpe: Fatal error compiling regular expression: %v"
const RUNEBUF_SZ = 16384
const SPLIT_REGEX = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L" +
"}+| ?\\p{N}+| ?[^\\s\\p{L" +
"}\\p{N}]+|\\s+(\\S){0}|\\s+"
const (
TokenSize = 2
)
const VOCAB_ID_CLIP = "clip-tokenizer"
const VOCAB_ID_GPT2 = "gpt2-tokenizer"
const VOCAB_ID_LLAMA = "llama-tokenizer"
const VOCAB_ID_NERDSTASH_V1 = "nerdstash_v1-tokenizer"
const VOCAB_ID_NERDSTASH_V2 = "nerdstash_v2-tokenizer"
const VOCAB_ID_PILE = "pile-tokenizer"
const WORDCHAN_SZ = 4096
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type GPTEncoder ¶
type GPTEncoder struct {
	Encoder         map[string]Token
	Decoder         map[Token][]byte
	BpeRanks        map[GPTPair]float64
	TokenMerges     map[TokenPair]Token
	BytesEncoder    *map[byte]Token
	Specials        map[string]Tokens
	SpecialsTree    *RuneNode
	Cache           *lru.ARCCache
	PuncRunes       []rune
	Normalizer      *strings.Replacer
	DecodeExtra     *strings.Replacer
	BosToken        Token
	EosToken        Token
	PadToken        Token
	LruHits         int
	LruMisses       int
	LruEvictions    int
	LruSize         int
	SplitterThreads int
	VocabId         string
	// contains filtered or unexported fields
}
func NewCLIPEncoder ¶
func NewCLIPEncoder() GPTEncoder
func NewEncoder ¶
func NewEncoder(vocabId string) (*GPTEncoder, error)
NewEncoder returns a GPTEncoder with the tokenizer data loaded for that vocabulary id.
func NewGPT2Encoder ¶
func NewGPT2Encoder() GPTEncoder
func NewLlama2Encoder ¶
func NewLlama2Encoder() GPTEncoder
func NewNerdstashV1Encoder ¶
func NewNerdstashV1Encoder() GPTEncoder
func NewNerdstashV2Encoder ¶
func NewNerdstashV2Encoder() GPTEncoder
func NewPileEncoder ¶
func NewPileEncoder() GPTEncoder
func (GPTEncoder) AlignAndSizeTokens ¶
func (encoder GPTEncoder) AlignAndSizeTokens(tokens *Tokens, desiredLength int) (alignedTokens Tokens, endAt int)
func (*GPTEncoder) Decode ¶
func (encoder *GPTEncoder) Decode(encoded *Tokens) (text string)
Decode decodes Tokens back into a string, handling Unicode.
func (*GPTEncoder) DecodeBuffer ¶
func (encoder *GPTEncoder) DecodeBuffer(encoded *[]byte) (text string)
DecodeBuffer decodes Tokens from a byte array into a string.
func (*GPTEncoder) Encode ¶
func (encoder *GPTEncoder) Encode(text *string) *Tokens
Encode encodes a string into a sequence of tokens.
func (*GPTEncoder) EncodeBuffer ¶
func (encoder *GPTEncoder) EncodeBuffer(buffer *[]byte) *[]byte
EncodeBuffer takes a byte array and encodes it into Tokens in another byte array.
func (*GPTEncoder) EncodeReader ¶
func (encoder *GPTEncoder) EncodeReader(reader io.RuneReader) *Tokens
func (*GPTEncoder) Get ¶
func (encoder *GPTEncoder) Get(text string) *Token
Get looks up text in the Encoder and returns the Token representation of it. If the text is not found, then nil is returned.
func (*GPTEncoder) SplitWords ¶
func (encoder *GPTEncoder) SplitWords(text *string) *[]string
SplitWords splits a string into words according to BPE Encoder rules.
func (*GPTEncoder) StreamingEncode ¶
func (encoder *GPTEncoder) StreamingEncode(reader io.RuneReader) func(int) *Tokens
StreamingEncode is a streaming Encoder. It takes an io.RuneReader and returns an iterator function that will return Tokens on each call.
func (*GPTEncoder) ToBPE ¶
func (encoder *GPTEncoder) ToBPE(text string) Tokens
ToBPE performs bigram ranking and merges on pre-split text and returns the resulting Tokens.
func (*GPTEncoder) TokensReady ¶
func (encoder *GPTEncoder) TokensReady(tokens *Tokens) bool
TokensReady determines whether the given sequence of Tokens is ready to be serialized to a string, based on whether the sequence will produce valid Unicode runes.
func (GPTEncoder) TrimIncompleteSentence ¶
func (encoder GPTEncoder) TrimIncompleteSentence(tokens *Tokens) (*Tokens, error)
func (GPTEncoder) TrimNewlines ¶
func (encoder GPTEncoder) TrimNewlines(tokens *Tokens, direction TrimDirection, limit uint) (*Tokens, error)
func (GPTEncoder) TrimSentences ¶
func (encoder GPTEncoder) TrimSentences(tokens *Tokens, direction TrimDirection, limit uint) (*Tokens, error)
func (*GPTEncoder) TrimTokens ¶
func (encoder *GPTEncoder) TrimTokens(tokens *Tokens) (trimmed *Tokens)
TrimTokens trims the given Tokens to tokens that produce valid Unicode.
func (*GPTEncoder) UpdateSpecialsTree ¶
func (encoder *GPTEncoder) UpdateSpecialsTree()
func (*GPTEncoder) WordSplitter ¶
func (encoder *GPTEncoder) WordSplitter(reader io.RuneReader) func() *string
WordSplitter returns an iterator function that reads from an io.RuneReader and splits the input into words. Each invocation of the iterator function returns one word, or nil if there are no more words.
type NextRuneFunc ¶
type RuneNode ¶
type RuneNode struct {
// contains filtered or unexported fields
}
func CreateRuneTree ¶
CreateRuneTree creates a new rune tree from an array of strings to match against.
func NewRuneTree ¶
func NewRuneTree() *RuneNode
func (*RuneNode) InsertIntoRuneTree ¶
func (*RuneNode) InsertReplacementsIntoRuneTree ¶
type TrimDirection ¶
type TrimDirection uint
const (
	TrimTop    TrimDirection = iota
	TrimBottom TrimDirection = iota
	TrimNone   TrimDirection = iota
)
type WordCallback ¶
type WordCallback func(*string)