gpt_bpe

package module
v0.0.0-...-edd9879 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 10, 2024 License: LGPL-2.1 Imports: 16 Imported by: 9

README

gpt-bpe

An implementation of the GPT-2 Byte Pair Encoding (BPE) encoder/decoder in Go. It is generally very fast; the main bottleneck is the regex engine used for whitespace separation.

Documentation

Index

Constants

View Source
const BPE_LRU_SZ = 65536
View Source
const PUNC_REGEX = "\\p{L}[.!?;]\\p{L}"
View Source
const REGEX_ERROR = "gpt_bpe: Fatal error compiling regular expression: %v"
View Source
const RUNEBUF_SZ = 16384
View Source
const SPLIT_REGEX = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L" +
	"}+| ?\\p{N}+| ?[^\\s\\p{L" +
	"}\\p{N}]+|\\s+(\\S){0}|\\s+"
View Source
const (
	TokenSize = 2
)
View Source
const VOCAB_ID_CLIP = "clip-tokenizer"
View Source
const VOCAB_ID_GPT2 = "gpt2-tokenizer"
View Source
const VOCAB_ID_LLAMA = "llama-tokenizer"
View Source
const VOCAB_ID_NERDSTASH_V1 = "nerdstash_v1-tokenizer"
View Source
const VOCAB_ID_NERDSTASH_V2 = "nerdstash_v2-tokenizer"
View Source
const VOCAB_ID_PILE = "pile-tokenizer"
View Source
const WORDCHAN_SZ = 4096

Variables

This section is empty.

Functions

This section is empty.

Types

type BGERank

type BGERank struct {
	// contains filtered or unexported fields
}

type BGERanks

type BGERanks []BGERank

func (BGERanks) Len

func (bs BGERanks) Len() int

func (BGERanks) Less

func (bs BGERanks) Less(i, j int) bool

func (BGERanks) Swap

func (bs BGERanks) Swap(i, j int)

type GPTEncoder

type GPTEncoder struct {
	Encoder      map[string]Token
	Decoder      map[Token][]byte
	BpeRanks     map[GPTPair]float64
	TokenMerges  map[TokenPair]Token
	BytesEncoder *map[byte]Token

	Specials     map[string]Tokens
	SpecialsTree *RuneNode
	Cache        *lru.ARCCache
	PuncRunes    []rune
	Normalizer   *strings.Replacer
	DecodeExtra  *strings.Replacer
	BosToken     Token
	EosToken     Token
	PadToken     Token

	LruHits         int
	LruMisses       int
	LruEvictions    int
	LruSize         int
	SplitterThreads int
	VocabId         string
	// contains filtered or unexported fields
}

func NewCLIPEncoder

func NewCLIPEncoder() GPTEncoder

func NewEncoder

func NewEncoder(vocabId string) (*GPTEncoder, error)

NewEncoder returns a GPTEncoder with the tokenizer data loaded for the given vocabulary id.

func NewGPT2Encoder

func NewGPT2Encoder() GPTEncoder

func NewLlama2Encoder

func NewLlama2Encoder() GPTEncoder

func NewNerdstashV1Encoder

func NewNerdstashV1Encoder() GPTEncoder

func NewNerdstashV2Encoder

func NewNerdstashV2Encoder() GPTEncoder

func NewPileEncoder

func NewPileEncoder() GPTEncoder

func (GPTEncoder) AlignAndSizeTokens

func (encoder GPTEncoder) AlignAndSizeTokens(tokens *Tokens,
	desiredLength int) (alignedTokens Tokens, endAt int)

func (*GPTEncoder) Decode

func (encoder *GPTEncoder) Decode(encoded *Tokens) (text string)

Decode decodes Tokens back into a string, handling Unicode.

func (*GPTEncoder) DecodeBuffer

func (encoder *GPTEncoder) DecodeBuffer(encoded *[]byte) (text string)

DecodeBuffer decodes Tokens from a byte array into a string.

func (*GPTEncoder) Encode

func (encoder *GPTEncoder) Encode(text *string) *Tokens

Encode encodes a string into a sequence of tokens.

func (*GPTEncoder) EncodeBuffer

func (encoder *GPTEncoder) EncodeBuffer(buffer *[]byte) *[]byte

EncodeBuffer takes a byte array and encodes it into Tokens in another byte array.

func (*GPTEncoder) EncodeReader

func (encoder *GPTEncoder) EncodeReader(reader io.RuneReader) *Tokens

func (*GPTEncoder) Get

func (encoder *GPTEncoder) Get(text string) *Token

Get looks up text in the Encoder and returns its Token representation. If the text is not found, nil is returned.

func (*GPTEncoder) SplitWords

func (encoder *GPTEncoder) SplitWords(text *string) *[]string

SplitWords splits a string into words according to BPE Encoder rules.

func (*GPTEncoder) StreamingEncode

func (encoder *GPTEncoder) StreamingEncode(reader io.RuneReader) func(int) *Tokens

StreamingEncode is a streaming Encoder. It takes an io.RuneReader and returns an iterator function that will return Tokens on each call.

func (*GPTEncoder) ToBPE

func (encoder *GPTEncoder) ToBPE(text string) Tokens

ToBPE takes pre-split text, performs bigram ranking and merges, and returns the resulting Tokens.

func (*GPTEncoder) TokensReady

func (encoder *GPTEncoder) TokensReady(tokens *Tokens) bool

TokensReady determines whether the given sequence of Tokens is ready to be serialized to a string, based on whether the sequence will produce valid Unicode runes.

func (GPTEncoder) TrimIncompleteSentence

func (encoder GPTEncoder) TrimIncompleteSentence(tokens *Tokens) (*Tokens, error)

func (GPTEncoder) TrimNewlines

func (encoder GPTEncoder) TrimNewlines(tokens *Tokens, direction TrimDirection,
	limit uint) (*Tokens, error)

func (GPTEncoder) TrimSentences

func (encoder GPTEncoder) TrimSentences(tokens *Tokens, direction TrimDirection,
	limit uint) (*Tokens, error)

func (*GPTEncoder) TrimTokens

func (encoder *GPTEncoder) TrimTokens(tokens *Tokens) (trimmed *Tokens)

TrimTokens trims the given Tokens to tokens that produce valid Unicode.

func (*GPTEncoder) UpdateSpecialsTree

func (encoder *GPTEncoder) UpdateSpecialsTree()

func (*GPTEncoder) WordSplitter

func (encoder *GPTEncoder) WordSplitter(reader io.RuneReader) func() *string

WordSplitter returns an iterator function that reads from an io.RuneReader and splits the input into words. Each invocation of the iterator function returns one word, or nil if there are no more words.

type GPTPair

type GPTPair struct {
	Left  string
	Right string
}

type NextRuneFunc

type NextRuneFunc func() (rune, int, error)

type RuneNode

type RuneNode struct {
	// contains filtered or unexported fields
}

func CreateReplacementsRuneTree

func CreateReplacementsRuneTree(replacements map[string]string) *RuneNode

func CreateRuneTree

func CreateRuneTree(s []string) *RuneNode

CreateRuneTree creates a new rune tree from a slice of strings to match against.

func NewRuneTree

func NewRuneTree() *RuneNode

func (*RuneNode) InsertIntoRuneTree

func (runeTree *RuneNode) InsertIntoRuneTree(s []string)

func (*RuneNode) InsertReplacementsIntoRuneTree

func (runeTree *RuneNode) InsertReplacementsIntoRuneTree(
	replacements map[string]string,
)

func (*RuneNode) String

func (runeTree *RuneNode) String() string

Wrapper

type RuneNodes

type RuneNodes []*RuneNode

type Token

type Token uint16

type TokenPair

type TokenPair struct {
	Left  Token
	Right Token
}

type Tokens

type Tokens []Token

func TokensFromBin

func TokensFromBin(bin *[]byte) *Tokens

func (*Tokens) ToBin

func (tokens *Tokens) ToBin() *[]byte

type TrimDirection

type TrimDirection uint
const (
	TrimTop    TrimDirection = iota
	TrimBottom TrimDirection = iota
	TrimNone   TrimDirection = iota
)

type WordCallback

type WordCallback func(*string)

Directories

Path Synopsis
cmd

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL