Documentation ¶
Index ¶
- Constants
- type BPE
- func (b *BPE) ClearCache()
- func (b *BPE) FromFiles(vocab string, merges string) *BpeBuilder
- func (b *BPE) GetContinuingSubwordPrfix() *string
- func (b *BPE) GetUnkToken() *string
- func (b BPE) GetVocab() map[string]int
- func (b BPE) GetVocabSize() int
- func (b BPE) IdToToken(id int) (token string, ok bool)
- func (b *BPE) MergeWord(w string) *Word
- func (b *BPE) ReadFiles(vocabF string, mergesF string) (*model.Vocab, *Merges, error)
- func (b BPE) Save(dir string, nameOpt ...string) error
- func (b BPE) TokenToId(token string) (id int, ok bool)
- func (b BPE) Tokenize(sequence string) (retVal []tokenizer.Token, err error)
- func (b BPE) TokenizeWithCache(sequence string) (retVal []tokenizer.Token)
- func (b *BPE) WordToTokens(word Word) []tokenizer.Token
- type BpeBuilder
- func (bb *BpeBuilder) Build() (*BPE, error)
- func (bb *BpeBuilder) CacheCapacity(capacity int)
- func (bb *BpeBuilder) ContinuingSubwordPrefix(continuingSubwordPrefix string)
- func (bb *BpeBuilder) Dropout(dropout float32)
- func (bb *BpeBuilder) EndOfWordSuffix(endOfWordSuffix string)
- func (bb *BpeBuilder) Files(vocab string, merges string)
- func (bb *BpeBuilder) UnkToken(unkTok string)
- func (bb *BpeBuilder) VocabAndMerges(vocab model.Vocab, merges Merges)
- type BpeTrainer
- type BpeTrainerBuilder
- func (btb *BpeTrainerBuilder) Build() *BpeTrainer
- func (btb *BpeTrainerBuilder) ContinuingSubwordPrefix(prefix string)
- func (btb *BpeTrainerBuilder) EndOfWordSuffix(suffix string)
- func (btb *BpeTrainerBuilder) InitialAlphabet(alphabet CharSet)
- func (btb *BpeTrainerBuilder) LimitAlphabet(limit int)
- func (btb *BpeTrainerBuilder) MinFrequency(freq int)
- func (btb *BpeTrainerBuilder) ShowProgress(show bool)
- func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken)
- func (btb *BpeTrainerBuilder) VocabSize(size int)
- type Cache
- type CacheItem
- type CharSet
- type Config
- type Merge
- type Merges
- type Ordering
- type Pair
- type PairVal
- type Symbol
- type Symbols
- type TConfig
- type TMerge
- type UintSet
- type WChange
- type Word
Constants ¶
const DefaultCacheCapacity int = 10000
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BPE ¶
type BPE struct {
	// Vocab is the vocabulary that assigns a number to each token.
	Vocab *model.Vocab
	// VocabR is the reversed vocabulary, used to rebuild sentences.
	VocabR *model.VocabR
	// Merges contains the mapping between Pairs and their (rank, newId).
	Merges *Merges
	// Cache contains the cache for optimizing the encoding step.
	// It is a `map[string]Word`
	Cache *Cache
	// Dropout probability for merges.
	// 0 = no dropout is the default.
	// At 1.0, tokenization will perform no merges, so the result will just be characters.
	Dropout *float32
	// UnkToken is the unknown token to be used when we encounter an unknown char
	UnkToken *string
	// ContinuingSubwordPrefix is an optional prefix
	// to use on any subword that exists only behind another one
	ContinuingSubwordPrefix *string
	// EndOfWordSuffix is an optional suffix
	// to characterize an end-of-word subword
	EndOfWordSuffix *string
}
BPE is a struct for the byte pair encoding model. Ref: https://www.aclweb.org/anthology/P16-1162/
func DefaultBPE ¶
func New ¶ added in v0.2.0
func New( vocab model.Vocab, mergesData []string, dropout *float32, unkToken *string, continuingSubwordPrefix *string, endOfWordSuffix *string, ) (*BPE, error)
New creates a new BPE model.
func NewBpeFromFiles ¶
NewBpeFromFiles creates a BPE model from vocab and merges files.
func (*BPE) FromFiles ¶
func (b *BPE) FromFiles(vocab string, merges string) *BpeBuilder
FromFiles creates a `BpeBuilder` from vocab and merges files.
func (*BPE) GetContinuingSubwordPrfix ¶
GetContinuingSubwordPrfix returns the continuing subword prefix.
func (BPE) GetVocabSize ¶
func (BPE) Tokenize ¶
Tokenize tokenizes sentences into tokens. NOTE: sentence is `[]PreToken struct{Value string, Offsets Offsets}`.
func (BPE) TokenizeWithCache ¶
type BpeBuilder ¶
type BpeBuilder struct {
// contains filtered or unexported fields
}
BpeBuilder can be used to create a `BPE` model with a custom configuration
func NewBpeBuilder ¶
func NewBpeBuilder() *BpeBuilder
func (*BpeBuilder) Build ¶
func (bb *BpeBuilder) Build() (*BPE, error)
Build returns a `BPE` model that uses the BpeBuilder configuration
func (*BpeBuilder) CacheCapacity ¶
func (bb *BpeBuilder) CacheCapacity(capacity int)
CacheCapacity sets the cache capacity. Disable cache by setting it to 0
func (*BpeBuilder) ContinuingSubwordPrefix ¶
func (bb *BpeBuilder) ContinuingSubwordPrefix(continuingSubwordPrefix string)
ContinuingSubwordPrefix sets the `continuingSubwordPrefix` option.
func (*BpeBuilder) Dropout ¶
func (bb *BpeBuilder) Dropout(dropout float32)
Dropout sets the dropout for the model. Ref: https://arxiv.org/abs/1910.13267
func (*BpeBuilder) EndOfWordSuffix ¶
func (bb *BpeBuilder) EndOfWordSuffix(endOfWordSuffix string)
EndOfWordSuffix sets the `endOfWordSuffix` option.
func (*BpeBuilder) Files ¶
func (bb *BpeBuilder) Files(vocab string, merges string)
Files sets input files for the model
func (*BpeBuilder) UnkToken ¶
func (bb *BpeBuilder) UnkToken(unkTok string)
UnkToken sets the `UNK` token for the vocab.
func (*BpeBuilder) VocabAndMerges ¶
func (bb *BpeBuilder) VocabAndMerges(vocab model.Vocab, merges Merges)
VocabAndMerges sets vocab and merges
type BpeTrainer ¶
type BpeTrainer struct {
	// The minimum frequency a pair must have to produce a merge operation
	MinFrequency int
	// The target vocabulary size
	VocabSize int
	// Whether to show progress while training
	ShowProgress bool
	// A list of special tokens that the model should know of
	SpecialTokens []tokenizer.AddedToken
	// Whether to limit the number of initial tokens that can be kept before
	// computing merges
	LimitAlphabet *int // TODO: replace with int and `None` value = -1
	// The initial alphabet we absolutely want to include. This allows covering
	// some characters that are not necessarily in the training set
	InitialAlphabet CharSet
	// An optional prefix to use on any subword that exists only behind another one
	ContinuingSubwordPrefix *string
	// An optional suffix to characterize an end-of-word subword
	EndOfWordSuffix *string
}
BpeTrainer is in charge of training a `BPE` model from a mapping of words to word counts.
Example:
wordCounts := map[string]int{"Hello": 1, "World": 1}
trainer := NewBpeTrainer(minFreq, vocabSize)
model, specialTokens := trainer.Train(wordCounts)
func NewBpeTrainer ¶
func NewBpeTrainer(minFreq int, vocabSize int) *BpeTrainer
func (*BpeTrainer) ProcessTokens ¶
func (bt *BpeTrainer) ProcessTokens(words map[string]int, tokens []string)
ProcessTokens processes a batch of tokens, counting them.
func (*BpeTrainer) Train ¶
func (bt *BpeTrainer) Train(wordCounts map[string]int) (tokenizer.Model, []tokenizer.AddedToken)
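Train trains a BPE model on the input wordCounts and returns: 1. the BPE model; 2. the special tokens. (The description previously listed "merges" as the second result, alongside the signature `func (bt *BpeTrainer) Train(wordCounts map[string]int) (BPE, []string)`.)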
func (*BpeTrainer) WithProgressBar ¶
func (bt *BpeTrainer) WithProgressBar() bool
type BpeTrainerBuilder ¶
type BpeTrainerBuilder struct {
Config *TConfig
}
BpeTrainerBuilder can be used to create a `BpeTrainer` with a custom configuration
func NewBPETrainerBuilder ¶
func NewBPETrainerBuilder() *BpeTrainerBuilder
func (*BpeTrainerBuilder) Build ¶
func (btb *BpeTrainerBuilder) Build() *BpeTrainer
Build constructs the final BpeTrainer
func (*BpeTrainerBuilder) ContinuingSubwordPrefix ¶
func (btb *BpeTrainerBuilder) ContinuingSubwordPrefix(prefix string)
ContinuingSubwordPrefix sets the ContinuingSubwordPrefix.
func (*BpeTrainerBuilder) EndOfWordSuffix ¶
func (btb *BpeTrainerBuilder) EndOfWordSuffix(suffix string)
EndOfWordSuffix sets the EndOfWordSuffix.
func (*BpeTrainerBuilder) InitialAlphabet ¶
func (btb *BpeTrainerBuilder) InitialAlphabet(alphabet CharSet)
InitialAlphabet sets the initial alphabet.
func (*BpeTrainerBuilder) LimitAlphabet ¶
func (btb *BpeTrainerBuilder) LimitAlphabet(limit int)
LimitAlphabet sets the alphabet limit.
func (*BpeTrainerBuilder) MinFrequency ¶
func (btb *BpeTrainerBuilder) MinFrequency(freq int)
MinFrequency sets the minimum frequency.
func (*BpeTrainerBuilder) ShowProgress ¶
func (btb *BpeTrainerBuilder) ShowProgress(show bool)
ShowProgress sets whether to show progress.
func (*BpeTrainerBuilder) SpecialTokens ¶
func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken)
SpecialTokens sets the special tokens.
func (*BpeTrainerBuilder) VocabSize ¶
func (btb *BpeTrainerBuilder) VocabSize(size int)
VocabSize sets the vocabulary size.
type Cache ¶
type Cache struct {
	Capacity int
	// contains filtered or unexported fields
}
Cache is a map with a read-write mutex included, used to hold a map of `word` strings. E.g. https://tour.golang.org/concurrency/9 NOTE: can we use the sync.Map struct instead?
func (*Cache) Fresh ¶
func (c *Cache) Fresh()
Fresh creates a fresh `Cache` with the same configuration.
type Symbols ¶
type Symbols []Symbol
Symbols is a slice of Symbol; it has some slice methods to manipulate Symbol structs.
type TConfig ¶
type TConfig struct { MinFrequency int VocabSize int ShowProgress bool SpecialTokens []tokenizer.AddedToken LimitAlphabet *int InitialAlphabet CharSet ContinuingSubwordPrefix *string EndOfWordSuffix *string }
NOTE: there exists `Config`
type UintSet ¶
type UintSet map[int]struct{}
Map with no value Ref: https://stackoverflow.com/questions/57620170