bpe

package

v0.2.2 Latest Latest Go to latest Published: Jun 25, 2023 License: Apache-2.0 Imports: 21 Imported by: 7

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/sugarme/tokenizer

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
type BPE
- func DefaultBPE() (*BPE, error)
- func New(vocab model.Vocab, mergesData []string, dropout *float32, unkToken *string, ...) (*BPE, error)
- func NewBPE(vocab model.Vocab, merges Merges) *BPE
- func NewBpeFromFiles(vocab, merges string) (*BPE, error)
- func (b *BPE) ClearCache()
- func (b *BPE) FromFiles(vocab string, merges string) *BpeBuilder
- func (b *BPE) GetContinuingSubwordPrfix() *string
- func (b *BPE) GetUnkToken() *string
- func (b BPE) GetVocab() map[string]int
- func (b BPE) GetVocabSize() int
- func (b BPE) IdToToken(id int) (token string, ok bool)
- func (b *BPE) MergeWord(w string) *Word
- func (b *BPE) ReadFiles(vocabF string, mergesF string) (*model.Vocab, *Merges, error)
- func (b BPE) Save(dir string, nameOpt ...string) error
- func (b BPE) TokenToId(token string) (id int, ok bool)
- func (b BPE) Tokenize(sequence string) (retVal []tokenizer.Token, err error)
- func (b BPE) TokenizeWithCache(sequence string) (retVal []tokenizer.Token)
- func (b *BPE) WordToTokens(word Word) []tokenizer.Token
type BpeBuilder
- func NewBpeBuilder() *BpeBuilder
- func (bb *BpeBuilder) Build() (*BPE, error)
- func (bb *BpeBuilder) CacheCapacity(capacity int)
- func (bb *BpeBuilder) ContinuingSubwordPrefix(continuingSubwordPrefix string)
- func (bb *BpeBuilder) Dropout(dropout float32)
- func (bb *BpeBuilder) EndOfWordSuffix(endOfWordSuffix string)
- func (bb *BpeBuilder) Files(vocab string, merges string)
- func (bb *BpeBuilder) UnkToken(unkTok string)
- func (bb *BpeBuilder) VocabAndMerges(vocab model.Vocab, merges Merges)
type BpeTrainer
- func NewBpeTrainer(minFreq int, vocabSize int) *BpeTrainer
- func (bt *BpeTrainer) ProcessTokens(words map[string]int, tokens []string)
- func (bt *BpeTrainer) Train(wordCounts map[string]int) (tokenizer.Model, []tokenizer.AddedToken)
- func (bt *BpeTrainer) WithProgressBar() bool
type BpeTrainerBuilder
- func NewBPETrainerBuilder() *BpeTrainerBuilder
- func (btb *BpeTrainerBuilder) Build() *BpeTrainer
- func (btb *BpeTrainerBuilder) ContinuingSubwordPrefix(prefix string)
- func (btb *BpeTrainerBuilder) EndOfWordSuffix(suffix string)
- func (btb *BpeTrainerBuilder) InitialAlphabet(alphabet CharSet)
- func (btb *BpeTrainerBuilder) LimitAlphabet(limit int)
- func (btb *BpeTrainerBuilder) MinFrequency(freq int)
- func (btb *BpeTrainerBuilder) ShowProgress(show bool)
- func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken)
- func (btb *BpeTrainerBuilder) VocabSize(size int)
type Cache
- func NewCache(capacity int) *Cache
- func (c *Cache) Clear()
- func (c *Cache) Fresh()
- func (c *Cache) GetValues(keys []string) []Word
- func (c *Cache) SetValues(values []CacheItem)
type CacheItem
type CharSet
type Config
type Merge
- func (m *Merge) Cmp(other *Merge) Ordering
- func (m *Merge) Eq(other *Merge) bool
- func (m *Merge) PartialCmp(other *Merge) (Ordering, error)
type Merges
- func CreateMerges(vocab map[string]int, mergesData []string) (*Merges, error)
type Ordering
type Pair
type PairVal
type Symbol
- func (s *Symbol) MergeWith(other *Symbol, newC int)
type Symbols
- func (ss *Symbols) Insert(s Symbol, i int) error
- func (ss *Symbols) Remove(i int) error
type TConfig
type TMerge
type UintSet
type WChange
type Word
- func NewWord() *Word
- func (w *Word) Add(c int, byteLen int)
- func (w *Word) GetChars() []int
- func (w *Word) GetOffsets() [][]int
- func (w *Word) Merge(c1, c2, replacement int) ([]WChange, error)
- func (w *Word) MergeAll(merges map[Pair]PairVal, dropoutOpt ...float32)

Constants ¶

View Source

const DefaultCacheCapacity int = 10000

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type BPE ¶

type BPE struct {
	// Vocab is the vocabulary that assigns a number to each token.
	Vocab *model.Vocab

	// VocabR is Reversed vocabulary, to rebuild sentences.
	VocabR *model.VocabR

	// Merges contains the mapping between Pairs and their (rank, newId).
	Merges *Merges

	// Cache contains the cache for optimizing the encoding step.
	// It is a `map[string]Word`
	Cache *Cache

	// Dropout probability for merges.
	// 0 = no dropout is the default.
	// At 1.0, tokenization will perform no merges, so the result will just be characters.
	Dropout *float32

	// UnkToken is the unknown token to be used when we encounter an unknown char
	UnkToken *string

	// ContinuingSubwordPrefix is an optional prefix
	// to use on any subword that exists only behind another one
	ContinuingSubwordPrefix *string

	// EndOfWordSuffix is an optional suffix
	// to characterize an end-of-word subword
	EndOfWordSuffix *string
}

BPE is a struct for the byte pair encoding model. Ref: https://www.aclweb.org/anthology/P16-1162/

func DefaultBPE ¶

func DefaultBPE() (*BPE, error)

func New ¶ added in v0.2.0

func New(

	vocab model.Vocab,
	mergesData []string,
	dropout *float32,
	unkToken *string,
	continuingSubwordPrefix *string,
	endOfWordSuffix *string,
) (*BPE, error)

New creates a new BPE model.

func NewBPE ¶

func NewBPE(vocab model.Vocab, merges Merges) *BPE

NewBPE creates a new BPE model with the given vocab and merges.

func NewBpeFromFiles ¶

func NewBpeFromFiles(vocab, merges string) (*BPE, error)

NewBpeFromFiles creates a BPE model from vocab and merges files.

func (*BPE) ClearCache ¶

func (b *BPE) ClearCache()

ClearCache resets the cache.

func (*BPE) FromFiles ¶

func (b *BPE) FromFiles(vocab string, merges string) *BpeBuilder

FromFiles creates a `BpeBuilder` from vocab and merges files.

func (*BPE) GetContinuingSubwordPrfix ¶

func (b *BPE) GetContinuingSubwordPrfix() *string

GetContinuingSubwordPrfix returns the continuing subword prefix. (Note: the method name is spelled `Prfix`, not `Prefix`, in the exported API.)

func (*BPE) GetUnkToken ¶

func (b *BPE) GetUnkToken() *string

GetUnkToken returns `unk` token

func (BPE) GetVocab ¶

func (b BPE) GetVocab() map[string]int

GetVocab returns the BPE vocab. (The source comment also contains what looks like a leftover signature: `func (b *BPE) GetVocab() *model.Vocab {`.)

func (BPE) GetVocabSize ¶

func (b BPE) GetVocabSize() int

func (BPE) IdToToken ¶

func (b BPE) IdToToken(id int) (token string, ok bool)

func (*BPE) MergeWord ¶

func (b *BPE) MergeWord(w string) *Word

MergeWord merges given word

func (*BPE) ReadFiles ¶

func (b *BPE) ReadFiles(vocabF string, mergesF string) (*model.Vocab, *Merges, error)

ReadFiles reads the given files to extract vocab and merges.

func (BPE) Save ¶

func (b BPE) Save(dir string, nameOpt ...string) error

func (BPE) TokenToId ¶

func (b BPE) TokenToId(token string) (id int, ok bool)

func (BPE) Tokenize ¶

func (b BPE) Tokenize(sequence string) (retVal []tokenizer.Token, err error)

Tokenize tokenizes sentences into tokens. NOTE: sentence is `[]PreToken struct{Value string, Offsets Offsets}`.

func (BPE) TokenizeWithCache ¶

func (b BPE) TokenizeWithCache(sequence string) (retVal []tokenizer.Token)

func (*BPE) WordToTokens ¶

func (b *BPE) WordToTokens(word Word) []tokenizer.Token

WordToTokens slices word to tokens

type BpeBuilder ¶

type BpeBuilder struct {
	// contains filtered or unexported fields
}

BpeBuilder can be used to create a `BPE` model with a custom configuration

func NewBpeBuilder ¶

func NewBpeBuilder() *BpeBuilder

func (*BpeBuilder) Build ¶

func (bb *BpeBuilder) Build() (*BPE, error)

Build returns a `BPE` model that uses the BpeBuilder configuration

func (*BpeBuilder) CacheCapacity ¶

func (bb *BpeBuilder) CacheCapacity(capacity int)

CacheCapacity sets the cache capacity. Disable cache by setting it to 0

func (*BpeBuilder) ContinuingSubwordPrefix ¶

func (bb *BpeBuilder) ContinuingSubwordPrefix(continuingSubwordPrefix string)

ContinuingSubwordPrefix sets the `continuingSubwordPrefix` option.

func (*BpeBuilder) Dropout ¶

func (bb *BpeBuilder) Dropout(dropout float32)

Dropout sets the dropout for the model. Ref: https://arxiv.org/abs/1910.13267

func (*BpeBuilder) EndOfWordSuffix ¶

func (bb *BpeBuilder) EndOfWordSuffix(endOfWordSuffix string)

EndOfWordSuffix sets the `endOfWordSuffix` option.

func (*BpeBuilder) Files ¶

func (bb *BpeBuilder) Files(vocab string, merges string)

Files sets input files for the model

func (*BpeBuilder) UnkToken ¶

func (bb *BpeBuilder) UnkToken(unkTok string)

UnkToken sets the `UNK` token for the vocab.

func (*BpeBuilder) VocabAndMerges ¶

func (bb *BpeBuilder) VocabAndMerges(vocab model.Vocab, merges Merges)

VocabAndMerges sets vocab and merges

type BpeTrainer ¶

type BpeTrainer struct {
	// The minimum frequency a pair must have to produce a merge operation
	MinFrequency int
	// The target vocabulary size
	VocabSize int
	// Whether to show progress while training
	ShowProgress bool
	// A list of special tokens that the model should know of
	SpecialTokens []tokenizer.AddedToken
	// Whether to limit the number of initial tokens that can be kept before
	// computing merges
	LimitAlphabet *int // TODO: replace with int and `None` value = -1
	// The initial alphabet we want absolutely to include. This allows to cover
	// some characters that are not necessarily in the training set
	InitialAlphabet CharSet
	// An optional prefix to use on any subword that exists only behind another one
	ContinuingSubwordPrefix *string
	// An optional suffix to characterize an end-of-word subword
	EndOfWordSuffix *string
}

BpeTrainer is in charge of training a `BPE` model from a mapping of words to word counts.

Example:

wordCounts := map[string]int{
	"Hello": 1,
	"World": 1,
}

trainer := NewBpeTrainer(minFreq, vocabSize)
model, specialTokens := trainer.Train(wordCounts)

func NewBpeTrainer ¶

func NewBpeTrainer(minFreq int, vocabSize int) *BpeTrainer

func (*BpeTrainer) ProcessTokens ¶

func (bt *BpeTrainer) ProcessTokens(words map[string]int, tokens []string)

ProcessTokens processes a bunch of tokens, counting them.

func (*BpeTrainer) Train ¶

func (bt *BpeTrainer) Train(wordCounts map[string]int) (tokenizer.Model, []tokenizer.AddedToken)

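Train trains a BPE model on the input wordCounts and returns: 1. the BPE model; 2. merges. NOTE: the declared second return type is `[]tokenizer.AddedToken`, so the "merges" description may be outdated — confirm against the implementation. (The source comment also contains what looks like a leftover signature: `func (bt *BpeTrainer) Train(wordCounts map[string]int) (BPE, []string) {`.)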

func (*BpeTrainer) WithProgressBar ¶

func (bt *BpeTrainer) WithProgressBar() bool

type BpeTrainerBuilder ¶

type BpeTrainerBuilder struct {
	Config *TConfig
}

BpeTrainerBuilder can be used to create a `BpeTrainer` with a custom configuration

func NewBPETrainerBuilder ¶

func NewBPETrainerBuilder() *BpeTrainerBuilder

func (*BpeTrainerBuilder) Build ¶

func (btb *BpeTrainerBuilder) Build() *BpeTrainer

Build constructs the final BpeTrainer

func (*BpeTrainerBuilder) ContinuingSubwordPrefix ¶

func (btb *BpeTrainerBuilder) ContinuingSubwordPrefix(prefix string)

ContinuingSubwordPrefix sets the ContinuingSubwordPrefix.

func (*BpeTrainerBuilder) EndOfWordSuffix ¶

func (btb *BpeTrainerBuilder) EndOfWordSuffix(suffix string)

EndOfWordSuffix sets the EndOfWordSuffix.

func (*BpeTrainerBuilder) InitialAlphabet ¶

func (btb *BpeTrainerBuilder) InitialAlphabet(alphabet CharSet)

InitialAlphabet sets the initial alphabet.

func (*BpeTrainerBuilder) LimitAlphabet ¶

func (btb *BpeTrainerBuilder) LimitAlphabet(limit int)

LimitAlphabet sets the alphabet limit.

func (*BpeTrainerBuilder) MinFrequency ¶

func (btb *BpeTrainerBuilder) MinFrequency(freq int)

MinFrequency sets the minimum frequency.

func (*BpeTrainerBuilder) ShowProgress ¶

func (btb *BpeTrainerBuilder) ShowProgress(show bool)

ShowProgress sets whether to show progress.

func (*BpeTrainerBuilder) SpecialTokens ¶

func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken)

SpecialTokens sets the special tokens.

func (*BpeTrainerBuilder) VocabSize ¶

func (btb *BpeTrainerBuilder) VocabSize(size int)

VocabSize sets the vocabulary size.

type Cache ¶

type Cache struct {
	Capacity int
	// contains filtered or unexported fields
}

Cache is a map with a read-write mutex included, used to hold a map of `word` strings. E.g. https://tour.golang.org/concurrency/9. NOTE: can we use the sync.Map struct instead?

func NewCache ¶

func NewCache(capacity int) *Cache

NewCache creates an empty Cache with a specified capacity.

func (*Cache) Clear ¶

func (c *Cache) Clear()

Clear clears the cache

func (*Cache) Fresh ¶

func (c *Cache) Fresh()

Fresh creates a fresh `Cache` with the same configuration.

func (*Cache) GetValues ¶

func (c *Cache) GetValues(keys []string) []Word

GetValues returns slices of values associated with input keys

func (*Cache) SetValues ¶

func (c *Cache) SetValues(values []CacheItem)

type CacheItem ¶

type CacheItem struct {
	// Key   interface{}
	// Value interface{}
	Key   string
	Value Word // `word` string
}

type CharSet ¶

type CharSet map[string]struct{}

type Config ¶

type Config struct {
	// contains filtered or unexported fields
}

type Merge ¶

type Merge struct {
	Pos   int
	Rank  int
	NewId int
	Time  time.Time
}

func (*Merge) Cmp ¶

func (m *Merge) Cmp(other *Merge) Ordering

func (*Merge) Eq ¶

func (m *Merge) Eq(other *Merge) bool

NOTE: Should we implement comparing methods? - Eq - PartialCmp - Cmp

func (*Merge) PartialCmp ¶

func (m *Merge) PartialCmp(other *Merge) (Ordering, error)

type Merges ¶

type Merges map[Pair]PairVal

func CreateMerges ¶ added in v0.2.0

func CreateMerges(vocab map[string]int, mergesData []string) (*Merges, error)

type Ordering ¶

type Ordering int

Ordering is an enum of Less, Equal, and Greater.

const (
	Less    Ordering = -1
	Equal   Ordering = 0
	Greater Ordering = 1
)

type Pair ¶

type Pair struct {
	C1 int
	C2 int
}

type PairVal ¶

type PairVal struct {
	Rank  int
	NewId int
}

PairVal holds pair's rank and NewId

type Symbol ¶

type Symbol struct {
	C    int
	Prev int
	Next int
	Len  int
}

func (*Symbol) MergeWith ¶

func (s *Symbol) MergeWith(other *Symbol, newC int)

type Symbols ¶

type Symbols []Symbol

Symbols is a slice of Symbol, with some slice methods to manipulate it.

func (*Symbols) Insert ¶

func (ss *Symbols) Insert(s Symbol, i int) error

Insert inserts a symbol to the slice at `i` index point

func (*Symbols) Remove ¶

func (ss *Symbols) Remove(i int) error

Remove removes a symbol from the slice at `i` index point

type TConfig ¶

type TConfig struct {
	MinFrequency            int
	VocabSize               int
	ShowProgress            bool
	SpecialTokens           []tokenizer.AddedToken
	LimitAlphabet           *int
	InitialAlphabet         CharSet
	ContinuingSubwordPrefix *string
	EndOfWordSuffix         *string
}

NOTE: there exists `Config`

type TMerge ¶

type TMerge struct {
	Pair  Pair
	Count int
	Pos   UintSet
	Time  time.Time
}

type UintSet ¶

type UintSet map[int]struct{}

UintSet is a map with no value. Ref: https://stackoverflow.com/questions/57620170

type WChange ¶

type WChange struct {
	C1     int
	C2     int
	Change int
}

type Word ¶

type Word struct {
	Symbols Symbols
}

func NewWord ¶

func NewWord() *Word

func (*Word) Add ¶

func (w *Word) Add(c int, byteLen int)

func (*Word) GetChars ¶

func (w *Word) GetChars() []int

func (*Word) GetOffsets ¶

func (w *Word) GetOffsets() [][]int

func (*Word) Merge ¶

func (w *Word) Merge(c1, c2, replacement int) ([]WChange, error)

Merge finds any pairs of (c1, c2) and removes in place. It also maps changes depending on the position of the pair in word.

func (*Word) MergeAll ¶

func (w *Word) MergeAll(merges map[Pair]PairVal, dropoutOpt ...float32)

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL