bpe

package
v0.2.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 25, 2023 License: Apache-2.0 Imports: 21 Imported by: 7

Documentation

Index

Constants

View Source
const DefaultCacheCapacity int = 10000

Variables

This section is empty.

Functions

This section is empty.

Types

type BPE

type BPE struct {
	// Vocab is the vocabulary that assigns a number to each token.
	Vocab *model.Vocab

	// VocabR is the reversed vocabulary, used to rebuild sentences.
	VocabR *model.VocabR

	// Merges contains the mapping between Pairs and their (rank, newId).
	Merges *Merges

	// Cache contains the cache for optimizing the encoding step.
	// It is a `map[string]Word`
	Cache *Cache

	// Dropout probability for merges.
	// 0 = no dropout is the default.
	// At 1.0, tokenization will perform no merges, so the result will just be characters.
	Dropout *float32

	// UnkToken is the unknown token to be used when we encounter an unknown char
	UnkToken *string

	// ContinuingSubwordPrefix is an optional prefix
	// to use on any subword that exists only behind another one
	ContinuingSubwordPrefix *string

	// EndOfWordSuffix is an optional suffix
	// to characterize an end-of-word subword
	EndOfWordSuffix *string
}

BPE is a struct for the byte pair encoding model. Ref. https://www.aclweb.org/anthology/P16-1162/

func DefaultBPE

func DefaultBPE() (*BPE, error)

func New added in v0.2.0

func New(

	vocab model.Vocab,
	mergesData []string,
	dropout *float32,
	unkToken *string,
	continuingSubwordPrefix *string,
	endOfWordSuffix *string,
) (*BPE, error)

New creates a new BPE model.

func NewBPE

func NewBPE(vocab model.Vocab, merges Merges) *BPE

NewBPE creates a new BPE model with the given vocab and merges.

func NewBpeFromFiles

func NewBpeFromFiles(vocab, merges string) (*BPE, error)

NewBpeFromFiles creates a BPE model from vocab and merges files.

func (*BPE) ClearCache

func (b *BPE) ClearCache()

ClearCache resets the cache.

func (*BPE) FromFiles

func (b *BPE) FromFiles(vocab string, merges string) *BpeBuilder

FromFiles creates a `BpeBuilder` from vocab and merges files.

func (*BPE) GetContinuingSubwordPrfix

func (b *BPE) GetContinuingSubwordPrfix() *string

GetContinuingSubwordPrfix returns the continuing subword prefix. (Note: the method name is declared as "Prfix", without the "e".)

func (*BPE) GetUnkToken

func (b *BPE) GetUnkToken() *string

GetUnkToken returns `unk` token

func (BPE) GetVocab

func (b BPE) GetVocab() map[string]int

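GetVocab returns the BPE vocab. NOTE: the trailing `func (b *BPE) GetVocab() *model.Vocab {` in this comment is a leftover older signature; the current method returns map[string]int.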

func (BPE) GetVocabSize

func (b BPE) GetVocabSize() int

func (BPE) IdToToken

func (b BPE) IdToToken(id int) (token string, ok bool)

func (*BPE) MergeWord

func (b *BPE) MergeWord(w string) *Word

MergeWord merges given word

func (*BPE) ReadFiles

func (b *BPE) ReadFiles(vocabF string, mergesF string) (*model.Vocab, *Merges, error)

ReadFiles read the given files to extract vocab and merges

func (BPE) Save

func (b BPE) Save(dir string, nameOpt ...string) error

func (BPE) TokenToId

func (b BPE) TokenToId(token string) (id int, ok bool)

func (BPE) Tokenize

func (b BPE) Tokenize(sequence string) (retVal []tokenizer.Token, err error)

Tokenize tokenizes sentences into tokens. NOTE: sentence is []PreToken struct{Value string, Offsets Offsets}

func (BPE) TokenizeWithCache

func (b BPE) TokenizeWithCache(sequence string) (retVal []tokenizer.Token)

func (*BPE) WordToTokens

func (b *BPE) WordToTokens(word Word) []tokenizer.Token

WordToTokens slices word to tokens

type BpeBuilder

type BpeBuilder struct {
	// contains filtered or unexported fields
}

BpeBuilder can be used to create a `BPE` model with a custom configuration

func NewBpeBuilder

func NewBpeBuilder() *BpeBuilder

func (*BpeBuilder) Build

func (bb *BpeBuilder) Build() (*BPE, error)

Build returns a `BPE` model that uses the BpeBuilder configuration

func (*BpeBuilder) CacheCapacity

func (bb *BpeBuilder) CacheCapacity(capacity int)

CacheCapacity sets the cache capacity. Disable cache by setting it to 0

func (*BpeBuilder) ContinuingSubwordPrefix

func (bb *BpeBuilder) ContinuingSubwordPrefix(continuingSubwordPrefix string)

ContinuingSubwordPrefix sets the `continuingSubwordPrefix` option.

func (*BpeBuilder) Dropout

func (bb *BpeBuilder) Dropout(dropout float32)

Dropout sets the dropout for the model. Ref. https://arxiv.org/abs/1910.13267

func (*BpeBuilder) EndOfWordSuffix

func (bb *BpeBuilder) EndOfWordSuffix(endOfWordSuffix string)

EndOfWordSuffix sets the `endOfWordSuffix` option.

func (*BpeBuilder) Files

func (bb *BpeBuilder) Files(vocab string, merges string)

Files sets input files for the model

func (*BpeBuilder) UnkToken

func (bb *BpeBuilder) UnkToken(unkTok string)

UnkToken sets the `UNK` token for the vocab.

func (*BpeBuilder) VocabAndMerges

func (bb *BpeBuilder) VocabAndMerges(vocab model.Vocab, merges Merges)

VocabAndMerges sets vocab and merges

type BpeTrainer

type BpeTrainer struct {
	// The minimum frequency a pair must have to produce a merge operation
	MinFrequency int
	// The target vocabulary size
	VocabSize int
	// Whether to show progress while training
	ShowProgress bool
	// A list of special tokens that the model should know of
	SpecialTokens []tokenizer.AddedToken
	// Whether to limit the number of initial tokens that can be kept before
	// computing merges
	LimitAlphabet *int // TODO: replace with int and `None` value = -1
	// The initial alphabet we want absolutely to include. This allows to cover
	// some characters that are not necessarily in the training set
	InitialAlphabet CharSet
	// An optional prefix to use on any subword that exists only behind another one
	ContinuingSubwordPrefix *string
	// An optional suffix to characterize an end-of-word subword
	EndOfWordSuffix *string
}

BpeTrainer is in charge of training a `BPE` model from a mapping of words to word counts.

Example:

wordCounts := map[string]int{
	"Hello": 1,
	"World": 1,
}

trainer := NewBpeTrainer(minFreq, vocabSize)
model, specialTokens := trainer.Train(wordCounts)

func NewBpeTrainer

func NewBpeTrainer(minFreq int, vocabSize int) *BpeTrainer

func (*BpeTrainer) ProcessTokens

func (bt *BpeTrainer) ProcessTokens(words map[string]int, tokens []string)

ProcessTokens processes a bunch of tokens, counting them.

func (*BpeTrainer) Train

func (bt *BpeTrainer) Train(wordCounts map[string]int) (tokenizer.Model, []tokenizer.AddedToken)

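Train trains a BPE model on the input wordCounts and returns: 1. the BPE model; 2. merges. NOTE: this description and the trailing `func (bt *BpeTrainer) Train(wordCounts map[string]int) (BPE, []string) {` reflect an older signature; the current method returns (tokenizer.Model, []tokenizer.AddedToken).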

func (*BpeTrainer) WithProgressBar

func (bt *BpeTrainer) WithProgressBar() bool

type BpeTrainerBuilder

type BpeTrainerBuilder struct {
	Config *TConfig
}

BpeTrainerBuilder can be used to create a `BpeTrainer` with a custom configuration

func NewBPETrainerBuilder

func NewBPETrainerBuilder() *BpeTrainerBuilder

func (*BpeTrainerBuilder) Build

func (btb *BpeTrainerBuilder) Build() *BpeTrainer

Build constructs the final BpeTrainer

func (*BpeTrainerBuilder) ContinuingSubwordPrefix

func (btb *BpeTrainerBuilder) ContinuingSubwordPrefix(prefix string)

ContinuingSubwordPrefix sets the ContinuingSubwordPrefix.

func (*BpeTrainerBuilder) EndOfWordSuffix

func (btb *BpeTrainerBuilder) EndOfWordSuffix(suffix string)

EndOfWordSuffix sets the EndOfWordSuffix.

func (*BpeTrainerBuilder) InitialAlphabet

func (btb *BpeTrainerBuilder) InitialAlphabet(alphabet CharSet)

InitialAlphabet sets the initial alphabet.

func (*BpeTrainerBuilder) LimitAlphabet

func (btb *BpeTrainerBuilder) LimitAlphabet(limit int)

LimitAlphabet sets the alphabet limit.

func (*BpeTrainerBuilder) MinFrequency

func (btb *BpeTrainerBuilder) MinFrequency(freq int)

MinFrequency sets the minimum frequency.

func (*BpeTrainerBuilder) ShowProgress

func (btb *BpeTrainerBuilder) ShowProgress(show bool)

ShowProgress sets whether to show progress.

func (*BpeTrainerBuilder) SpecialTokens

func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken)

SpecialTokens sets the special tokens.

func (*BpeTrainerBuilder) VocabSize

func (btb *BpeTrainerBuilder) VocabSize(size int)

VocabSize sets the vocabulary size.

type Cache

type Cache struct {
	Capacity int
	// contains filtered or unexported fields
}

Cache is a map with a read-write mutex included, used to hold a map of `word` strings. E.g. https://tour.golang.org/concurrency/9 NOTE: can we use the sync.Map struct instead?

func NewCache

func NewCache(capacity int) *Cache

NewCache creates an empty Cache with a specified capacity.

func (*Cache) Clear

func (c *Cache) Clear()

Clear clears the cache

func (*Cache) Fresh

func (c *Cache) Fresh()

Fresh creates a fresh `Cache` with the same configuration.

func (*Cache) GetValues

func (c *Cache) GetValues(keys []string) []Word

GetValues returns slices of values associated with input keys

func (*Cache) SetValues

func (c *Cache) SetValues(values []CacheItem)

type CacheItem

type CacheItem struct {
	// Key   interface{}
	// Value interface{}
	Key   string
	Value Word // `word` string
}

type CharSet

type CharSet map[string]struct{}

type Config

type Config struct {
	// contains filtered or unexported fields
}

type Merge

type Merge struct {
	Pos   int
	Rank  int
	NewId int
	Time  time.Time
}

func (*Merge) Cmp

func (m *Merge) Cmp(other *Merge) Ordering

func (*Merge) Eq

func (m *Merge) Eq(other *Merge) bool

NOTE: Should we implement comparing methods? - Eq - PartialCmp - Cmp

func (*Merge) PartialCmp

func (m *Merge) PartialCmp(other *Merge) (Ordering, error)

type Merges

type Merges map[Pair]PairVal

func CreateMerges added in v0.2.0

func CreateMerges(vocab map[string]int, mergesData []string) (*Merges, error)

type Ordering

type Ordering int

Ordering is an enum of Less, Equal, and Greater.

const (
	Less    Ordering = -1
	Equal   Ordering = 0
	Greater Ordering = 1
)

type Pair

type Pair struct {
	C1 int
	C2 int
}

type PairVal

type PairVal struct {
	Rank  int
	NewId int
}

PairVal holds pair's rank and NewId

type Symbol

type Symbol struct {
	C    int
	Prev int
	Next int
	Len  int
}

func (*Symbol) MergeWith

func (s *Symbol) MergeWith(other *Symbol, newC int)

type Symbols

type Symbols []Symbol

Symbols is a slice of Symbol structs, with some slice methods to manipulate it.

func (*Symbols) Insert

func (ss *Symbols) Insert(s Symbol, i int) error

Insert inserts a symbol to the slice at `i` index point

func (*Symbols) Remove

func (ss *Symbols) Remove(i int) error

Remove removes a symbol from the slice at `i` index point

type TConfig

type TConfig struct {
	MinFrequency            int
	VocabSize               int
	ShowProgress            bool
	SpecialTokens           []tokenizer.AddedToken
	LimitAlphabet           *int
	InitialAlphabet         CharSet
	ContinuingSubwordPrefix *string
	EndOfWordSuffix         *string
}

NOTE: there exists `Config`

type TMerge

type TMerge struct {
	Pair  Pair
	Count int
	Pos   UintSet
	Time  time.Time
}

type UintSet

type UintSet map[int]struct{}

UintSet is a map with no values, used as a set. Ref: https://stackoverflow.com/questions/57620170

type WChange

type WChange struct {
	C1     int
	C2     int
	Change int
}

type Word

type Word struct {
	Symbols Symbols
}

func NewWord

func NewWord() *Word

func (*Word) Add

func (w *Word) Add(c int, byteLen int)

func (*Word) GetChars

func (w *Word) GetChars() []int

func (*Word) GetOffsets

func (w *Word) GetOffsets() [][]int

func (*Word) Merge

func (w *Word) Merge(c1, c2, replacement int) ([]WChange, error)

Merge finds any pairs of (c1, c2) and removes them in place. It also maps changes depending on the position of the pair in the word.

func (*Word) MergeAll

func (w *Word) MergeAll(merges map[Pair]PairVal, dropoutOpt ...float32)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL