bpemodel

package

Version: v0.2.0 (latest) | Published: Dec 12, 2020 | License: BSD-2-Clause | Imports: 9 | Imported by: 1

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/nlpodyssey/gotokenizers

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
type BPEModel
- func New(vocab *vocabulary.Vocabulary, merges *MergeMap, cacheCapacity int, ...) *BPEModel
- func NewDefault() *BPEModel
- func (m *BPEModel) Tokenize(sequence string) ([]models.Token, error)
type MergeMap
- func MergeMapFromFile(filename string, vocab *vocabulary.Vocabulary, prefixLength int) (m *MergeMap, err error)
- func NewMergeMap() *MergeMap
- func (m *MergeMap) Get(firstID, secondID int) (MergeValue, bool)
- func (m *MergeMap) Set(firstID, secondID int, v MergeValue)
type MergeValue
type Symbol
type Word
- func NewWord() *Word
- func NewWordWithCapacity(capacity int) *Word
- func (w *Word) Add(symbolID, byteLen int)
- func (w *Word) Len() int
- func (w *Word) MergeAll(merges *MergeMap, dropout float64)
type WordCache
- func NewCache(capacity int) *WordCache
- func NewDefaultCache() *WordCache
- func (c *WordCache) Get(key string) *Word
- func (c *WordCache) GetValues(keys []string) []*Word
- func (c *WordCache) Set(key string, values *Word)
- func (c *WordCache) SetValues(keys []string, values []*Word)
type WordMerge
type WordMergeHeap
- func (h *WordMergeHeap) Len() int
- func (h *WordMergeHeap) Less(i, j int) bool
- func (h *WordMergeHeap) Pop() interface{}
- func (h *WordMergeHeap) Push(x interface{})
- func (h *WordMergeHeap) Swap(i, j int)
type WordSymbol
- func (s *WordSymbol) HasNext() bool
- func (s *WordSymbol) HasPrev() bool
- func (s *WordSymbol) MergeWith(other *WordSymbol, newSymbolID int)

Constants ¶

View Source

const DefaultCacheCapacity = 10_000

DefaultCacheCapacity is the default capacity for BPEModel internal cache.

Variables ¶

View Source

var ErrUnknownTokenOutOfVocabulary = fmt.Errorf("the provided unk token is out of vocabulary")

Functions ¶

This section is empty.

Types ¶

type BPEModel ¶

type BPEModel struct {
	// contains filtered or unexported fields
}

BPEModel is a Byte Pair Encoding (BPE) model.

See: https://www.aclweb.org/anthology/P16-1162/

func New ¶

func New(
	vocab *vocabulary.Vocabulary,
	merges *MergeMap,
	cacheCapacity int,
	dropout float64,
	unknownToken string,
	continuingSubwordPrefix string,
	endOfWordSuffix string,
	unknownFusionEnabled bool,
) *BPEModel

New returns a new BPEModel initialized with the given options.

func NewDefault ¶

func NewDefault() *BPEModel

func (*BPEModel) Tokenize ¶

func (m *BPEModel) Tokenize(sequence string) ([]models.Token, error)

type MergeMap ¶

type MergeMap map[symbolIDPair]MergeValue

MergeMap maps pairs of Symbol IDs to (Rank, ID) values.

func MergeMapFromFile ¶

func MergeMapFromFile(
	filename string,
	vocab *vocabulary.Vocabulary,
	prefixLength int,
) (m *MergeMap, err error)

MergeMapFromFile reads merges from file.

func NewMergeMap ¶

func NewMergeMap() *MergeMap

NewMergeMap returns a new empty MergeMap.

func (*MergeMap) Get ¶

func (m *MergeMap) Get(firstID, secondID int) (MergeValue, bool)

Get returns the value associated with the given pair of IDs, and whether the value exists in the map.

func (*MergeMap) Set ¶

func (m *MergeMap) Set(firstID, secondID int, v MergeValue)

type MergeValue ¶

type MergeValue struct {
	// Rank determines the order in which a merge is applied during
	// tokenization.
	Rank int
	// ID is the vocabulary ID of the symbol resulting from merging a pair of
	// symbols.
	ID int
}

MergeValue is a (Rank, ID) pair.

type Symbol ¶

type Symbol struct {
	// Unique identifier, which implicitly refers to a sequence of characters.
	// For example, it might be the ID of a word in a vocabulary.
	ID int
	// The length in bytes of the implicit sequence of characters.
	Length int
}

Symbol is an abstract reference to a sequence of characters.

type Word ¶

type Word []*WordSymbol

Word is a slice of WordSymbol.

func NewWord ¶

func NewWord() *Word

NewWord returns a new empty Word.

func NewWordWithCapacity ¶

func NewWordWithCapacity(capacity int) *Word

NewWordWithCapacity returns a new empty Word with the given capacity.

func (*Word) Add ¶

func (w *Word) Add(symbolID, byteLen int)

Add appends a new symbol to the Word.

func (*Word) Len ¶

func (w *Word) Len() int

func (*Word) MergeAll ¶

func (w *Word) MergeAll(merges *MergeMap, dropout float64)

type WordCache ¶

type WordCache struct {
	// contains filtered or unexported fields
}

func NewCache ¶

func NewCache(capacity int) *WordCache

NewCache returns a new WordCache initialized with the given capacity.

If capacity is set to zero, the cache becomes ineffective (is disabled).

func NewDefaultCache ¶

func NewDefaultCache() *WordCache

NewDefaultCache returns a new WordCache initialized with the default capacity.

func (*WordCache) Get ¶

func (c *WordCache) Get(key string) *Word

func (*WordCache) GetValues ¶

func (c *WordCache) GetValues(keys []string) []*Word

func (*WordCache) Set ¶

func (c *WordCache) Set(key string, values *Word)

func (*WordCache) SetValues ¶

func (c *WordCache) SetValues(keys []string, values []*Word)

type WordMerge ¶

type WordMerge struct {
	MergeValue
	Pos int
}

type WordMergeHeap ¶

type WordMergeHeap []WordMerge

func (*WordMergeHeap) Len ¶

func (h *WordMergeHeap) Len() int

func (*WordMergeHeap) Less ¶

func (h *WordMergeHeap) Less(i, j int) bool

func (*WordMergeHeap) Pop ¶

func (h *WordMergeHeap) Pop() interface{}

func (*WordMergeHeap) Push ¶

func (h *WordMergeHeap) Push(x interface{})

func (*WordMergeHeap) Swap ¶

func (h *WordMergeHeap) Swap(i, j int)

type WordSymbol ¶

type WordSymbol struct {
	Symbol
	// Prev is the index of the previous symbol in the Word.
	// -1 means no previous symbol.
	Prev int
	// Next is the index of the next symbol in the Word.
	// -1 means no next symbol.
	Next int
}

WordSymbol expands a Symbol with contextual information related to the Word that contains it.

func (*WordSymbol) HasNext ¶

func (s *WordSymbol) HasNext() bool

func (*WordSymbol) HasPrev ¶

func (s *WordSymbol) HasPrev() bool

func (*WordSymbol) MergeWith ¶

func (s *WordSymbol) MergeWith(other *WordSymbol, newSymbolID int)

MergeWith merges the current WordSymbol with the other one. In order to update prev/next, we consider the receiver to be the WordSymbol on the left, and other to be the next one on the right.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL