go_bert_tokenizer

package module
v1.0.0
Published: Oct 28, 2023 License: MIT Imports: 5 Imported by: 0

README

go-bert-tokenizer

A BERT tokenizer implemented in Go, following the BERT tokenizer algorithm.

Usage

FullTokenizer
import (
    tokenizer "github.com/Hank-Kuo/go-bert-tokenizer"
)

voc, _ := tokenizer.FromFile("./tmp/vocab.txt") // load vocab from a vocab file

tkz := tokenizer.NewFullTokenizer(voc, 128, true)
encoding := tkz.Tokenize(sentence)
fmt.Println(encoding.Text)
fmt.Println(encoding.Tokens)
fmt.Println(encoding.TokenIDs)
fmt.Println(encoding.MaskIDs)
fmt.Println(encoding.TypeIDs)
WordpieceTokenizer
import (
    tokenizer "github.com/Hank-Kuo/go-bert-tokenizer"
)

voc, _ := tokenizer.FromFile("./tmp/vocab.txt") // load vocab from a vocab file

tkz := tokenizer.NewWordpieceTokenizer(voc)
tokens := tkz.Tokenize(sentence) // []string of wordpiece tokens
fmt.Println(tokens)
BasicTokenizer
import (
    tokenizer "github.com/Hank-Kuo/go-bert-tokenizer"
)

lower := true
tkz := tokenizer.NewBasicTokenizer(lower) // no vocab needed for basic tokenization
tokens := tkz.Tokenize(sentence) // []string of whitespace/punctuation-split tokens
fmt.Println(tokens)

Documentation

Index

Constants

const (
	ClassToken        = "[CLS]"
	SeparatorToken    = "[SEP]"
	SequenceSeparator = " ||| "
)

const DefaultMaxWordChars = 200

const DefaultUnknownToken = "[UNK]"
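
The SequenceSeparator constant suggests that a single input string can carry two segments. A minimal sketch of that reading, assuming the full tokenizer splits on " ||| " and assigns segment types the way standard BERT does; this behavior is inferred from the constant's name, not stated in the README:

voc, _ := tokenizer.FromFile("./tmp/vocab.txt")
tkz := tokenizer.NewFullTokenizer(voc, 128, true)

// Assumption: joining two segments with SequenceSeparator produces a
// sentence pair, framed as [CLS] segment A [SEP] segment B [SEP].
pair := "how are you" + tokenizer.SequenceSeparator + "i am fine"
encoding := tkz.Tokenize(pair)
fmt.Println(encoding.Tokens)  // expected to begin with "[CLS]" and contain "[SEP]"
fmt.Println(encoding.TypeIDs) // standard BERT convention: 0s for segment A, 1s for segment B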

Variables

This section is empty.

Functions

This section is empty.

Types

type BasicTokenizer

type BasicTokenizer struct {
	Lower bool
}

func NewBasicTokenizer

func NewBasicTokenizer(lower bool) *BasicTokenizer

func (*BasicTokenizer) Tokenize

func (bt *BasicTokenizer) Tokenize(text string) []string
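
As a rough illustration of the basic step under the BERT algorithm the README follows (lowercasing when Lower is true, plus whitespace and punctuation splitting; the output in the comment is an expectation, not captured from this library):

bt := tokenizer.NewBasicTokenizer(true)
fmt.Println(bt.Tokenize("Hello, World!"))
// expected under BERT basic tokenization: [hello , world !]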

type Encode

type Encode struct {
	ID       int32
	Text     string
	Tokens   []string
	TokenIDs []int32
	MaskIDs  []int32
	TypeIDs  []int32
}
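
For orientation, the fields can be read as follows, reusing the tkz from the Usage section; the values in the comments are what standard BERT conventions would produce for a short single-segment input, and are illustrative rather than captured from this library:

enc := tkz.Tokenize("hello world")
// enc.Text     -> the original input text
// enc.Tokens   -> ["[CLS]", "hello", "world", "[SEP]"], padded out to SeqLen
// enc.TokenIDs -> the vocabulary ID of each token above
// enc.MaskIDs  -> 1 for real tokens, 0 for padding
// enc.TypeIDs  -> all 0 for a single-segment input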

type FullTokenizer

type FullTokenizer struct {
	Basic     *BasicTokenizer
	Wordpiece *WordpieceTokenizer
	SeqLen    int
}

func NewFullTokenizer

func NewFullTokenizer(voc *Vocab, seqLen int, lower bool) *FullTokenizer

func (*FullTokenizer) Tokenize

func (tkz *FullTokenizer) Tokenize(text string) *Encode

type ID

type ID int32

func (ID) Int32

func (id ID) Int32() int32

type Vocab

type Vocab struct {
	// contains filtered or unexported fields
}

func FromFile

func FromFile(path string) (*Vocab, error)

func New

func New(tokens []string) *Vocab

func (*Vocab) Add

func (v *Vocab) Add(token string)

func (*Vocab) GetID

func (v *Vocab) GetID(token string) ID

func (*Vocab) GetToken

func (v *Vocab) GetToken() map[string]ID

func (*Vocab) Size

func (v *Vocab) Size() int
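
Besides FromFile, a Vocab can be assembled in memory. A minimal sketch using only the documented signatures; the assumption that IDs follow insertion order is mine, not the package's:

voc := tokenizer.New([]string{"[PAD]", "[UNK]", "[CLS]", "[SEP]", "hello"})
voc.Add("world")         // extend the vocabulary after construction
fmt.Println(voc.Size())  // 6
id := voc.GetID("world") // presumably 5, if IDs follow insertion order
fmt.Println(id.Int32())  // the same ID as a plain int32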

type WordpieceTokenizer

type WordpieceTokenizer struct {
	// contains filtered or unexported fields
}

func NewWordpieceTokenizer

func NewWordpieceTokenizer(voc *Vocab) *WordpieceTokenizer

func (*WordpieceTokenizer) SetMaxWordChars

func (wp *WordpieceTokenizer) SetMaxWordChars(c int)

func (*WordpieceTokenizer) SetUnknownToken

func (wp *WordpieceTokenizer) SetUnknownToken(tok string)

func (*WordpieceTokenizer) Tokenize

func (wp *WordpieceTokenizer) Tokenize(text string) []string
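
A short sketch of running the wordpiece step on its own, using the documented setters; the defaults match the constants above, and the sample output is the usual WordPiece behavior, vocabulary permitting:

wp := tokenizer.NewWordpieceTokenizer(voc)
wp.SetMaxWordChars(100)     // longer words collapse to the unknown token (default: DefaultMaxWordChars)
wp.SetUnknownToken("[UNK]") // default: DefaultUnknownToken
fmt.Println(wp.Tokenize("unaffable"))
// e.g. [una ##ffa ##ble] if those pieces are in the vocabulary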
