go_bert_tokenizer

package module
v1.0.0
Published: Oct 28, 2023 License: MIT Imports: 5 Imported by: 0

README

go-bert-tokenizer

A BERT tokenizer implemented in Go, following the BERT tokenizer algorithm.

Usage

FullTokenizer
import (
    tokenizer "github.com/Hank-Kuo/go-bert-tokenizer"
)

voc, _ := tokenizer.FromFile("./tmp/vocab.txt") // load vocab from a vocab file

tkz := tokenizer.NewFullTokenizer(voc, 128, true)
encoding := tkz.Tokenize(sentence)
fmt.Println(encoding.Text)
fmt.Println(encoding.Tokens)
fmt.Println(encoding.TokenIDs)
fmt.Println(encoding.MaskIDs)
fmt.Println(encoding.TypeIDs)
WordpieceTokenizer
import (
    tokenizer "github.com/Hank-Kuo/go-bert-tokenizer"
)

voc, _ := tokenizer.FromFile("./tmp/vocab.txt") // load vocab from a vocab file

tkz := tokenizer.NewWordpieceTokenizer(voc)
tokens := tkz.Tokenize(sentence) // []string of wordpiece tokens
fmt.Println(tokens)
BasicTokenizer
import (
    tokenizer "github.com/Hank-Kuo/go-bert-tokenizer"
)

lower := true
tkz := tokenizer.NewBasicTokenizer(lower) // no vocab needed for basic tokenization
tokens := tkz.Tokenize(sentence) // []string of whitespace/punctuation-split tokens
fmt.Println(tokens)

Documentation

Index

Constants

const (
	ClassToken        = "[CLS]"
	SeparatorToken    = "[SEP]"
	SequenceSeparator = " ||| "
)

const DefaultMaxWordChars = 200

const DefaultUnknownToken = "[UNK]"
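
The SequenceSeparator constant suggests that a single input string can carry two segments. A minimal sketch of that reading, assuming the full tokenizer splits on " ||| " and assigns segment types the way standard BERT does; this behavior is inferred from the constant's name, not stated in the README:

voc, _ := tokenizer.FromFile("./tmp/vocab.txt")
tkz := tokenizer.NewFullTokenizer(voc, 128, true)

// Assumption: joining two segments with SequenceSeparator produces a
// sentence pair, framed as [CLS] segment A [SEP] segment B [SEP].
pair := "how are you" + tokenizer.SequenceSeparator + "i am fine"
encoding := tkz.Tokenize(pair)
fmt.Println(encoding.Tokens)  // expected to begin with "[CLS]" and contain "[SEP]"
fmt.Println(encoding.TypeIDs) // standard BERT convention: 0s for segment A, 1s for segment B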

Variables

This section is empty.

Functions

This section is empty.

Types

type BasicTokenizer

type BasicTokenizer struct {
	Lower bool
}

func NewBasicTokenizer

func NewBasicTokenizer(lower bool) *BasicTokenizer

func (*BasicTokenizer) Tokenize

func (bt *BasicTokenizer) Tokenize(text string) []string
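
As a rough illustration of the basic step under the BERT algorithm the README follows (lowercasing when Lower is true, plus whitespace and punctuation splitting; the output in the comment is an expectation, not captured from this library):

bt := tokenizer.NewBasicTokenizer(true)
fmt.Println(bt.Tokenize("Hello, World!"))
// expected under BERT basic tokenization: [hello , world !]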

type Encode

type Encode struct {
	ID       int32
	Text     string
	Tokens   []string
	TokenIDs []int32
	MaskIDs  []int32
	TypeIDs  []int32
}
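
For orientation, the fields can be read as follows, reusing the tkz from the Usage section; the values in the comments are what standard BERT conventions would produce for a short single-segment input, and are illustrative rather than captured from this library:

enc := tkz.Tokenize("hello world")
// enc.Text     -> the original input text
// enc.Tokens   -> ["[CLS]", "hello", "world", "[SEP]"], padded out to SeqLen
// enc.TokenIDs -> the vocabulary ID of each token above
// enc.MaskIDs  -> 1 for real tokens, 0 for padding
// enc.TypeIDs  -> all 0 for a single-segment input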

type FullTokenizer

type FullTokenizer struct {
	Basic     *BasicTokenizer
	Wordpiece *WordpieceTokenizer
	SeqLen    int
}

func NewFullTokenizer

func NewFullTokenizer(voc *Vocab, seqLen int, lower bool) *FullTokenizer

func (*FullTokenizer) Tokenize

func (tkz *FullTokenizer) Tokenize(text string) *Encode

type ID

type ID int32

func (ID) Int32

func (id ID) Int32() int32

type Vocab

type Vocab struct {
	// contains filtered or unexported fields
}

func FromFile

func FromFile(path string) (*Vocab, error)

func New

func New(tokens []string) *Vocab

func (*Vocab) Add

func (v *Vocab) Add(token string)

func (*Vocab) GetID

func (v *Vocab) GetID(token string) ID

func (*Vocab) GetToken

func (v *Vocab) GetToken() map[string]ID

func (*Vocab) Size

func (v *Vocab) Size() int
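
Besides FromFile, a Vocab can be assembled in memory. A minimal sketch using only the documented signatures; the assumption that IDs follow insertion order is mine, not the package's:

voc := tokenizer.New([]string{"[PAD]", "[UNK]", "[CLS]", "[SEP]", "hello"})
voc.Add("world")         // extend the vocabulary after construction
fmt.Println(voc.Size())  // 6
id := voc.GetID("world") // presumably 5, if IDs follow insertion order
fmt.Println(id.Int32())  // the same ID as a plain int32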

type WordpieceTokenizer

type WordpieceTokenizer struct {
	// contains filtered or unexported fields
}

func NewWordpieceTokenizer

func NewWordpieceTokenizer(voc *Vocab) *WordpieceTokenizer

func (*WordpieceTokenizer) SetMaxWordChars

func (wp *WordpieceTokenizer) SetMaxWordChars(c int)

func (*WordpieceTokenizer) SetUnknownToken

func (wp *WordpieceTokenizer) SetUnknownToken(tok string)

func (*WordpieceTokenizer) Tokenize

func (wp *WordpieceTokenizer) Tokenize(text string) []string
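
A short sketch of running the wordpiece step on its own, using the documented setters; the defaults match the constants above, and the sample output is the usual WordPiece behavior, vocabulary permitting:

wp := tokenizer.NewWordpieceTokenizer(voc)
wp.SetMaxWordChars(100)     // longer words collapse to the unknown token (default: DefaultMaxWordChars)
wp.SetUnknownToken("[UNK]") // default: DefaultUnknownToken
fmt.Println(wp.Tokenize("unaffable"))
// e.g. [una ##ffa ##ble] if those pieces are in the vocabulary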
