tokenize

package v0.0.0-...-0d25092
Published: May 18, 2018 License: MIT Imports: 5 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type EnglishContractions

type EnglishContractions struct {
	// contains filtered or unexported fields
}

func NewEnglishContractions

func NewEnglishContractions() *EnglishContractions

func (*EnglishContractions) Expand

func (c *EnglishContractions) Expand(token *Token) ([]*Token, bool)
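The package ships no examples, so the following is a minimal usage sketch. It assumes a hypothetical import path, that the bool result of Expand reports whether the token was recognized as a contraction, and that "don't" is in the English contraction table; none of this is confirmed on this page.

package main

import (
	"fmt"

	"github.com/example/tokenize" // hypothetical import path; substitute the real module path
)

func main() {
	contr := tokenize.NewEnglishContractions()

	// "don't" is 5 runes long and starts at position 0.
	tok := tokenize.NewToken([]rune("don't"), 0, 5)

	// Assumption: the bool reports whether an expansion took place.
	if expanded, ok := contr.Expand(tok); ok {
		for _, t := range expanded {
			fmt.Println(t.String())
		}
	}
}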

type LangContractions

type LangContractions interface {
	Expand(*Token) ([]*Token, bool)
}
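EnglishContractions is the only implementation documented here, but the interface allows plugging in other languages. Below is a sketch of a no-op implementation for a language without contraction data; returning the original token with false is an assumption about the expected contract, as is the import path.

package main

import "github.com/example/tokenize" // hypothetical import path

// noContractions is a placeholder LangContractions that never expands anything.
// Assumption: returning the original token and false signals "no expansion".
type noContractions struct{}

func (noContractions) Expand(t *tokenize.Token) ([]*tokenize.Token, bool) {
	return []*tokenize.Token{t}, false
}

func main() {
	// A TBWordTokenizer can then be built around it.
	_ = tokenize.NewTBWordTokenizer(true, false, noContractions{})
}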

type SplitTokenizer

type SplitTokenizer struct {
	// contains filtered or unexported fields
}

func NewSplitTokenizer

func NewSplitTokenizer(delimiter string) *SplitTokenizer

func (*SplitTokenizer) Tokenize

func (t *SplitTokenizer) Tokenize(str string) []*Token
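A hedged sketch of the simplest tokenizer on this page: split on a fixed delimiter. The import path is hypothetical, and the assumption that Word and Pos are filled in for each returned token is not confirmed by the documentation.

package main

import (
	"fmt"

	"github.com/example/tokenize" // hypothetical import path
)

func main() {
	t := tokenize.NewSplitTokenizer(" ")
	for _, tok := range t.Tokenize("the quick brown fox") {
		// Assumption: Word holds the token text and Pos its starting offset.
		fmt.Printf("%q at %d\n", tok.Word, tok.Pos)
	}
}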

type TBWordTokenizer

type TBWordTokenizer struct {
	LangContractions  LangContractions
	ExpandContrations bool
	Normalize         bool
	// contains filtered or unexported fields
}

Mimics the TreeBank word tokenizer without using a mass of regexps.

func NewTBWordTokenizer

func NewTBWordTokenizer(normalize, checkContr bool, langContr LangContractions) *TBWordTokenizer

func (*TBWordTokenizer) Tokenize

func (t *TBWordTokenizer) Tokenize(s string) []*Token

func (*TBWordTokenizer) TokenizeRune

func (t *TBWordTokenizer) TokenizeRune(s []rune) []*Token
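Putting the pieces together, a TBWordTokenizer can be wired up with English contraction expansion. The flag order follows the constructor signature above (normalize, then checkContr); the import path, the sample sentence, and the printed form are assumptions.

package main

import (
	"fmt"

	"github.com/example/tokenize" // hypothetical import path
)

func main() {
	// normalize=true, checkContr=true, with the built-in English contraction table.
	tb := tokenize.NewTBWordTokenizer(true, true, tokenize.NewEnglishContractions())

	for _, tok := range tb.Tokenize(`She said, "don't worry."`) {
		fmt.Println(tok)
	}

	// TokenizeRune accepts an already-decoded rune slice instead of a string.
	_ = tb.TokenizeRune([]rune("already decoded input"))
}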

type Token

type Token struct {
	Runes         []rune `json:"runes"`
	Word          string `json:"word"`
	Pos           int    `json:"pos"`
	PosTag        string `json:"pos_tag"`
	IsQuoteStart  bool   `json:"is_quote_start"`
	IsQuoteEnd    bool   `json:"is_quote_end"`
	IsEllipsis    bool   `json:"is_ellipsis"`
	HasApostrophe bool   `json:"has_apostrophe"`
}

func NewToken

func NewToken(str []rune, posStart, length int) *Token

func (*Token) Equals

func (t *Token) Equals(compare *Token) bool

func (*Token) Len

func (t *Token) Len() int

func (*Token) PosEnd

func (t *Token) PosEnd() int

func (*Token) SetText

func (t *Token) SetText(text []rune)

func (*Token) String

func (t *Token) String() string
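Token is shared by every tokenizer on this page. The sketch below assumes NewToken takes the source rune slice plus a start position and length, and that Len and PosEnd derive from those; the values in the comments are therefore guesses, not documented output, and the import path is hypothetical.

package main

import (
	"fmt"

	"github.com/example/tokenize" // hypothetical import path
)

func main() {
	src := []rune("hello world")

	// Assumption: the token covers src[0:5], i.e. "hello".
	tok := tokenize.NewToken(src, 0, 5)

	fmt.Println(tok.Len())    // presumably 5
	fmt.Println(tok.PosEnd()) // presumably Pos + Len, i.e. 5
	fmt.Println(tok.String()) // presumably "hello"

	// SetText replaces the token's text, e.g. after normalization.
	tok.SetText([]rune("HELLO"))

	other := tokenize.NewToken(src, 6, 5) // presumably "world"
	fmt.Println(tok.Equals(other))        // presumably false
}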

type TokenExtractor

type TokenExtractor func([]rune, int) (*Token, bool)
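TokenExtractor is exported but its call sites are unexported, so only the shape of the function type is certain. A hypothetical extractor that pulls a run of ASCII digits starting at the given position, again under an assumed import path:

package main

import (
	"fmt"

	"github.com/example/tokenize" // hypothetical import path
)

// digits is a hypothetical TokenExtractor: given the rune slice and a start
// position, it returns a token covering a run of ASCII digits, or false if
// the position does not start on a digit.
var digits tokenize.TokenExtractor = func(s []rune, pos int) (*tokenize.Token, bool) {
	end := pos
	for end < len(s) && s[end] >= '0' && s[end] <= '9' {
		end++
	}
	if end == pos {
		return nil, false
	}
	return tokenize.NewToken(s, pos, end-pos), true
}

func main() {
	if tok, ok := digits([]rune("abc 2018"), 4); ok {
		fmt.Println(tok) // presumably "2018"
	}
}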

type Tokenizer

type Tokenizer interface {
	Tokenize(string) []*Token
}
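Both SplitTokenizer and TBWordTokenizer satisfy Tokenizer through the method sets above, so callers can accept the interface and swap implementations. Whether nil is acceptable as the LangContractions argument when contraction checking is off is an assumption, as is the import path.

package main

import (
	"fmt"

	"github.com/example/tokenize" // hypothetical import path
)

// countWords depends only on the Tokenizer interface, not on a concrete type.
func countWords(tk tokenize.Tokenizer, s string) int {
	return len(tk.Tokenize(s))
}

func main() {
	fmt.Println(countWords(tokenize.NewSplitTokenizer(","), "a,b,c"))

	// Assumption: a nil LangContractions is tolerated when checkContr is false.
	fmt.Println(countWords(tokenize.NewTBWordTokenizer(false, false, nil), "a b c"))
}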
