tokenize

package
v0.0.0-...-0d25092 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 18, 2018 License: MIT Imports: 5 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type EnglishContractions

type EnglishContractions struct {
	// contains filtered or unexported fields
}

func NewEnglishContractions

func NewEnglishContractions() *EnglishContractions

func (*EnglishContractions) Expand

func (c *EnglishContractions) Expand(token *SentenceToken) ([]*SentenceToken, bool)

type LangContractions

type LangContractions interface {
	Expand(*SentenceToken) ([]*SentenceToken, bool)
}

type SentenceToken

type SentenceToken struct {
	Text          []rune `json:"text"`
	PosStart      int    `json:"pos_start"`
	PosEnd        int    `json:"pos_end"`
	IsQuoteStart  bool   `json:"is_quote_start"`
	IsQuoteEnd    bool   `json:"is_quote_end"`
	IsEllipsis    bool   `json:"is_ellipsis"`
	HasApostrophe bool   `json:"has_apostrophe"`
}

func NewSentenceToken

func NewSentenceToken(str []rune, posStart, posEnd int) *SentenceToken

func (*SentenceToken) Equals

func (t *SentenceToken) Equals(compare *SentenceToken) bool

func (*SentenceToken) String

func (t *SentenceToken) String() string

type TBWordTokenizer

type TBWordTokenizer struct {
	LangContractions  LangContractions
	ExpandContrations bool
	Normalize         bool
	// contains filtered or unexported fields
}

TBWordTokenizer mimics the Treebank word tokenizer without relying on a mass of regular expressions.

func NewTBWordTokenizer

func NewTBWordTokenizer(normalize, checkContr bool, langContr LangContractions) *TBWordTokenizer

func (*TBWordTokenizer) Tokenize

func (t *TBWordTokenizer) Tokenize(s []rune) []*SentenceToken

type TokenExtractor

type TokenExtractor func([]rune, int) (*SentenceToken, bool)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL