Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type EnglishContractions ¶
type EnglishContractions struct {
// contains filtered or unexported fields
}
func NewEnglishContractions ¶
func NewEnglishContractions() *EnglishContractions
func (*EnglishContractions) Expand ¶
func (c *EnglishContractions) Expand(token *SentenceToken) ([]*SentenceToken, bool)
type LangContractions ¶
type LangContractions interface {
Expand(*SentenceToken) ([]*SentenceToken, bool)
}
type SentenceToken ¶
type SentenceToken struct { Text []rune `json:"text"` PosStart int `json:"pos_start"` PosEnd int `json:"pos_end"` IsQuoteStart bool `json:"is_quote_start"` IsQuoteEnd bool `json:"is_quote_end"` IsEllipsis bool `json:"is_ellipsis"` HasApostrophe bool `json:"has_apostrophe"` }
func NewSentenceToken ¶
func NewSentenceToken(str []rune, posStart, posEnd int) *SentenceToken
func (*SentenceToken) Equals ¶
func (t *SentenceToken) Equals(compare *SentenceToken) bool
func (*SentenceToken) String ¶
func (t *SentenceToken) String() string
type TBWordTokenizer ¶
type TBWordTokenizer struct { LangContractions LangContractions ExpandContrations bool Normalize bool // contains filtered or unexported fields }
Mimics the TreeBank word tokenizer without using a mass of regexps
func NewTBWordTokenizer ¶
func NewTBWordTokenizer(normalize, checkContr bool, langContr LangContractions) *TBWordTokenizer
func (*TBWordTokenizer) Tokenize ¶
func (t *TBWordTokenizer) Tokenize(s []rune) []*SentenceToken
type TokenExtractor ¶
type TokenExtractor func([]rune, int) (*SentenceToken, bool)
Click to show internal directories.
Click to hide internal directories.