ling

package module
v0.0.0-...-522aef2
Published: May 9, 2020 License: Apache-2.0 Imports: 18 Imported by: 9

README

ling is a Go toolkit for natural language processing

Implementation references

Similar NLP tools

Multilingual text tokenization

Text normalization

Lemmatization

Stemming and lemmatization (article in Chinese)

Tagging

  • Regex tagger
    • commonregex, a collection of common regular expressions for Go.
    • xurls, a Go package of regexes for URLs.

Natural language detection

getlang is much slower than franco

Documentation

Index

Constants

const (
	DatePattern = `(?i)(?:(?:tgl)?\d{1,2}[^0-9^:]\d{1,2}[^0-9^:](?:19|20)?\d{2})|(?:(?:19|20)?\d{2}[^0-9^:]\d{1,2}[^0-9^:]\d{1,2})`
	TimePattern = `(?:(?:0?|[12])\d\s*:+\s*[0-5]\d(?:\s*:+\s*[0-5]\d(?:\.\d+)?(?:\s*(?:\+|-)(?:0?\d|1[0-2]):?(?:0|3)0)?)?)`
	//TimePattern = `(?is)((?:0?|[12])\d\s*:+\s*[0-5]\d(?:\s*:+\s*[0-5]\d)?(?:\s*[,:.]*\s*(?:am|pm))?|(?:0?|[12])\d\s*[.\s]+\s*[0-5]\d(?:\s*[,:.]*\s*(?:am|pm))+)`
	PhonePattern          = `` /* 133-byte string literal not displayed */
	PhonesWithExtsPattern = `` /* 273-byte string literal not displayed */
	LinkPattern           = `(?i)(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?`
	EmailPattern          = `(?i)([A-Za-z0-9!#$%&'*+\/=?^_{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)`
	IPv4Pattern           = `(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)`
	CreditCardPattern     = `(?i)(?:(?:(?:[\d\*x]{4}[- ]?){3}[\d\*x]{4}|[\d\*x]{15,16}))`
	VISACreditCardPattern = `4\d{3}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}`
	MCCreditCardPattern   = `5[1-5]\d{2}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}`
	BtcAddressPattern     = `[13][a-km-zA-HJ-NP-Z1-9]{25,34}`
	SSNPattern            = `(?:\d{3}-\d{2}-\d{4})`
	MD5HexPattern         = `[0-9a-fA-F]{32}`
	SHA1HexPattern        = `[0-9a-fA-F]{40}`
	SHA256HexPattern      = `[0-9a-fA-F]{64}`
	GUIDPattern           = `[0-9a-fA-F]{8}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{12}`
	ISBN13Pattern         = `(?:[\d]-?){12}[\dxX]`
	ISBN10Pattern         = `(?:[\d]-?){9}[\dxX]`
	MACAddressPattern     = `(([a-fA-F0-9]{2}[:-]){5}([a-fA-F0-9]{2}))`
	IBANPattern           = `[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}([A-Z\d]?){0,16}`
	NumericPattern        = `([+\-]?((\d{1,3}(,\d{3})+))|((?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?))`
	DigitsPattern         = `\d+`
)

Regular expression patterns adapted from https://github.com/mingrammer/commonregex.
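
These pattern constants are ordinary Go regular expressions and can be compiled directly with the standard regexp package. A minimal sketch (assuming the module's import path is github.com/liuzl/ling):

package main

import (
	"fmt"
	"regexp"

	"github.com/liuzl/ling" // import path assumed
)

func main() {
	// EmailPattern is plain regexp syntax, so MustCompile cannot fail here.
	re := regexp.MustCompile(ling.EmailPattern)
	fmt.Println(re.FindAllString("mail a@example.com or b@example.org", -1))
	// [a@example.com b@example.org]
}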

const Lemma = "lemma"

Lemma processor name

const Lower = "lower"

const Norm = "norm"

const Unidecode = "unidecode"

Variables

var Processors = make(map[string]Processor)

Processors is the map of registered processors, keyed by name

Functions

func Script

func Script(text string) string
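
The page carries no doc comment for Script; given the Token.Script field below, it presumably reports the writing script of the text. A hedged sketch (the exact return values, e.g. "Latin" and "Han", are assumptions):

package main

import (
	"fmt"

	"github.com/liuzl/ling" // import path assumed
)

func main() {
	// Assumption: Script names the dominant Unicode script of the text.
	fmt.Println(ling.Script("hello")) // expected: Latin
	fmt.Println(ling.Script("你好"))   // expected: Han
}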

Types

type APITagger

type APITagger struct {
	// contains filtered or unexported fields
}

APITagger tags documents via an HTTP interface

func NewAPITagger

func NewAPITagger(addr string) (*APITagger, error)

NewAPITagger returns a new tagger

func (*APITagger) Process

func (t *APITagger) Process(d *Document) error

Process the input document
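
A hedged sketch of wiring an APITagger into a Pipeline; the endpoint URL is hypothetical, and treating addr as the HTTP address of a tagging service is an assumption:

package main

import (
	"log"

	"github.com/liuzl/ling" // import path assumed
)

func main() {
	// Hypothetical endpoint; the exact address format NewAPITagger
	// expects is an assumption.
	tagger, err := ling.NewAPITagger("http://localhost:8080/tag")
	if err != nil {
		log.Fatal(err)
	}
	p := ling.MustNLP(ling.Norm)
	// Registered taggers run on every subsequent Annotate call.
	if err := p.AddTagger(tagger); err != nil {
		log.Fatal(err)
	}
}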

type DictTagger

type DictTagger struct {
	*d.Dictionary
}

func NewDictTagger

func NewDictTagger() (*DictTagger, error)

func (*DictTagger) Process

func (t *DictTagger) Process(d *Document) error

type Document

type Document struct {
	Text   string   `json:"text"`
	Tokens []*Token `json:"tokens"`
	Spans  []*Span  `json:"spans"`
	Lang   string   `json:"lang"`
	Langs  []string `json:"langs"`
}

func NewDocument

func NewDocument(text string) *Document

func (*Document) NewSpan

func (d *Document) NewSpan(start, end int) *Span

func (*Document) String

func (d *Document) String() string

func (*Document) XRealTokens

func (d *Document) XRealTokens(anno string) []string

func (*Document) XTokens

func (d *Document) XTokens(anno string) []string
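
A minimal end-to-end sketch tying Document to the default pipeline (import path assumed; treating processor names such as "lemma" as XTokens annotation keys is also an assumption):

package main

import (
	"fmt"
	"log"

	"github.com/liuzl/ling" // import path assumed
)

func main() {
	p, err := ling.DefaultNLP()
	if err != nil {
		log.Fatal(err)
	}
	d := ling.NewDocument("Ling processes texts")
	if err := p.Annotate(d); err != nil {
		log.Fatal(err)
	}
	// Assumption: the lemma processor stores its output under the
	// annotation key "lemma".
	fmt.Println(d.XTokens(ling.Lemma))
}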

type Entity

type Entity struct {
	Text  string      `json:"text"`
	Type  string      `json:"type"`
	Value interface{} `json:"value"`
	Start int         `json:"start"`
	End   int         `json:"end"`
}

Entity stores an NER entity

type Lemmatizer

type Lemmatizer struct {
}

Lemmatizer is the processor for lemmatization

func (*Lemmatizer) Process

func (l *Lemmatizer) Process(d *Document) error

Process is the function to annotate documents

type Normalizer

type Normalizer struct {
}

Normalizer is the processor for token normalization

func (*Normalizer) Process

func (n *Normalizer) Process(d *Document) error

Process normalizes the tokens of Document d

type Pipeline

type Pipeline struct {
	Annotators []string
	// contains filtered or unexported fields
}

A Pipeline contains configured annotators and taggers for natural language processing

func DefaultNLP

func DefaultNLP() (*Pipeline, error)

DefaultNLP returns a ling handler with the norm, lemma, unidecode, and regex annotators

func MustNLP

func MustNLP(annotators ...string) *Pipeline

MustNLP is like NLP but panics if the annotators are not correct. It simplifies safe initialization of global variables holding a ling handler

func NLP

func NLP(annotators ...string) (*Pipeline, error)

NLP returns a ling handler with the given annotators
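
Because MustNLP panics on a bad annotator list, it suits package-level initialization, as noted above. A minimal sketch (import path assumed; it is also an assumption that the exported processor-name constants double as annotator names):

package app // hypothetical package

import "github.com/liuzl/ling" // import path assumed

// nlp fails fast at program start if an annotator name is wrong.
var nlp = ling.MustNLP(ling.Norm, ling.Lower)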

func (*Pipeline) AddTagger

func (p *Pipeline) AddTagger(t Processor) error

AddTagger adds a new processor t to Pipeline p

func (*Pipeline) Annotate

func (p *Pipeline) Annotate(d *Document) error

Annotate tags the Document with each configured processor and tagger

func (*Pipeline) AnnotatePro

func (p *Pipeline) AnnotatePro(d *Document, taggers ...Processor) error

AnnotatePro tags the Document with each configured processor plus the extra taggers passed per call
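
Per the variadic signature, extra taggers can participate in a single call without being registered on the pipeline; a minimal hedged sketch pairing a pipeline with a DictTagger:

package main

import (
	"fmt"
	"log"

	"github.com/liuzl/ling" // import path assumed
)

func main() {
	p := ling.MustNLP(ling.Norm)
	dt, err := ling.NewDictTagger()
	if err != nil {
		log.Fatal(err)
	}
	d := ling.NewDocument("New York is a city")
	// dt participates in this call only; it is not added to p.
	if err := p.AnnotatePro(d, dt); err != nil {
		log.Fatal(err)
	}
	fmt.Println(d)
}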

type Processor

type Processor interface {
	Process(d *Document) error
}
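
Any type with a Process(*Document) error method satisfies Processor, so custom taggers can plug into a Pipeline; a minimal sketch (the type name and annotation key are invented for illustration):

package custom // hypothetical package

import "github.com/liuzl/ling" // import path assumed

// TokenCounter is a hypothetical Processor that records the token
// count as a span annotation covering the whole document.
type TokenCounter struct{}

func (c *TokenCounter) Process(d *ling.Document) error {
	span := d.NewSpan(0, len(d.Text))
	if span.Annotations == nil {
		// Defensive: whether NewSpan allocates the map is not shown on this page.
		span.Annotations = make(map[string]interface{})
	}
	span.Annotations["token_count"] = len(d.Tokens)
	return nil
}

Such a value can then be registered with AddTagger or passed per call to AnnotatePro.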

type RegexTagger

type RegexTagger struct {
}

RegexTagger is the processor that tags tokens using regular expressions

func (*RegexTagger) Process

func (t *RegexTagger) Process(d *Document) error

Process is the function to annotate documents

type Span

type Span struct {
	Doc         *Document              `json:"-"`
	Start       int                    `json:"start"`
	End         int                    `json:"end"`
	Annotations map[string]interface{} `json:"annotations"`
}

func (*Span) String

func (s *Span) String() string

type Token

type Token struct {
	Doc         *Document         `json:"-"`
	Text        string            `json:"text"`
	Type        TokenType         `json:"type"`
	Script      string            `json:"script"`
	I           int               `json:"i"`
	StartByte   int               `json:"start_byte"`
	EndByte     int               `json:"end_byte"`
	Annotations map[string]string `json:"annotations"`
}

func (*Token) String

func (t *Token) String() string

type TokenType

type TokenType byte
const (
	EOF TokenType = iota
	Space
	Symbol
	Number
	Letters
	Punct
	Word
)

func Type

func Type(text string) TokenType

func (TokenType) MarshalJSON

func (r TokenType) MarshalJSON() ([]byte, error)

MarshalJSON is generated so TokenType satisfies json.Marshaler.

func (TokenType) String

func (i TokenType) String() string

func (*TokenType) UnmarshalJSON

func (r *TokenType) UnmarshalJSON(data []byte) error

UnmarshalJSON is generated so TokenType satisfies json.Unmarshaler.
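
Since both methods are generated to pair with String, TokenType presumably round-trips through JSON as its name; a sketch (the exact JSON form is an assumption):

package main

import (
	"encoding/json"
	"fmt"
	"log"

	"github.com/liuzl/ling" // import path assumed
)

func main() {
	b, err := json.Marshal(ling.Word)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(b)) // expected: "Word"

	var tt ling.TokenType
	if err := json.Unmarshal(b, &tt); err != nil {
		log.Fatal(err)
	}
	fmt.Println(tt == ling.Word) // expected: true
}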

type Tokenizer

type Tokenizer struct {
}

func (*Tokenizer) Process

func (t *Tokenizer) Process(d *Document) error

type Unidecoder

type Unidecoder struct {
}

func (*Unidecoder) Process

func (u *Unidecoder) Process(d *Document) error

Directories

Path Synopsis
cmd
