nlp

package module
v0.0.0-...-af2a801 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 4, 2023 License: MIT Imports: 13 Imported by: 1

README

Natural Language Processing (NLP)

NLP is a collection of algorithms to work with human languages.

todo ...

Documentation

Index

Constants

View Source
const (
	CleanControl = 1 << iota
	CleanMark
	CleanPunct
	CleanSpace
	CleanDigit
	CleanNumber
	CleanSymbol
	CleanLetter
	CleanPrint
	CleanGraphic

	DefaultCleanMask = CleanControl | CleanMark | CleanSymbol | CleanNumber | CleanPunct
)

Variables

View Source
var (
	ErrEmptyInput = errors.New("input text is empty")
	ErrBadVersion = errors.New("incompatible version")
)

Functions

func ReleaseCtx

func ReleaseCtx[T byteseq.Byteseq](ctx *Ctx[T])

Types

type Bigram

type Bigram uint32

func NewBigram

func NewBigram(a, b rune) (n Bigram)

func (Bigram) AppendTo

func (b Bigram) AppendTo(dst []byte) []byte

func (Bigram) String

func (b Bigram) String() string

type Cleaner

type Cleaner[T byteseq.Byteseq] interface {
	Clean(x T) []rune
	AppendClean(dst []rune, x T) []rune
}

type Ctx

type Ctx[T byteseq.Byteseq] struct {
	bitset.Bitset

	BufSP ScriptProba
	// contains filtered or unexported fields
}

func AcquireCtx

func AcquireCtx[T byteseq.Byteseq]() *Ctx[T]

func NewCtx

func NewCtx[T byteseq.Byteseq]() *Ctx[T]

func (*Ctx[T]) Clean

func (ctx *Ctx[T]) Clean() *Ctx[T]

func (*Ctx[T]) DetectScript

func (ctx *Ctx[T]) DetectScript() *Ctx[T]

func (*Ctx[T]) DetectScriptProba

func (ctx *Ctx[T]) DetectScriptProba() *Ctx[T]

func (*Ctx[T]) GetError

func (ctx *Ctx[T]) GetError() error

func (*Ctx[T]) GetOriginText

func (ctx *Ctx[T]) GetOriginText() T

func (*Ctx[T]) GetRunes

func (ctx *Ctx[T]) GetRunes() []rune

func (*Ctx[T]) GetScript

func (ctx *Ctx[T]) GetScript() Script

func (*Ctx[T]) GetScriptProba

func (ctx *Ctx[T]) GetScriptProba() ScriptProba

func (*Ctx[T]) GetScriptsLimit

func (ctx *Ctx[T]) GetScriptsLimit() []Script

func (*Ctx[T]) GetText

func (ctx *Ctx[T]) GetText() T

func (*Ctx[T]) GetTokens

func (ctx *Ctx[T]) GetTokens() Tokens

func (*Ctx[T]) LimitScripts

func (ctx *Ctx[T]) LimitScripts(list []Script) *Ctx[T]

func (*Ctx[T]) Modify

func (ctx *Ctx[T]) Modify() *Ctx[T]

func (*Ctx[T]) Reset

func (ctx *Ctx[T]) Reset() *Ctx[T]

func (*Ctx[T]) ResetCleaners

func (ctx *Ctx[T]) ResetCleaners() *Ctx[T]

func (*Ctx[T]) ResetModifiers

func (ctx *Ctx[T]) ResetModifiers() *Ctx[T]

func (*Ctx[T]) ResetScriptDetector

func (ctx *Ctx[T]) ResetScriptDetector() *Ctx[T]

func (*Ctx[T]) ResetTokenizer

func (ctx *Ctx[T]) ResetTokenizer() *Ctx[T]

func (*Ctx[T]) SetText

func (ctx *Ctx[T]) SetText(text T) *Ctx[T]

func (*Ctx[T]) Tokenize

func (ctx *Ctx[T]) Tokenize() *Ctx[T]

func (*Ctx[T]) WithCleaner

func (ctx *Ctx[T]) WithCleaner(cln Cleaner[T]) *Ctx[T]

func (*Ctx[T]) WithModifier

func (ctx *Ctx[T]) WithModifier(mod Modifier[T]) *Ctx[T]

func (*Ctx[T]) WithScriptDetector

func (ctx *Ctx[T]) WithScriptDetector(ds ScriptDetector[T]) *Ctx[T]

func (*Ctx[T]) WithTokenizer

func (ctx *Ctx[T]) WithTokenizer(tkn Tokenizer[T]) *Ctx[T]

type DummyModifier

type DummyModifier[T byteseq.Byteseq] struct{}

func (DummyModifier[T]) AppendModify

func (DummyModifier[T]) AppendModify(dst []rune, _ T) []rune

func (DummyModifier[T]) Modify

func (DummyModifier[T]) Modify(x T) T

type Fivegram

type Fivegram struct {
	// contains filtered or unexported fields
}

func NewFivegram

func NewFivegram(a, b, c, d, e rune) Fivegram

func (Fivegram) AppendTo

func (f Fivegram) AppendTo(dst []byte) []byte

func (Fivegram) String

func (f Fivegram) String() string

type Language

type Language uint

Language describes a language and provides access to the different forms of its name.

It uses the generated language_repo.go for fast access to the names repository. This is similar to the stringer approach but ~2-3 times faster. See https://github.com/koykov/versus/tree/master/stringer for comparison benchmarks.

const (
	LanguageAbaza Language = iota
	LanguageAbenaki_Penobscot
	LanguageAbkhaz
	LanguageAdyghe
	LanguageAfar
	LanguageAfrikaans
	LanguageAghul
	LanguageAhtna
	LanguageAinu
	LanguageAlsatian
	LanguageArabic_Romanized
	LanguageArabic
	LanguageArmenian
	LanguageAromanian
	LanguageAkan
	LanguageAlbanian
	LanguageAmharic
	LanguageAzerbaijani
	LanguageBalkar
	LanguageBalochi
	LanguageBambara
	LanguageBashkir
	LanguageBasque
	LanguageBlackfoot
	LanguageBolivian_Quechua
	LanguageBonan
	LanguageBhojpuri
	LanguageBelarusian
	LanguageBengali
	LanguageBosnian
	LanguageBulgarian
	LanguageBurmese
	LanguageBrazilian_Portuguese
	LanguageBrazilian_Veneto
	LanguageBreton
	LanguageBudukh
	LanguageBuryat
	LanguageCantonese
	LanguageCatalan
	LanguageCebuano
	LanguageChti
	LanguageCherokee
	LanguageCheyenne
	LanguageChewa
	LanguageChichewa
	LanguageChinese
	LanguageChoctaw
	LanguageChulym
	LanguageChuukese
	LanguageChuvash
	LanguageCornish
	LanguageCroatian
	LanguageCzech
	LanguageDalmatian
	LanguageDanish
	LanguageDaur
	LanguageDolgan
	LanguageDongxiang
	LanguageDutch
	LanguageEnets
	LanguageEnglish
	LanguageEsperanto
	LanguageEstonian
	LanguageEvenki
	LanguageFaroese
	LanguageFinnish
	LanguageFrench
	LanguageFulfulde
	LanguageGagauz
	LanguageGanda
	LanguageGbari
	LanguageGeorgian
	LanguageGerman
	LanguageGothic
	LanguageGreenlandic
	LanguageGreek
	LanguageGujarati
	LanguageGwichin
	LanguageHaida
	LanguageHaitian_Creole
	LanguageHan
	LanguageHausa
	LanguageHebrew
	LanguageHindi
	LanguageHmar
	LanguageHmong
	LanguageHungarian
	LanguageIndonesian
	LanguageInuit
	LanguageIrish
	LanguageIgbo
	LanguageIcelandic
	LanguageIlocano
	LanguageItalian
	LanguageJapanese
	LanguageJavanese
	LanguageJerriais
	LanguageJingpho
	LanguageJurchen
	LanguageKabardian
	LanguageKabyle
	LanguageKalmyk
	LanguageKamas
	LanguageKaraim
	LanguageKarakalpak
	LanguageKarakhanid
	LanguageKashubian
	LanguageKaska
	LanguageKazakh
	LanguageKannada
	LanguageKet
	LanguageKhakas
	LanguageKhalaj
	LanguageKhanty
	LanguageKhmer
	LanguageKhowar
	LanguageKiche
	LanguageKinyarwanda
	LanguageKipsigis
	LanguageKirghiz
	LanguageKiribati
	LanguageKoine_Greek
	LanguageKomi
	LanguageKorean
	LanguageKorean_Hangul
	LanguageKosraean
	LanguageKott
	LanguageKryts
	LanguageKumyk
	LanguageKurdish
	LanguageKutchi
	LanguageLak
	LanguageLakota
	LanguageLao
	LanguageLatin
	LanguageLatvian
	LanguageLaz
	LanguageLepcha
	LanguageLezgi
	LanguageLigurian
	LanguageLingala
	LanguageLithuanian
	LanguageMacedonian
	LanguageMalagasy
	LanguageMalay
	LanguageMalayalam
	LanguageMaori
	LanguageMarathi
	LanguageMongolian
	LanguageMaithili
	LanguageNepali
	LanguageNorwegian_Bokmal
	LanguageNorwegian_Nynorsk
	LanguageOromo
	LanguageOriya
	LanguagePunjabi
	LanguagePersian
	LanguagePolish
	LanguagePortuguese
	LanguageKirundi
	LanguageRomanian
	LanguageRussian
	LanguageSaraiki
	LanguageSerbian
	LanguageShona
	LanguageSinhalese
	LanguageSlovak
	LanguageSlovenian
	LanguageSomali
	LanguageSotho
	LanguageSpanish
	LanguageSwahili
	LanguageSwedish
	LanguageTamil
	LanguageTelugu
	LanguageThai
	LanguageTigrinya
	LanguageTurkmen
	LanguageTagalog
	LanguageTswana
	LanguageTurkish
	LanguageTsonga
	LanguageUyghur
	LanguageUkrainian
	LanguageUrdu
	LanguageUzbek
	LanguageVietnamese
	LanguageWelsh
	LanguageXhosa
	LanguageYiddish
	LanguageYoruba
	LanguageZulu
)

func (Language) Iso6391

func (l Language) Iso6391() string

func (Language) Iso6393

func (l Language) Iso6393() string

func (Language) Native

func (l Language) Native() string

func (Language) String

func (l Language) String() string

type Modifier

type Modifier[T byteseq.Byteseq] interface {
	Modify(x T) T
	AppendModify(dst []rune, x T) []rune
}

type NGModel

type NGModel[T byteseq.Byteseq] struct {
	Version   uint64
	Tokenizer Tokenizer[T]
	// contains filtered or unexported fields
}

func (*NGModel[T]) AddBigram

func (m *NGModel[T]) AddBigram(ng Bigram) *NGModel[T]

func (*NGModel[T]) AddFivegram

func (m *NGModel[T]) AddFivegram(ng Fivegram) *NGModel[T]

func (*NGModel[T]) AddQuadrigram

func (m *NGModel[T]) AddQuadrigram(ng Quadrigram) *NGModel[T]

func (*NGModel[T]) AddTrigram

func (m *NGModel[T]) AddTrigram(ng Trigram) *NGModel[T]

func (*NGModel[T]) AddUnigram

func (m *NGModel[T]) AddUnigram(ng Unigram) *NGModel[T]

func (*NGModel[T]) LoadFile

func (m *NGModel[T]) LoadFile(path string) error

func (*NGModel[T]) Parse

func (m *NGModel[T]) Parse(text T) *NGModel[T]

func (*NGModel[T]) Read

func (m *NGModel[T]) Read(r io.Reader) (n int, err error)

func (*NGModel[T]) Stat

func (m *NGModel[T]) Stat() (int, int, int, int, int)

func (*NGModel[T]) Write

func (m *NGModel[T]) Write(w io.Writer) (n int, err error)

type Quadrigram

type Quadrigram uint64

func NewQuadrigram

func NewQuadrigram(a, b, c, d rune) (n Quadrigram)

func (Quadrigram) AppendTo

func (q Quadrigram) AppendTo(dst []byte) []byte

func (Quadrigram) String

func (q Quadrigram) String() string

type SRE

type SRE struct {
	Evaluate func(r rune) bool
	// contains filtered or unexported fields
}

SRE is a script rune evaluator. See https://github.com/koykov/lab/tree/master/call_perf for the performance of the nested-functions approach.

type Script

type Script uint
const (
	ScriptLatin Script = iota
	ScriptArabic
	ScriptCyrillic
	ScriptDevanagari
	ScriptEthiopic
	ScriptHan
	ScriptTagalog
	ScriptPhags_Pa
	ScriptTelugu
	ScriptHebrew
	ScriptBopomofo
	ScriptMyanmar
	ScriptBengali
	ScriptDeseret
	ScriptShavian
	ScriptDuployan
	ScriptGeorgian
	ScriptRunic
	ScriptGreek
	ScriptGujarati
	ScriptArmenian
	ScriptMahajani
	ScriptOgham
	ScriptSyriac
	ScriptHiragana
	ScriptKatakana
	ScriptJavanese
	ScriptKannada
	ScriptKhmer
	ScriptHangul
	ScriptMalayalam
	ScriptModi
	ScriptMongolian
	ScriptTirhuta
	ScriptElbasan
	ScriptGurmukhi
	ScriptSinhala
	ScriptOsmanya
	ScriptTamil
	ScriptThai
)

func ScriptsSupported

func ScriptsSupported() []Script

func (Script) Evaluate

func (s Script) Evaluate(r rune) bool

Evaluate checks whether the given rune r is written in script s. It uses a precompiled SRE (script rune evaluator) to speed up evaluation. See the performance tests at https://github.com/koykov/versus/blob/master/nlp_script/evaluate_test.go

func (Script) Is

func (s Script) Is(r rune) bool

func (Script) Languages

func (s Script) Languages() []Language

type ScriptDetectAlgo

type ScriptDetectAlgo uint
const (
	ScriptDetectAlgoHalf ScriptDetectAlgo = iota
	ScriptDetectAlgoDistributed
	ScriptDetectAlgoFull
)

type ScriptDetector

type ScriptDetector[T byteseq.Byteseq] interface {
	Detect(ctx *Ctx[T]) (Script, error)
	DetectProba(ctx *Ctx[T]) (ScriptProba, error)
}

type ScriptProba

type ScriptProba []ScriptScore

func (ScriptProba) Len

func (s ScriptProba) Len() int

func (ScriptProba) Less

func (s ScriptProba) Less(i, j int) bool

func (*ScriptProba) Swap

func (s *ScriptProba) Swap(i, j int)

type ScriptScore

type ScriptScore struct {
	Script Script
	Score  float32
}

type StringTokenizer

type StringTokenizer[T byteseq.Byteseq] struct {
	Separator  string
	BlankLines TokenizerBlankLines
}

func NewStringTokenizer

func NewStringTokenizer[T byteseq.Byteseq](sep string, blankLines TokenizerBlankLines) StringTokenizer[T]

func (StringTokenizer[T]) AppendTokenize

func (t StringTokenizer[T]) AppendTokenize(dst Tokens, x T) Tokens

func (StringTokenizer[T]) Tokenize

func (t StringTokenizer[T]) Tokenize(x T) Tokens

type Token

type Token struct {
	// contains filtered or unexported fields
}

func ParseToken

func ParseToken[T byteseq.Byteseq](s T, lo, hi int) Token

func (Token) Bytes

func (t Token) Bytes() []byte

func (Token) Span

func (t Token) Span() (lo, hi int)

func (Token) String

func (t Token) String() string

type Tokenizer

type Tokenizer[T byteseq.Byteseq] interface {
	Tokenize(x T) Tokens
	AppendTokenize(dst Tokens, x T) Tokens
}

type TokenizerBlankLines

type TokenizerBlankLines int
const (
	TokenizerBlankLinesDiscard TokenizerBlankLines = iota
	TokenizerBlankLinesKeep
	TokenizerBlankLinesDiscardEOF

	DefaultTokenSeparator = " \n\t"
)

type Tokens

type Tokens []Token

func (Tokens) Each

func (t Tokens) Each(fn func(i int, t Token))

func (Tokens) Equal

func (t Tokens) Equal(e Tokens) bool

func (*Tokens) Reset

func (t *Tokens) Reset()

type Trigram

type Trigram struct {
	// contains filtered or unexported fields
}

func NewTrigram

func NewTrigram(a, b, c rune) Trigram

func (Trigram) AppendTo

func (t Trigram) AppendTo(dst []byte) []byte

func (Trigram) String

func (t Trigram) String() string

type UnicodeCleaner

type UnicodeCleaner[T byteseq.Byteseq] struct {
	Mask uint32
}

func NewUnicodeCleaner

func NewUnicodeCleaner[T byteseq.Byteseq](mask uint32) UnicodeCleaner[T]

func (UnicodeCleaner[T]) AppendClean

func (c UnicodeCleaner[T]) AppendClean(dst []rune, x T) []rune

func (UnicodeCleaner[T]) Clean

func (c UnicodeCleaner[T]) Clean(x T) []rune

type UnicodeScriptDetector

type UnicodeScriptDetector[T byteseq.Byteseq] struct {
	// contains filtered or unexported fields
}

UnicodeScriptDetector is a built-in detector of writing scripts.

func NewUnicodeScriptDetector

func NewUnicodeScriptDetector[T byteseq.Byteseq]() UnicodeScriptDetector[T]

func NewUnicodeScriptDetectorWithAlgo

func NewUnicodeScriptDetectorWithAlgo[T byteseq.Byteseq](algo ScriptDetectAlgo) UnicodeScriptDetector[T]

func (UnicodeScriptDetector[T]) Detect

func (d UnicodeScriptDetector[T]) Detect(ctx *Ctx[T]) (Script, error)

func (UnicodeScriptDetector[T]) DetectProba

func (d UnicodeScriptDetector[T]) DetectProba(ctx *Ctx[T]) (ScriptProba, error)

type Unigram

type Unigram uint16

func NewUnigram

func NewUnigram(r rune) Unigram

func (Unigram) AppendTo

func (u Unigram) AppendTo(dst []byte) []byte

func (Unigram) String

func (u Unigram) String() string

Directories

Path Synopsis
cmd

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL