doc2vec

package
v0.0.0-...-b167170 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 23, 2020 License: Apache-2.0 Imports: 15 Imported by: 4

Documentation

Index

Constants

View Source
// Package-level tuning constants. Only the names, types and values are
// visible here; the notes on purpose below are assumptions to be confirmed
// against the implementation.
const (
	// MAX_EXP is 6.0.
	// NOTE(review): presumably the input bound of a precomputed sigmoid
	// table behind GetSigmoidValue, as in the reference word2vec code — confirm.
	MAX_EXP                 float64 = 6.0
	// EXP_TABLE_SIZE is 1000.
	// NOTE(review): presumably the number of entries in that sigmoid table — confirm.
	EXP_TABLE_SIZE          int     = 1000
	// NEG_SAMPLING_TABLE_SIZE is 1e8 (an integer-valued constant, so it is valid as an int).
	// NOTE(review): presumably the size of the table sampled by
	// GetNegativeSamplingWordIdx — confirm.
	NEG_SAMPLING_TABLE_SIZE int     = 1e8
	// PROGRESS_BAR_THRESHOLD is 100000.
	// NOTE(review): presumably a count that gates progress reporting — confirm.
	PROGRESS_BAR_THRESHOLD  int     = 100000
	// THREAD_NUM is 32.
	// NOTE(review): presumably the number of concurrent training workers — confirm.
	THREAD_NUM              int     = 32
)

Variables

This section is empty.

Functions

func DBC2SBC

func DBC2SBC(s string) string

DBC2SBC converts half-width characters to full-width (半角 -> 全角).

func GetNegativeSamplingWordIdx

func GetNegativeSamplingWordIdx() int32

func GetSigmoidValue

func GetSigmoidValue(f float64) float64

func If

func If(condition bool, trueVal, falseVal interface{}) interface{}

func Max

func Max(first int, args ...int) int

func Min

func Min(first int, args ...int) int

func QuickSort

func QuickSort(i, j int, vec []*SortItem)

QuickSort is an ascending-order quicksort (升序快排).

func SBC2DBC

func SBC2DBC(s string) string

SBC2DBC converts full-width characters to half-width (全角 -> 半角).

Types

type IDoc2Vec

// IDoc2Vec is the interface returned by NewDoc2Vec. Every method listed here
// is also declared on *TDoc2VecImpl.
//
// Only the signatures are visible in this listing, so the notes below describe
// what the signatures show; anything beyond that is marked as an assumption.
type IDoc2Vec interface {
	// Train and TrainFromString take a single string and return nothing.
	// NOTE(review): presumably fname is a corpus file path and strA is the
	// corpus text itself — confirm. Neither method reports an error.
	Train(fname string)
	TrainFromString(strA string)

	// GetCorpus and GetNeuralNet return the corpus.ICorpus and
	// neuralnet.INeuralNet values associated with the model.
	GetCorpus() corpus.ICorpus
	GetNeuralNet() neuralnet.INeuralNet

	// SaveModel and LoadModel take a file name and return an error.
	SaveModel(fname string) (err error)
	LoadModel(fname string) (err error)

	// Word2Words takes a word and returns nothing.
	// NOTE(review): presumably prints its result — confirm.
	Word2Words(word string)

	// The TXWord2Words* variants take a word and a limit and return a
	// []string. NOTE(review): the outWordsA / inWordListA maps look like
	// exclusion / inclusion word sets — confirm.
	TXWord2Words(word string, limit int, outWordsA map[string]int) []string
	TXWord2WordsIn(word string, limit int, inWordListA map[string]int) []string
	TXWord2WordsInOut(word string, limit int, inWordListA map[string]int, outWordsA map[string]int) []string

	// TXWord2Docs takes a word and a limit and returns a []string.
	TXWord2Docs(word string, limit int) []string
	// Word2Docs takes a word and returns nothing.
	Word2Docs(word string)

	// Sen2Words and Sen2Docs take content plus an iteration count and
	// return nothing.
	Sen2Words(content string, iters int)
	// InferDoc takes content plus an iteration count and returns a
	// []float32. NOTE(review): presumably the inferred document vector — confirm.
	InferDoc(content string, iters int) (rs []float32)
	Sen2Docs(content string, iters int)

	// TXDoc2Words returns both a []string and a []float32 for the given
	// content.
	TXDoc2Words(content string, iters int, limit int, outWordsA map[string]int) (rs []string, vec []float32)
	// FindNearestDoc returns a single string for the given content.
	FindNearestDoc(content string, iters int) string

	// Doc2Docs and Doc2Words take a document index and return nothing.
	Doc2Docs(docidx int)
	Doc2Words(docidx int)

	// GetLikelihood4Doc returns a float64 named likelihood for the given text.
	GetLikelihood4Doc(context string) (likelihood float64)
	// GetLeaveOneOutKwds takes content plus an iteration count and returns nothing.
	GetLeaveOneOutKwds(content string, iters int)
	// DocSimCal returns a float64 for a pair of texts.
	// NOTE(review): the result is named dis here but sim on
	// (*TDoc2VecImpl).DocSimCal — confirm whether it is a distance or a
	// similarity.
	DocSimCal(content1 string, content2 string) (dis float64)

	// GetDim and GetRound each return an int.
	// NOTE(review): presumably the Dim and Iters settings of the model — confirm.
	GetDim() int
	GetRound() int
}

func NewDoc2Vec

func NewDoc2Vec(useCbow, useHS, useNEG bool, windowSize, dim, iters int) IDoc2Vec

type SortItem

// SortItem pairs an int32 index with a float64 value. It is the element type
// of TSortItemSlice and of the slice passed to QuickSort.
type SortItem struct {
	// Idx is the item's index.
	// NOTE(review): presumably a word or document index — confirm.
	Idx int32
	// Dis is the value associated with Idx.
	// NOTE(review): presumably the distance/similarity used as the sort key — confirm.
	Dis float64
}

func (*SortItem) DecodeMsg

func (z *SortItem) DecodeMsg(dc *msgp.Reader) (err error)

DecodeMsg implements msgp.Decodable

func (SortItem) EncodeMsg

func (z SortItem) EncodeMsg(en *msgp.Writer) (err error)

EncodeMsg implements msgp.Encodable

func (SortItem) MarshalMsg

func (z SortItem) MarshalMsg(b []byte) (o []byte, err error)

MarshalMsg implements msgp.Marshaler

func (SortItem) Msgsize

func (z SortItem) Msgsize() (s int)

Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message

func (*SortItem) UnmarshalMsg

func (z *SortItem) UnmarshalMsg(bts []byte) (o []byte, err error)

UnmarshalMsg implements msgp.Unmarshaler

type TDoc2VecImpl

// TDoc2VecImpl holds the configuration and state of a doc2vec model.
// It declares every method of IDoc2Vec on its pointer receiver, along with
// the msgp DecodeMsg/EncodeMsg/MarshalMsg/UnmarshalMsg/Msgsize methods.
type TDoc2VecImpl struct {
	Trainfile    string
	Dim          int
	UseCbow      bool // true: Continuous Bag-of-Words model; false: skip-gram
	WindowSize   int  // window size of the CBOW model
	UseHS        bool
	UseNEG       bool // at least one of the UseHS / UseNEG optimization algorithms must be selected; both may be selected together. See the Google word2vec source code for details.
	Negative     int  // number of negative-sampling words
	StartAlpha   float64
	Iters        int
	TrainedWords int
	Corpus       corpus.ICorpus
	NN           neuralnet.INeuralNet
	// NOTE(review): what Pool caches is not visible in this listing — confirm
	// against the implementation.
	Pool         *sync.Pool
}

func (*TDoc2VecImpl) DecodeMsg

func (z *TDoc2VecImpl) DecodeMsg(dc *msgp.Reader) (err error)

DecodeMsg implements msgp.Decodable

func (*TDoc2VecImpl) Doc2Docs

func (p *TDoc2VecImpl) Doc2Docs(docidx int)

func (*TDoc2VecImpl) Doc2Words

func (p *TDoc2VecImpl) Doc2Words(docidx int)

func (*TDoc2VecImpl) DocSimCal

func (p *TDoc2VecImpl) DocSimCal(content1 string, content2 string) (sim float64)

func (*TDoc2VecImpl) EncodeMsg

func (z *TDoc2VecImpl) EncodeMsg(en *msgp.Writer) (err error)

EncodeMsg implements msgp.Encodable

func (*TDoc2VecImpl) FindNearestDoc

func (p *TDoc2VecImpl) FindNearestDoc(content string, iters int) string

func (*TDoc2VecImpl) FitDoc

func (p *TDoc2VecImpl) FitDoc(context string, iters int) (dsyn0 *neuralnet.TVector)

func (*TDoc2VecImpl) GetCorpus

func (p *TDoc2VecImpl) GetCorpus() corpus.ICorpus

func (*TDoc2VecImpl) GetDim

func (p *TDoc2VecImpl) GetDim() int

func (*TDoc2VecImpl) GetLeaveOneOutKwds

func (p *TDoc2VecImpl) GetLeaveOneOutKwds(content string, iters int)

func (*TDoc2VecImpl) GetLikelihood4Doc

func (p *TDoc2VecImpl) GetLikelihood4Doc(context string) (likelihood float64)

func (*TDoc2VecImpl) GetNeuralNet

func (p *TDoc2VecImpl) GetNeuralNet() neuralnet.INeuralNet

func (*TDoc2VecImpl) GetRound

func (p *TDoc2VecImpl) GetRound() int

func (*TDoc2VecImpl) GetTopKDocs

func (p *TDoc2VecImpl) GetTopKDocs(slice TSortItemSlice) []string

func (*TDoc2VecImpl) GetTopKDocsLimit

func (p *TDoc2VecImpl) GetTopKDocsLimit(slice TSortItemSlice, limit int) []string

func (*TDoc2VecImpl) GetTopKWords

func (p *TDoc2VecImpl) GetTopKWords(slice TSortItemSlice, limit int, outWordsA map[string]int) []string

func (*TDoc2VecImpl) GetTopKWordsIn

func (p *TDoc2VecImpl) GetTopKWordsIn(slice TSortItemSlice, limit int, inWordListA map[string]int) []string

func (*TDoc2VecImpl) GetTopKWordsInOut

func (p *TDoc2VecImpl) GetTopKWordsInOut(slice TSortItemSlice, limit int, inWordListA map[string]int, outWordListA map[string]int) []string

func (*TDoc2VecImpl) InferDoc

func (p *TDoc2VecImpl) InferDoc(content string, iters int) (rs []float32)

func (*TDoc2VecImpl) LoadModel

func (p *TDoc2VecImpl) LoadModel(fname string) (err error)

func (*TDoc2VecImpl) MarshalMsg

func (z *TDoc2VecImpl) MarshalMsg(b []byte) (o []byte, err error)

MarshalMsg implements msgp.Marshaler

func (*TDoc2VecImpl) Msgsize

func (z *TDoc2VecImpl) Msgsize() (s int)

Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message

func (*TDoc2VecImpl) PrintTopKDocs

func (p *TDoc2VecImpl) PrintTopKDocs(slice TSortItemSlice)

func (*TDoc2VecImpl) PrintTopKWords

func (p *TDoc2VecImpl) PrintTopKWords(slice TSortItemSlice)

func (*TDoc2VecImpl) SaveModel

func (p *TDoc2VecImpl) SaveModel(fname string) (err error)

func (*TDoc2VecImpl) Sen2Docs

func (p *TDoc2VecImpl) Sen2Docs(content string, iters int)

func (*TDoc2VecImpl) Sen2Words

func (p *TDoc2VecImpl) Sen2Words(content string, iters int)

func (*TDoc2VecImpl) TXDoc2Words

func (p *TDoc2VecImpl) TXDoc2Words(content string, iters int, limit int, outWordsA map[string]int) (rs []string, vec []float32)

func (*TDoc2VecImpl) TXWord2Docs

func (p *TDoc2VecImpl) TXWord2Docs(word string, limit int) []string

func (*TDoc2VecImpl) TXWord2Words

func (p *TDoc2VecImpl) TXWord2Words(word string, limit int, outWordsA map[string]int) []string

func (*TDoc2VecImpl) TXWord2WordsIn

func (p *TDoc2VecImpl) TXWord2WordsIn(word string, limit int, inWordListA map[string]int) []string

func (*TDoc2VecImpl) TXWord2WordsInOut

func (p *TDoc2VecImpl) TXWord2WordsInOut(word string, limit int, inWordListA map[string]int, outWordsA map[string]int) []string

func (*TDoc2VecImpl) Train

func (p *TDoc2VecImpl) Train(fname string)

func (*TDoc2VecImpl) TrainFromString

func (p *TDoc2VecImpl) TrainFromString(strA string)

func (*TDoc2VecImpl) UnmarshalMsg

func (z *TDoc2VecImpl) UnmarshalMsg(bts []byte) (o []byte, err error)

UnmarshalMsg implements msgp.Unmarshaler

func (*TDoc2VecImpl) Word2Docs

func (p *TDoc2VecImpl) Word2Docs(word string)

func (*TDoc2VecImpl) Word2Words

func (p *TDoc2VecImpl) Word2Words(word string)

type TSortItemSlice

// TSortItemSlice is a slice of *SortItem. It declares Len, Less and Swap,
// the method set required by sort.Interface; the ordering implemented by
// Less is not visible in this listing.
type TSortItemSlice []*SortItem

func (*TSortItemSlice) DecodeMsg

func (z *TSortItemSlice) DecodeMsg(dc *msgp.Reader) (err error)

DecodeMsg implements msgp.Decodable

func (TSortItemSlice) EncodeMsg

func (z TSortItemSlice) EncodeMsg(en *msgp.Writer) (err error)

EncodeMsg implements msgp.Encodable

func (TSortItemSlice) Len

func (p TSortItemSlice) Len() int

func (TSortItemSlice) Less

func (p TSortItemSlice) Less(i, j int) bool

func (TSortItemSlice) MarshalMsg

func (z TSortItemSlice) MarshalMsg(b []byte) (o []byte, err error)

MarshalMsg implements msgp.Marshaler

func (TSortItemSlice) Msgsize

func (z TSortItemSlice) Msgsize() (s int)

Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message

func (TSortItemSlice) Swap

func (p TSortItemSlice) Swap(i, j int)

func (*TSortItemSlice) UnmarshalMsg

func (z *TSortItemSlice) UnmarshalMsg(bts []byte) (o []byte, err error)

UnmarshalMsg implements msgp.Unmarshaler

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL