corpus

package
v0.0.0-...-ce5e274 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 30, 2018 License: Apache-2.0 Imports: 9 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// VOCAB_HASH_SIZE is 30,000,000 (30M; the original note "3kw" is
	// shorthand for 3 x 10 million).
	// NOTE(review): the name suggests this sizes a vocabulary hash table, but
	// no use of the constant is visible here — confirm against the callers.
	VOCAB_HASH_SIZE int = 30000000
)

Variables

This section is empty.

Functions

This section is empty.

Types

type ICorpus

// ICorpus is the read/build interface of a corpus: it exposes vocabulary
// lookups, per-document word lists and corpus-wide counters. It also embeds
// the msgp Encodable, Decodable, Marshaler, Unmarshaler and Sizer interfaces,
// so every implementation must provide those serialization methods as well.
type ICorpus interface {
	// Build fills the corpus from model.
	// NOTE(review): what is read from the provider is not visible here.
	Build(model common.IModelDataProvider) (err error)
	// GetVocabCnt returns the vocabulary size after deduplication.
	GetVocabCnt() int
	// GetDocCnt returns the number of documents, deduplicated by docid.
	GetDocCnt() int
	// GetWordsCnt returns the number of words before deduplication.
	GetWordsCnt() int
	// GetWordIdx looks up word; ok reports whether it was found.
	// NOTE(review): idx is presumably the word's position in the vocabulary — confirm.
	GetWordIdx(word string) (idx int32, ok bool)
	// GetWordItemByIdx returns the word item for index i.
	// NOTE(review): behavior for an out-of-range i is not visible here.
	GetWordItemByIdx(i int) (item *TWordItem)
	// GetAllWords returns all word items.
	GetAllWords() (words TWordItemSlice)
	// GetAllDocWordsIdx returns the words of every document as int32 indices.
	// NOTE(review): presumably one inner slice per document — confirm.
	GetAllDocWordsIdx() [][]int32
	// GetAllDocWords returns the words of every document as word items.
	GetAllDocWords() (doc [][]*TWordItem)
	// GetDocWordsByDocid returns the word items of the document with the given docid.
	// NOTE(review): the result for an unknown id is not visible here.
	GetDocWordsByDocid(id string) (doc []*TWordItem)
	// GetDocWordsByIdx returns the word items of the document at index i.
	GetDocWordsByIdx(i int) (doc []*TWordItem)
	// Transform converts content into a sequence of word indices.
	// NOTE(review): tokenization and handling of unknown words are not visible here.
	Transform(content string) (wordsidx []int32)
	// GetDocids returns all documents as read in.
	GetDocids() []string
	msgp.Encodable
	msgp.Decodable
	msgp.Marshaler
	msgp.Unmarshaler
	msgp.Sizer
}

type TCorpusImpl

// TCorpusImpl holds the corpus data: the vocabulary, lookup maps from words
// and docids to indices, and the per-document word indices. The methods
// declared on *TCorpusImpl cover every method of ICorpus, including the msgp
// serialization methods.
type TCorpusImpl struct {
	Words        TWordItemSlice   // vocabulary
	Word2Idx     map[string]int32 // word -> index into Words
	Doc2WordsIdx [][]int32        // NOTE(review): presumably the word indices of each document — confirm
	Doc2Idx      map[string]int32 // docid -> index into Doc2WordsIdx
	Docids       []string         // original note: "document words, in order, when read in"; NOTE(review): by its name this presumably holds docids — confirm
	MinReduce    int32            // NOTE(review): undocumented; presumably a frequency threshold used while building — confirm
	MinCnt       int32            // NOTE(review): undocumented; presumably a minimum word count — confirm
	WordsCnt     int              // number of words before deduplication
}

func NewCorpus

func NewCorpus() *TCorpusImpl

func (*TCorpusImpl) Build

func (p *TCorpusImpl) Build(model common.IModelDataProvider) (err error)

func (*TCorpusImpl) DecodeMsg

func (z *TCorpusImpl) DecodeMsg(dc *msgp.Reader) (err error)

DecodeMsg implements msgp.Decodable

func (*TCorpusImpl) EncodeMsg

func (z *TCorpusImpl) EncodeMsg(en *msgp.Writer) (err error)

EncodeMsg implements msgp.Encodable

func (*TCorpusImpl) GetAllDocWords

func (p *TCorpusImpl) GetAllDocWords() (docs [][]*TWordItem)

func (*TCorpusImpl) GetAllDocWordsIdx

func (p *TCorpusImpl) GetAllDocWordsIdx() [][]int32

func (*TCorpusImpl) GetAllWords

func (p *TCorpusImpl) GetAllWords() (words TWordItemSlice)

func (*TCorpusImpl) GetDocCnt

func (p *TCorpusImpl) GetDocCnt() int

func (*TCorpusImpl) GetDocWordsByDocid

func (p *TCorpusImpl) GetDocWordsByDocid(id string) (doc []*TWordItem)

func (*TCorpusImpl) GetDocWordsByIdx

func (p *TCorpusImpl) GetDocWordsByIdx(i int) (doc []*TWordItem)

func (*TCorpusImpl) GetDocids

func (p *TCorpusImpl) GetDocids() []string

func (*TCorpusImpl) GetVocabCnt

func (p *TCorpusImpl) GetVocabCnt() int

func (*TCorpusImpl) GetWordIdx

func (p *TCorpusImpl) GetWordIdx(word string) (idx int32, ok bool)

func (*TCorpusImpl) GetWordItemByIdx

func (p *TCorpusImpl) GetWordItemByIdx(i int) (item *TWordItem)

func (*TCorpusImpl) GetWordsCnt

func (p *TCorpusImpl) GetWordsCnt() int

func (*TCorpusImpl) MarshalMsg

func (z *TCorpusImpl) MarshalMsg(b []byte) (o []byte, err error)

MarshalMsg implements msgp.Marshaler

func (*TCorpusImpl) Msgsize

func (z *TCorpusImpl) Msgsize() (s int)

Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message

func (*TCorpusImpl) String

func (p *TCorpusImpl) String() string

func (*TCorpusImpl) Transform

func (p *TCorpusImpl) Transform(content string) (wordsidx []int32)

func (*TCorpusImpl) UnmarshalMsg

func (z *TCorpusImpl) UnmarshalMsg(bts []byte) (o []byte, err error)

UnmarshalMsg implements msgp.Unmarshaler

type TWordItem

// TWordItem is one vocabulary entry: a word, its frequency, and its position
// in a Huffman tree expressed as a node path plus the matching bit code.
type TWordItem struct {
	Cnt   int32   // term frequency
	Point []int32 // path through the Huffman tree (n leaves + n inner nodes, root included), as node indices covering [root, leaf)
	Code  []bool  // Huffman code as 0/1 values covering (root, leaf]
	Word  string  // the word (original note: "word desc")
}

func (*TWordItem) DecodeMsg

func (z *TWordItem) DecodeMsg(dc *msgp.Reader) (err error)

DecodeMsg implements msgp.Decodable

func (*TWordItem) EncodeMsg

func (z *TWordItem) EncodeMsg(en *msgp.Writer) (err error)

EncodeMsg implements msgp.Encodable

func (*TWordItem) MarshalMsg

func (z *TWordItem) MarshalMsg(b []byte) (o []byte, err error)

MarshalMsg implements msgp.Marshaler

func (*TWordItem) Msgsize

func (z *TWordItem) Msgsize() (s int)

Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message

func (*TWordItem) UnmarshalMsg

func (z *TWordItem) UnmarshalMsg(bts []byte) (o []byte, err error)

UnmarshalMsg implements msgp.Unmarshaler

type TWordItemSlice

// TWordItemSlice is a slice of TWordItem values. It declares Len, Less and
// Swap, which is the method set required by sort.Interface; the resulting
// order is defined by Less, whose body is not shown here.
type TWordItemSlice []TWordItem

func (*TWordItemSlice) DecodeMsg

func (z *TWordItemSlice) DecodeMsg(dc *msgp.Reader) (err error)

DecodeMsg implements msgp.Decodable

func (TWordItemSlice) EncodeMsg

func (z TWordItemSlice) EncodeMsg(en *msgp.Writer) (err error)

EncodeMsg implements msgp.Encodable

func (TWordItemSlice) Len

func (p TWordItemSlice) Len() int

func (TWordItemSlice) Less

func (p TWordItemSlice) Less(i, j int) bool

func (TWordItemSlice) MarshalMsg

func (z TWordItemSlice) MarshalMsg(b []byte) (o []byte, err error)

MarshalMsg implements msgp.Marshaler

func (TWordItemSlice) Msgsize

func (z TWordItemSlice) Msgsize() (s int)

Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message

func (TWordItemSlice) Swap

func (p TWordItemSlice) Swap(i, j int)

func (*TWordItemSlice) UnmarshalMsg

func (z *TWordItemSlice) UnmarshalMsg(bts []byte) (o []byte, err error)

UnmarshalMsg implements msgp.Unmarshaler

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL