corpus

package
v0.0.0-...-ba7cae4 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 1, 2023 License: MIT Imports: 15 Imported by: 0

Documentation

Index

Constants

View Source
const (
	SUBCLUSTER_DELIM = "-------------------------"
	URL_DELIM        = " "

	MAX_URL_LEN             = 500
	DISALLOW_EMPTY_CLUSTERS = false
)

Variables

This section is empty.

Functions

func Compress

func Compress(s string) []byte

func CountUrls

func CountUrls(s string) int

func Decompress

func Decompress(b []byte) (string, error)

func GetIthUrl

func GetIthUrl(strs string, num uint64) string

Types

type Corpus

type Corpus struct {
	// contains filtered or unexported fields
}

func Concat

func Concat(arr []*Corpus) *Corpus

func ReadEmbeddingsCsv

func ReadEmbeddingsCsv(file string) *Corpus

func ReadEmbeddingsTxt

func ReadEmbeddingsTxt(clusterStart, clusterStop int, conf *config.Config) *Corpus

func ReadUrlsCsv

func ReadUrlsCsv(file string, compress bool) *Corpus

func ReadUrlsTxt

func ReadUrlsTxt(clusterStart, clusterStop int, conf *config.Config) *Corpus

func (*Corpus) ClusterToIndex

func (c *Corpus) ClusterToIndex(i uint) uint

func (*Corpus) Clusters

func (c *Corpus) Clusters() []uint

func (*Corpus) GetCompressUrl

func (c *Corpus) GetCompressUrl() bool

func (*Corpus) GetEmbedding

func (c *Corpus) GetEmbedding(index uint64) []int8

func (*Corpus) GetEmbeddingSlots

func (c *Corpus) GetEmbeddingSlots() uint64

func (*Corpus) GetNumDocs

func (c *Corpus) GetNumDocs() uint64

func (*Corpus) GetParams

func (c *Corpus) GetParams() Params

func (*Corpus) GetSlotBits

func (c *Corpus) GetSlotBits() uint64

func (*Corpus) GetSubcluster

func (c *Corpus) GetSubcluster(index uint) []byte

func (*Corpus) GetUrlBytes

func (c *Corpus) GetUrlBytes() uint64

func (*Corpus) GetUrlsInCluster

func (c *Corpus) GetUrlsInCluster(i uint64) string

func (*Corpus) IndexOfSubclusterWithinCluster

func (c *Corpus) IndexOfSubclusterWithinCluster(cluster, sc uint) int

func (*Corpus) NumClusters

func (c *Corpus) NumClusters() int

func (*Corpus) NumDocsInCluster

func (c *Corpus) NumDocsInCluster(i uint) uint64

func (*Corpus) NumSubclusters

func (c *Corpus) NumSubclusters() int

func (*Corpus) NumSubclustersInCluster

func (c *Corpus) NumSubclustersInCluster(i uint) int

func (*Corpus) SizeOfSubcluster

func (c *Corpus) SizeOfSubcluster(i uint) int

SizeOfSubcluster returns the subcluster's size in bytes.

func (*Corpus) SizeOfSubclusterByIndex

func (c *Corpus) SizeOfSubclusterByIndex(cluster uint, index int) int

func (*Corpus) SubclusterToClusterMap

func (c *Corpus) SubclusterToClusterMap() map[uint]uint

type Params

type Params struct {
	NumDocs        uint64 // number of docs in corpus
	EmbeddingSlots uint64 // number of slots per embedding
	SlotBits       uint64 // precision of each slot (in bits)
	UrlBytes       uint64 // max bytes/url -- after optional compression
	CompressUrl    bool   // whether the urls are compressed with gzip
}

func (*Params) Consistent

func (p *Params) Consistent(np *Params) bool

type Subcluster

type Subcluster struct {
	// contains filtered or unexported fields
}

TODO: Change to uint

func NewSubcluster

func NewSubcluster(i, s uint64) *Subcluster

func (*Subcluster) GobDecode

func (c *Subcluster) GobDecode(buf []byte) error

func (*Subcluster) GobEncode

func (c *Subcluster) GobEncode() ([]byte, error)

func (Subcluster) Index

func (sc Subcluster) Index() uint64

func (*Subcluster) SetIndex

func (sc *Subcluster) SetIndex(i uint64)

func (*Subcluster) SetSize

func (sc *Subcluster) SetSize(s uint64)

func (Subcluster) Size

func (sc Subcluster) Size() uint64

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL