frontier

package
v1.0.59 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 10, 2024 License: Apache-2.0 Imports: 17 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func SyncMapDecode

func SyncMapDecode(m *sync.Map, file *os.File) error

func SyncMapEncode

func SyncMapEncode(m *sync.Map, file *os.File) error

Types

type Frontier

// Frontier groups the state of a crawl frontier: control flags, the
// pull/push channels, the local queue, the hosts pool and the seencheck.
type Frontier struct {
	// Paused is a boolean flag.
	// NOTE(review): presumably set while the frontier is paused — confirm against callers.
	Paused *utils.TAtomBool

	// NOTE(review): the four flags below look like shutdown/liveness signals
	// for the queue writer and queue reader routines — confirm against Start.
	FinishingQueueWriter *utils.TAtomBool
	FinishingQueueReader *utils.TAtomBool
	IsQueueWriterActive  *utils.TAtomBool
	IsQueueReaderActive  *utils.TAtomBool
	// JobPath is presumably the jobPath argument given to Init — confirm.
	JobPath              string

	// PullChan is the channel used by workers to get new URLs to archive.
	// PushChan is the channel used to push the discovered URLs to the
	// frontier.
	PullChan chan *Item
	PushChan chan *Item

	// Queue is a local queue storing all the URLs to crawl.
	// It is a prefixed queue: basically one sub-queue per host.
	Queue *goque.PrefixQueue
	// QueueCount stores the number of URLs currently queued.
	QueueCount *ratecounter.Counter

	// HostPool is a *sync.Map that contains all the different hosts
	// that Zeno crawled, with a counter for each. Going through that map
	// gives us the prefix to query from the queue.
	// NOTE(review): values are presumably *PoolItem — confirm.
	HostPool *sync.Map

	// UseSeencheck presumably mirrors Init's useSeencheck argument and
	// gates the use of Seencheck — confirm.
	UseSeencheck bool
	Seencheck    *Seencheck
	// LoggingChan carries FrontierLogMessage values.
	// NOTE(review): presumably the loggingChan argument given to Init — confirm.
	LoggingChan  chan *FrontierLogMessage
}

Frontier holds all the data for a frontier

func (*Frontier) DecrHost

func (f *Frontier) DecrHost(host string)

DecrHost decrements by 1 the counter of a host in the pool

func (*Frontier) DecrHostActive

func (f *Frontier) DecrHostActive(host string)

func (*Frontier) GetActiveHostCount

func (f *Frontier) GetActiveHostCount(host string) (value int)

func (*Frontier) GetHostCount

func (f *Frontier) GetHostCount(host string) (value int)

GetHostCount returns the counter of the key (the host)

func (*Frontier) GetHostsCount

func (f *Frontier) GetHostsCount() (value int64)

func (*Frontier) IncrHost

func (f *Frontier) IncrHost(host string)

IncrHost increments by 1 the counter of a host in the pool

func (*Frontier) IncrHostActive

func (f *Frontier) IncrHostActive(host string)

func (*Frontier) Init

func (f *Frontier) Init(jobPath string, loggingChan chan *FrontierLogMessage, workers int, useSeencheck bool) (err error)

Init initializes the components of a frontier

func (*Frontier) IsHostInPool

func (f *Frontier) IsHostInPool(host string) bool

IsHostInPool returns true if the host is in the pool

func (*Frontier) Load

func (f *Frontier) Load()

Load takes the path to the frontier's hosts pool and status dump, decodes that file, and loads it into the job's frontier

func (*Frontier) Save

func (f *Frontier) Save()

Save writes the in-memory hosts pool so that the job can resume properly the next time it is loaded

func (*Frontier) Start

func (f *Frontier) Start()

Start fires up the background processes that handle the frontier

type FrontierLogMessage

// FrontierLogMessage is the message type carried by a
// chan *FrontierLogMessage such as Frontier.LoggingChan.
type FrontierLogMessage struct {
	// Fields holds arbitrary key/value data attached to the message.
	// NOTE(review): presumably forwarded as logrus fields — confirm.
	Fields  map[string]interface{}
	// Message is the log text.
	Message string
	// Level is the logrus level of the message.
	Level   logrus.Level
}

type Item

// Item is a crawlable object.
// NOTE(review): the per-field notes below are inferred from names and from
// NewItem's parameters (URL, parentItem, itemType, hop, ID) — confirm
// against the implementation.
type Item struct {
	// ID: presumably the ID argument given to NewItem.
	ID             string
	// Hash: presumably a hash derived from the URL — confirm.
	Hash           uint64
	// Hop: presumably the hop argument given to NewItem.
	Hop            uint8
	// Host: presumably the host of URL; hosts are the keys of the frontier's pool.
	Host           string
	// Type: presumably the itemType argument given to NewItem.
	Type           string
	// Redirect: presumably a count of redirects followed — confirm.
	Redirect       int
	// URL is the item's URL.
	URL            *url.URL
	// ParentItem: presumably the parentItem argument given to NewItem.
	ParentItem     *Item
	// LocallyCrawled: meaning not visible here — TODO confirm.
	LocallyCrawled uint64
}

Item is a crawlable object

func IsSeedList

func IsSeedList(path string) (seeds []Item, err error)

IsSeedList validates whether the path is a seed list, and returns a slice of frontier.Item made of the seeds if it can

func NewItem

func NewItem(URL *url.URL, parentItem *Item, itemType string, hop uint8, ID string) *Item

NewItem initializes an *Item

type Pair

// Pair is a generic key/value pair.
// NOTE(review): presumably the serialized form of one sync.Map entry used by
// SyncMapEncode / SyncMapDecode — confirm.
type Pair struct {
	Key, Value interface{}
}

type PoolItem

// PoolItem holds two counters.
// NOTE(review): presumably the per-host value stored in Frontier.HostPool,
// with TotalCount driven by IncrHost/DecrHost and ActiveCount by
// IncrHostActive/DecrHostActive — confirm.
type PoolItem struct {
	TotalCount  uint64
	ActiveCount uint64
}

type Seencheck

// Seencheck holds the seencheck database and the seen counter.
type Seencheck struct {
	// SeenCount is the seen counter (incremented by Seen, per its doc).
	SeenCount *ratecounter.Counter
	// SeenDB is the seencheck database (queried by IsSeen, per its doc).
	SeenDB    leveldb.Store
}

Seencheck holds the Seencheck database and the seen counter

func (*Seencheck) IsSeen

func (seencheck *Seencheck) IsSeen(hash string) (found bool, value string)

IsSeen checks whether the hash is in the seencheck database

func (*Seencheck) Seen

func (seencheck *Seencheck) Seen(hash, value string)

Seen marks a hash as seen and increments the seen counter

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL