crawler

package v0.0.0-...-ef078db
Published: Mar 18, 2018 License: MIT Imports: 14 Imported by: 0

Documentation

Constants

const (
	// throttling rate
	DefaultThrottlingRate = 20

	// max crawl depth
	DefaultMaxCrawlDepth = 5

	// default compliance level with robots.txt policy
	// @see https://moz.com/learn/seo/robotstxt
	DefaultComplyWithRobotPolicy = true

	// DefaultUserAgent is the default user agent string in HTTPRequest
	DefaultUserAgent = "GoCrawler/v0.1 (+https://github.com/q/gocrawler)"
)

Default configuration values for the crawler.

Variables

var ErrDomainAlreadyRegistered = errors.New("domain is already registered/crawled")

ErrDomainAlreadyRegistered is returned when the domain has already been registered for crawling.
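
Callers can compare the error returned by Crawl against this sentinel value. A minimal sketch, assuming the package is imported as "crawler" and c is a *Crawler returned by New (the URL, depth, and domain key are illustrative):

err := c.Crawl("https://example.com", 3)
if err == crawler.ErrDomainAlreadyRegistered {
	// example.com was already seeded; reuse its existing worker instead
	w := c.Worker("example.com")
	fmt.Println(w.Status())
}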

Functions

This section is empty.

Types

type Crawler

type Crawler struct {
	// mutex
	sync.Mutex

	// user agent to send
	UserAgent string

	// http client
	HTTPClient *http.Client

	// logger interface
	Logger Logger
	// contains filtered or unexported fields
}

Crawler is a collection of workers that crawl their respective domains

func New

func New() *Crawler

New returns a new crawler
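
The exported fields of the returned Crawler can be set before the first Crawl call. A sketch; that unset fields fall back to DefaultUserAgent and a default HTTP client is an assumption based on the constants above:

c := crawler.New()
c.UserAgent = "MyBot/1.0 (+https://example.com/bot)"        // overrides DefaultUserAgent
c.HTTPClient = &http.Client{Timeout: 10 * time.Second}      // custom client with a timeout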

func (*Crawler) Close

func (c *Crawler) Close() error

Close cancels the subscriptions in flight, closes the Updates channel, and returns the last captured fetch error, if any.

func (*Crawler) Crawl

func (c *Crawler) Crawl(rawurl string, depth int) error

Crawl initialises the crawler by looking up robots.txt and then seeds the queue with an initial resource.
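
A minimal end-to-end sketch, assuming the import path github.com/q/gocrawler (inferred from the default user agent string); whether Crawl blocks until the domain has been fully fetched is not stated in this documentation, so the example only checks the seeding error:

package main

import (
	"log"

	crawler "github.com/q/gocrawler"
)

func main() {
	c := crawler.New()
	defer c.Close()

	// seed the queue for example.com down to the default depth
	if err := c.Crawl("https://example.com", crawler.DefaultMaxCrawlDepth); err != nil {
		log.Fatal(err)
	}
}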

func (*Crawler) Worker

func (c *Crawler) Worker(domain string) *Worker

Worker returns the worker for a given domain.
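
Once a domain has been seeded, its worker exposes the result tree and status. A sketch assuming the domain key is the bare host of the crawled URL and that Worker returns nil for unknown domains:

w := c.Worker("example.com")
if w != nil && w.Status() == crawler.StatusFetchingComplete {
	fmt.Printf("crawled %s, %d direct child nodes\n", w.Tree.URLString, len(w.Tree.Nodes))
}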

type Logger

type Logger interface {
	SetOutput(w io.Writer)
	SetPrefix(prefix string)
	Fatal(v ...interface{})
	Fatalf(format string, v ...interface{})
	Panic(v ...interface{})
	Panicf(format string, v ...interface{})
	Print(v ...interface{})
	Printf(format string, v ...interface{})
}

Logger defines the logging interface
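
The method set is a subset of the standard library's *log.Logger, so a stock logger satisfies the interface directly:

// compile-time assertion that *log.Logger implements crawler.Logger
var _ crawler.Logger = (*log.Logger)(nil)

c := crawler.New()
c.Logger = log.New(os.Stderr, "gocrawler ", log.LstdFlags)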

type Queue

type Queue struct {
	// contains filtered or unexported fields
}

Queue is a task queue for crawlers

type Resource

type Resource struct {
	// mutex
	sync.Mutex

	// resource URL
	URL *url.URL `json:"-"`

	// string version
	URLString string `json:"url"`

	// from meta
	Title string `json:"title"`

	// HTTP StatusCode
	HTTPStatusCode int `json:"status"`

	// root node
	Root *url.URL `json:"-"`

	// parent node ancestry
	Parent []string `json:"-"`

	// current depth
	Depth int `json:"depth"`

	// child nodes
	Nodes []*Resource `json:"nodes"`

	// last fetched timestamp
	LastFetched time.Time `json:"-"`
}

Resource describes a web page and its child nodes.
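
The exported fields are enough to walk the tree a worker builds. A sketch using a hypothetical helper (printTree is not part of the package):

// printTree prints a Resource and its child nodes depth-first.
func printTree(r *crawler.Resource, indent string) {
	fmt.Printf("%s%s [HTTP %d, depth %d]\n", indent, r.URLString, r.HTTPStatusCode, r.Depth)
	for _, child := range r.Nodes {
		printTree(child, indent+"  ")
	}
}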

type Worker

type Worker struct {
	// inherit wg
	sync.WaitGroup

	// nodes tree
	Tree *Resource

	// last updated timestamp
	LastUpdated time.Time
	// contains filtered or unexported fields
}

Worker is a crawler specific to a domain

func (*Worker) CrawlDepth

func (w *Worker) CrawlDepth() int

CrawlDepth returns the worker's crawl depth.

func (*Worker) Status

func (w *Worker) Status() WorkerStatus

Status returns the worker's current status.

type WorkerStatus

type WorkerStatus int

WorkerStatus describes the state of a worker.

const (
	StatusInitialised WorkerStatus = iota
	StatusFetchingInProgress
	StatusFetchingComplete
	StatusFetchingError
)

Worker status values.

func (WorkerStatus) MarshalJSON

func (s WorkerStatus) MarshalJSON() ([]byte, error)

MarshalJSON implements json.Marshaler for WorkerStatus.

func (WorkerStatus) String

func (s WorkerStatus) String() string

String implements fmt.Stringer for WorkerStatus.
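
Together, String and MarshalJSON let a status render as text in logs and JSON output; the exact strings produced are defined by the package and not shown here. A short sketch:

s := crawler.StatusFetchingComplete
fmt.Println(s) // textual form via String

b, err := json.Marshal(s) // encoded via MarshalJSON
if err == nil {
	fmt.Println(string(b))
}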
