crawler

package
v0.0.0-...-f1bc7a1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 25, 2013 License: MIT Imports: 19 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	QueueEmpty  = errors.New("Queue is empty")
	QueueClosed = errors.New("Queue was closed")
)
View Source
var (
	ERR_MANY_REDIRECT    = errors.New("Too many redirects")
	ERR_TIMEOUT          = errors.New("Request timed out")
	ERR_DATABASE         = errors.New("Database returned an error")
	ERR_DOWNLOAD         = errors.New("Failed to download a page")
	ERR_INTERNAL         = errors.New("An internal error occurred")
	ERR_INVALIDURL       = errors.New("URL is invalid")
	ERR_INVALID_ROBOTS   = errors.New("Robots.txt is in an invalid format")
	ERR_NOT_HTML         = errors.New("This page is not written in HTML")
	ERR_HTML_PARSE_ERROR = errors.New("Failed to parse HTML")
)

Functions

func SHA1Hash

func SHA1Hash(token []byte) string

Types

type CrawlQueue

type CrawlQueue struct {
	sync.Mutex
	// contains filtered or unexported fields
}

func NewCrawlQueue

func NewCrawlQueue(duration time.Duration) *CrawlQueue

func (*CrawlQueue) Close

func (q *CrawlQueue) Close()

func (*CrawlQueue) Flush

func (q *CrawlQueue) Flush() []*urlparse.URL

func (CrawlQueue) Len

func (q CrawlQueue) Len() int

func (CrawlQueue) Less

func (q CrawlQueue) Less(i int, j int) bool

func (*CrawlQueue) Pop

func (q *CrawlQueue) Pop() (url *urlparse.URL, err error)

func (*CrawlQueue) Push

func (q *CrawlQueue) Push(url *urlparse.URL) error

func (CrawlQueue) Swap

func (q CrawlQueue) Swap(i int, j int)

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(exchange Exchange, riakClient *riak.Client, bucket, userAgent, crawlerName string) *Crawler

func (*Crawler) Start

func (c *Crawler) Start()

func (*Crawler) Stop

func (c *Crawler) Stop()

type CrawlingState

type CrawlingState struct {
	LastStatusCode int       `riak:"lastStatusCode"`
	LastDownload   time.Time `riak:"lastDownload"`
	Deleted        bool      `riak:"deleted"`
}

type Exchange

type Exchange struct {
	IPAddr string
	Port   int
}

type Page

type Page struct {
	URL         string        `riak:"url"`
	ContentType string        `riak:"contentType"`
	Body        []byte        `riak:"body"`
	RedirectTo  string        `riak:"redirectTo"`
	State       CrawlingState `riak:"state"`
	riak.Model
}

func NewPage

func NewPage(url string, statusCode int, contentType string, body []byte, redirectTo string, downloadAt time.Time) *Page

type PageStore

type PageStore struct {
	// contains filtered or unexported fields
}

func NewPageStore

func NewPageStore(client *riak.Client, bucket string) *PageStore

func (*PageStore) Delete

func (s *PageStore) Delete(page *Page)

func (*PageStore) Get

func (s *PageStore) Get(url string) (*Page, error)

func (*PageStore) IsKnownURL

func (s *PageStore) IsKnownURL(url *urlparse.URL) (bool, error)

func (*PageStore) Save

func (s *PageStore) Save(p *Page) error

type QueueElement

type QueueElement struct {
	// contains filtered or unexported fields
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL