crawlbot

Published: Dec 1, 2014 License: BSD-3-Clause Imports: 10 Imported by: 0

README

CrawlBot


CrawlBot is a simple, efficient, and flexible web crawler / spider. It is easy to use out of the box, and also provides extensive flexibility for advanced users.

package main

import (
	"fmt"
	"github.com/phayes/crawlbot"
	"log"
)

func main() {
	crawler := crawlbot.NewCrawler("http://cnn.com", myURLHandler, 4)
	crawler.Start()
	crawler.Wait()
}

func myURLHandler(resp *crawlbot.Response) {
	if resp.Err != nil {
		log.Fatal(resp.Err)
	}

	fmt.Println("Found URL at " + resp.URL)
}

CrawlBot provides extensive customizability for advanced use cases. See the documentation on crawlbot.Crawler and crawlbot.Response for more details.

package main

import (
	"fmt"
	"github.com/phayes/crawlbot"
	"log"
)

func main() {
	crawler := crawlbot.Crawler{
		URLs:       []string{"http://example.com", "http://cnn.com", "http://en.wikipedia.org"},
		NumWorkers: 12,
		Handler:    PrintTitle,
		CheckURL:   AllowEverything,
	}
	crawler.Start()
	crawler.Wait()
}

// Print the title of the page
func PrintTitle(resp *crawlbot.Response) {
	if resp.Err != nil {
		log.Println(resp.Err)
	}

	if resp.Doc != nil {
		title, err := resp.Doc.Search("//title")
		if err != nil {
			log.Println(err)
			return
		}
		if len(title) > 0 {
			fmt.Printf("Title of %s is %s\n", resp.URL, title[0].Content())
		}
	} else {
		fmt.Println("HTML was not parsed for " + resp.URL)
	}
}

// Crawl everything! Returning nil tells the crawler the URL is OK to follow.
func AllowEverything(crawler *crawlbot.Crawler, url string) error {
	return nil
}

Documentation

Overview

Crawlbot is a simple, efficient, and flexible web crawler / spider. It is easy to use out of the box, and also provides extensive flexibility for advanced users.

func main() {
	crawler := crawlbot.NewCrawler("http://cnn.com", myURLHandler, 4)
	crawler.Start()
	crawler.Wait()
}

func myURLHandler(resp *crawlbot.Response) {
	if resp.Err != nil {
		log.Fatal(resp.Err)
	}

	fmt.Println("Found URL at " + resp.URL)
}

CrawlBot provides extensive customizability for advanced use cases. See the documentation on Crawler and Response for more details.

func main() {
	crawler := crawlbot.Crawler{
		URLs:       []string{"http://example.com", "http://cnn.com", "http://en.wikipedia.org"},
		NumWorkers: 12,
		Handler:    PrintTitle,
		CheckURL:   AllowEverything,
	}
	crawler.Start()
	crawler.Wait()
}

// Print the title of the page
func PrintTitle(resp *crawlbot.Response) {
	if resp.Err != nil {
		log.Println(resp.Err)
	}

	if resp.Doc != nil {
		title, err := resp.Doc.Search("//title")
		if err != nil {
			log.Println(err)
			return
		}
		if len(title) > 0 {
			fmt.Printf("Title of %s is %s\n", resp.URL, title[0].Content())
		}
	} else {
		fmt.Println("HTML was not parsed for " + resp.URL)
	}
}

// Crawl everything! Returning nil tells the crawler the URL is OK to follow.
func AllowEverything(crawler *crawlbot.Crawler, url string) error {
	return nil
}

Index

Constants

This section is empty.

Variables

var (
	ErrReqFailed      = errors.New("HTTP request failed")
	ErrBodyRead       = errors.New("Error reading HTTP response body")
	ErrAlreadyStarted = errors.New("Cannot start crawler that is already running")
	ErrHeaderRejected = errors.New("CheckHeader rejected URL")
	ErrURLRejected    = errors.New("CheckURL rejected URL")
	ErrBadHttpCode    = errors.New("Bad HTTP response code")
	ErrBadContentType = errors.New("Unsupported Content-Type")
)
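
Your Handler can compare Response.Err against these exported values to tell expected rejections from real failures. A minimal sketch (the seed URL and worker count are illustrative):

package main

import (
	"fmt"
	"log"

	"github.com/phayes/crawlbot"
)

// skipRejections ignores URLs filtered out by CheckURL/CheckHeader and logs real failures.
func skipRejections(resp *crawlbot.Response) {
	switch resp.Err {
	case nil:
		fmt.Println("fetched", resp.URL)
	case crawlbot.ErrURLRejected, crawlbot.ErrHeaderRejected:
		// Expected: the URL was rejected by our own checks.
	default:
		log.Printf("failed to fetch %s: %v", resp.URL, resp.Err)
	}
}

func main() {
	crawler := crawlbot.NewCrawler("http://example.com", skipRejections, 4)
	crawler.Start()
	crawler.Wait()
}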

Functions

This section is empty.

Types

type Crawler

type Crawler struct {
	// A list of URLs to start crawling. This is your list of seed URLs.
	URLs []string

	// Number of concurrent workers
	NumWorkers int

	// For each page crawled this function will be called.
	// This is where your business logic should reside.
	// There is no default. If Handler is not set the crawler will panic.
	Handler func(resp *Response)

	// Before a URL is crawled it is passed to this function to decide whether it should be followed. Return nil to allow the URL.
	// By default we follow a link if it is in the same domain as one of our seed URLs.
	CheckURL func(crawler *Crawler, url string) error

	// Before reading in the body we can check the headers to see if we want to continue.
	// By default we abort unless the status is HTTP 200 OK and the Content-Type is HTML.
	// Override this function if you wish to handle non-html files such as binary images.
	// This function should return nil if we wish to continue and read the body.
	CheckHeader func(crawler *Crawler, url string, status int, header http.Header) error

	// This function is called to find new URLs in the document to crawl. By default it will
	// find all <a href> links in an HTML document. Override this function if you wish to follow
	// non-<a href> links such as <img src>, or if you wish to find links in non-HTML documents.
	LinkFinder func(resp *Response) []string

	// The crawler will call this function when it needs a new http.Client to give to a worker.
	// The default client is the built-in net/http Client with a 15 second timeout.
	// A sensible alternative might be a simple round-tripper (e.g. github.com/pkulak/simpletransport/simpletransport).
	// If you wish to rate-throttle your crawler, do so by implementing a custom http.Client.
	Client func() *http.Client

	// Set this to true and the crawler will not stop by itself; you will need to call Stop() explicitly.
	// This is useful when you need a long-running crawler that you occasionally feed new URLs via Add().
	Persistent bool
	// contains filtered or unexported fields
}
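
As an illustration of overriding these hooks (a sketch that assumes only the field signatures shown above; the URLs, timeout, and Content-Type check are illustrative), the following crawler reads bodies only for successful HTML responses and gives each worker a client with a longer timeout:

package main

import (
	"fmt"
	"net/http"
	"strings"
	"time"

	"github.com/phayes/crawlbot"
)

func main() {
	crawler := crawlbot.Crawler{
		URLs:       []string{"http://example.com"},
		NumWorkers: 4,
		Handler: func(resp *crawlbot.Response) {
			if resp.Err == nil {
				fmt.Println("fetched", resp.URL)
			}
		},
		// Read the body only for successful HTML responses.
		CheckHeader: func(c *crawlbot.Crawler, url string, status int, header http.Header) error {
			if status != http.StatusOK {
				return crawlbot.ErrBadHttpCode
			}
			if !strings.Contains(header.Get("Content-Type"), "text/html") {
				return crawlbot.ErrBadContentType
			}
			return nil
		},
		// Hand each worker an http.Client with a 30 second timeout instead of the default 15.
		Client: func() *http.Client {
			return &http.Client{Timeout: 30 * time.Second}
		},
	}
	crawler.Start()
	crawler.Wait()
}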

func NewCrawler

func NewCrawler(url string, handler func(resp *Response), numworkers int) *Crawler

Create a new simple crawler. If more customization is needed, create a Crawler{} directly.

func (*Crawler) Add

func (c *Crawler) Add(url string)

Add a URL to the crawler. If the item already exists this is a no-op. TODO: change this behavior so an item is re-queued if it already exists -- tricky if the item is StateRunning
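
A rough sketch of feeding a persistent crawler through Add() after it has started (the URLs and the sleep are illustrative; see the Persistent field above):

package main

import (
	"fmt"
	"time"

	"github.com/phayes/crawlbot"
)

func main() {
	crawler := crawlbot.Crawler{
		URLs:       []string{"http://example.com"},
		NumWorkers: 4,
		Handler: func(resp *crawlbot.Response) {
			if resp.Err == nil {
				fmt.Println("fetched", resp.URL)
			}
		},
		Persistent: true, // keep running even when the queue drains
	}
	crawler.Start()

	// Feed more work to the already-running crawler; duplicates are no-ops.
	crawler.Add("http://example.org")

	// Give the crawler some time to work, then shut it down explicitly.
	time.Sleep(time.Minute)
	crawler.Stop()
	crawler.Wait()
}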

func (*Crawler) IsRunning

func (c *Crawler) IsRunning() bool

Is the crawler currently running or is it stopped?

func (*Crawler) Start

func (c *Crawler) Start() error

Start crawling. Start() returns immediately; if you wish to wait for the crawl to finish, call Wait() after calling Start().

func (*Crawler) State

func (c *Crawler) State(url string) State

Get the current state for a URL.

func (*Crawler) Stop

func (c *Crawler) Stop()

Stop a running crawler. This stops all new work but doesn't cancel ongoing jobs. After calling Stop(), call Wait() to wait for everything to finish.
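
A sketch of calling Stop() from within a Handler, here to cap the crawl at roughly 100 pages (the counter and limit are illustrative, not part of the package):

package main

import (
	"fmt"
	"sync/atomic"

	"github.com/phayes/crawlbot"
)

var pages int32 // illustrative page counter, not part of crawlbot

func capped(resp *crawlbot.Response) {
	if resp.Err != nil {
		return
	}
	fmt.Println("fetched", resp.URL)
	// Handlers run concurrently, so count atomically. Calling Stop() from a
	// Handler is fine; only Wait() would deadlock here.
	if atomic.AddInt32(&pages, 1) >= 100 {
		resp.Crawler.Stop()
	}
}

func main() {
	crawler := crawlbot.NewCrawler("http://example.com", capped, 4)
	crawler.Start()
	crawler.Wait()
}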

func (*Crawler) Wait

func (c *Crawler) Wait()

Wait for the crawler to finish, blocking until it's done. Calling this within a Handler function will cause a deadlock. Don't do this.

type Response

type Response struct {
	// The http.Response object
	*http.Response

	// The URL for this Response
	URL string

	// If any errors were encountered in retrieving or processing this item, Err will be non-nil.
	// Your Handler function should generally check this first.
	Err error

	// The Crawler object that retrieved this item. You may use this to stop the crawler, add more URLs, etc.
	// Calling Crawler.Wait() from within your Handler will cause a deadlock. Don't do this.
	Crawler *Crawler
	// contains filtered or unexported fields
}

When handling a crawled page a Response is passed to the Handler function. A crawlbot.Response is an http.Response with a few extra fields.
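
For example, a Handler can read the embedded http.Response directly. A minimal sketch, assuming the embedded response is populated whenever Err is nil (the handler name and seed URL are illustrative):

package main

import (
	"fmt"
	"log"

	"github.com/phayes/crawlbot"
)

func inspect(resp *crawlbot.Response) {
	if resp.Err != nil {
		log.Println(resp.Err)
		return
	}
	// StatusCode and Header come from the embedded *http.Response;
	// URL and Crawler are the extra crawlbot fields.
	fmt.Printf("%s returned %d (%s)\n",
		resp.URL, resp.StatusCode, resp.Header.Get("Content-Type"))
}

func main() {
	crawler := crawlbot.NewCrawler("http://example.com", inspect, 4)
	crawler.Start()
	crawler.Wait()
}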

type State

type State int
const (
	StateNotFound State = iota
	StatePending  State = iota
	StateRunning  State = iota
	StateRejected State = iota
	StateDone     State = iota
)

URL states. You can query the current state of a URL by calling Crawler.State(url).
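
A minimal sketch of querying a URL's state once the crawl has finished (the seed URL and no-op handler are illustrative):

package main

import (
	"fmt"

	"github.com/phayes/crawlbot"
)

func main() {
	seed := "http://example.com"
	crawler := crawlbot.NewCrawler(seed, func(resp *crawlbot.Response) {}, 4)
	crawler.Start()
	crawler.Wait()

	// Once the crawl is finished, ask what happened to the seed URL.
	switch crawler.State(seed) {
	case crawlbot.StateDone:
		fmt.Println(seed, "was crawled")
	case crawlbot.StateRejected:
		fmt.Println(seed, "was rejected by CheckURL or CheckHeader")
	case crawlbot.StatePending, crawlbot.StateRunning:
		fmt.Println(seed, "is still queued or in progress")
	default: // StateNotFound
		fmt.Println(seed, "is unknown to the crawler")
	}
}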
