chuper

package module
v0.0.0-...-0099c3b Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 2, 2016 | License: BSD-3-Clause | Imports: 9 | Imported by: 0

README

Chuper

A simple and extensible web crawler built on top of fetchbot.

Documentation

Index

Constants

View Source
// Package-level default settings. Each name mirrors a Crawler field
// (CrawlDelay, CrawlPoliteness, LogFormat, LogLevel, UserAgent).
// NOTE(review): presumably New initializes a Crawler with these values;
// confirm against the implementation. DefaultUserAgent simply re-exports
// fetchbot's default.
const (
	DefaultCrawlDelay      = 5 * time.Second
	DefaultCrawlPoliteness = false
	DefaultLogFormat       = "text"
	DefaultLogLevel        = "info"
	DefaultUserAgent       = fetchbot.DefaultUserAgent
)

Variables

View Source
// Package-level default collaborators, matching the Crawler fields
// HTTPClient and Cache.
// DefaultHTTPClient is net/http's shared default client, and DefaultCache
// is a single MemoryCache created when the package is initialized.
// NOTE(review): both are shared mutable package state; presumably New
// assigns them to each Crawler — confirm.
var (
	DefaultHTTPClient = http.DefaultClient
	DefaultCache      = NewMemoryCache()
)
View Source
// ErrNotFound is a sentinel error with the message "not found".
// NOTE(review): presumably returned by Cache lookups for a missing key;
// the page does not show where it is returned — confirm.
var (
	ErrNotFound = errors.New("not found")
)

Functions

This section is empty.

Types

type Cache

// Cache is a key/value store with string keys and arbitrary values.
// It is exposed through Context.Cache and held in Crawler.Cache;
// MemoryCache declares the same four methods.
type Cache interface {
	// Get returns the value stored under key, or an error.
	// NOTE(review): presumably ErrNotFound when the key is absent — confirm.
	Get(key string) (interface{}, error)

	// Set stores value under key.
	Set(key string, value interface{}) error

	// SetNX stores value under key and reports a bool result.
	// NOTE(review): the name suggests "set if not exists", with the bool
	// reporting whether the value was stored — confirm.
	SetNX(key string, value interface{}) (bool, error)

	// Delete removes the entry for key.
	Delete(key string) error
}

type Cmd

// Cmd wraps a fetchbot.Cmd with two extra pieces of crawl metadata.
// It declares SourceURL and Depth methods.
// NOTE(review): S and D presumably back SourceURL() and Depth()
// respectively — confirm.
type Cmd struct {
	*fetchbot.Cmd
	S *url.URL
	D int
}

func (*Cmd) Depth

func (c *Cmd) Depth() int

func (*Cmd) SourceURL

func (c *Cmd) SourceURL() *url.URL

type CmdBasicAuth

// CmdBasicAuth has the same exported fields as Cmd and additionally
// declares a BasicAuth method returning two strings.
// NOTE(review): the unexported fields presumably hold the user name and
// password returned by BasicAuth(); S and D presumably back SourceURL()
// and Depth() — confirm.
type CmdBasicAuth struct {
	*fetchbot.Cmd
	S *url.URL
	D int
	// contains filtered or unexported fields
}

func (*CmdBasicAuth) BasicAuth

func (c *CmdBasicAuth) BasicAuth() (string, string)

func (*CmdBasicAuth) Depth

func (c *CmdBasicAuth) Depth() int

func (*CmdBasicAuth) SourceURL

func (c *CmdBasicAuth) SourceURL() *url.URL

type Command

// Command is the read-only view of a crawl request: its URL, method,
// source URL and depth. Cmd and CmdBasicAuth declare SourceURL and Depth
// directly.
// NOTE(review): URL and Method presumably come from the embedded
// *fetchbot.Cmd; the exact meaning of "source URL" and "depth" is not
// shown on this page — confirm.
type Command interface {
	URL() *url.URL
	Method() string
	SourceURL() *url.URL
	Depth() int
}

type Context

// Context is the value handed to a Processor. It combines the four
// Command accessors (URL, Method, SourceURL, Depth) with access to the
// Cache, an Enqueuer, and a logrus log entry built from caller-supplied
// fields. Ctx declares all seven methods.
type Context interface {
	Cache() Cache
	Queue() Enqueuer
	Log(fields map[string]interface{}) *logrus.Entry
	URL() *url.URL
	Method() string
	SourceURL() *url.URL
	Depth() int
}

type Crawler

// Crawler holds the exported crawl configuration. It is created with
// New, given processors with Register, and run with Start, which returns
// an Enqueuer; Block and Finish are also declared on it.
// Most fields have a matching Default* constant or variable in this
// package; CrawlDuration and Logger do not.
// NOTE(review): how CrawlDuration, CrawlPoliteness and the unexported
// fields are used is not visible on this page — confirm in the source.
type Crawler struct {
	CrawlDelay      time.Duration
	CrawlDuration   time.Duration
	CrawlPoliteness bool
	LogFormat       string
	LogLevel        string
	Logger          *logrus.Logger
	UserAgent       string
	HTTPClient      fetchbot.Doer
	Cache           Cache
	// contains filtered or unexported fields
}

func New

func New() *Crawler

New returns an initialized Crawler.

func (*Crawler) Block

func (c *Crawler) Block()

func (*Crawler) Finish

func (c *Crawler) Finish()

func (*Crawler) Match

func (*Crawler) Register

func (c *Crawler) Register(rc *ResponseCriteria, procs ...Processor)

func (*Crawler) Start

func (c *Crawler) Start() Enqueuer

type Ctx

// Ctx wraps a fetchbot.Context together with a Cache and a logrus
// Logger. It declares every method of the Context interface.
// NOTE(review): C and L presumably back Cache() and Log() respectively
// — confirm.
type Ctx struct {
	*fetchbot.Context
	C Cache
	L *logrus.Logger
}

func (*Ctx) Cache

func (c *Ctx) Cache() Cache

func (*Ctx) Depth

func (c *Ctx) Depth() int

func (*Ctx) Log

func (c *Ctx) Log(fields map[string]interface{}) *logrus.Entry

func (*Ctx) Method

func (c *Ctx) Method() string

func (*Ctx) Queue

func (c *Ctx) Queue() Enqueuer

func (*Ctx) SourceURL

func (c *Ctx) SourceURL() *url.URL

func (*Ctx) URL

func (c *Ctx) URL() *url.URL

type Enqueuer

// Enqueuer adds requests to the crawl queue. It is returned by
// Crawler.Start and by Context.Queue. The parameters are unnamed here;
// Queue's methods with the same signatures name them, in order:
// method, URL, sourceURL, depth, and for the basic-auth variant user and
// password.
type Enqueuer interface {
	Enqueue(string, string, string, int) error

	EnqueueWithBasicAuth(string, string, string, int, string, string) error
}

type MemoryCache

// MemoryCache declares Get, Set, SetNX and Delete with the same
// signatures as the Cache interface; DefaultCache is an instance created
// by NewMemoryCache. It embeds sync.Mutex, so Lock and Unlock are part of
// its exported method set and values must not be copied after first use.
// NOTE(review): presumably the mutex guards the unexported storage
// — confirm.
type MemoryCache struct {
	sync.Mutex
	// contains filtered or unexported fields
}

func NewMemoryCache

func NewMemoryCache() *MemoryCache

func (*MemoryCache) Delete

func (r *MemoryCache) Delete(key string) error

func (*MemoryCache) Get

func (r *MemoryCache) Get(key string) (interface{}, error)

func (*MemoryCache) Set

func (r *MemoryCache) Set(key string, value interface{}) error

func (*MemoryCache) SetNX

func (r *MemoryCache) SetNX(key string, value interface{}) (bool, error)

type Processor

// Processor handles a fetched page, given its Context and the parsed
// goquery document. Processors are attached with Crawler.Register.
// NOTE(review): the meaning of the bool result is not shown on this
// page (presumably whether to keep running later processors) — confirm.
type Processor interface {
	Process(Context, *goquery.Document) bool
}

type ProcessorFunc

// ProcessorFunc is a function type with the same signature as
// Processor.Process. It declares a Process method with that signature, so
// a plain function converted to ProcessorFunc can be used as a Processor.
type ProcessorFunc func(Context, *goquery.Document) bool

func (ProcessorFunc) Process

func (p ProcessorFunc) Process(ctx Context, doc *goquery.Document) bool

type Queue

// Queue wraps a fetchbot.Queue and declares Enqueue and
// EnqueueWithBasicAuth with the signatures required by Enqueuer.
type Queue struct {
	*fetchbot.Queue
}

func (*Queue) Enqueue

func (q *Queue) Enqueue(method, URL, sourceURL string, depth int) error

func (*Queue) EnqueueWithBasicAuth

func (q *Queue) EnqueueWithBasicAuth(method string, URL string, sourceURL string, depth int, user string, password string) error

type ResponseCriteria

// ResponseCriteria is passed to Crawler.Register together with the
// processors being registered.
// NOTE(review): presumably it selects which responses those processors
// receive (by HTTP method, content type, exact status or status range,
// path, and host); how zero values and the Status/MinStatus/MaxStatus
// combination are interpreted is not shown on this page — confirm.
type ResponseCriteria struct {
	Method      string
	ContentType string
	Status      int
	MinStatus   int
	MaxStatus   int
	Path        string
	Host        string
}

Directories

Path Synopsis
cmd

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL