goribot: github.com/zhshch2002/goribot Index | Files

package goribot

import "github.com/zhshch2002/goribot"

Index

Package Files

cache.go context.go extensions.go goribot.go net.go task_queue.go tools.go

Constants

const DefaultUA = "Goribot"

DefaultUA is the default User-Agent of the spider.

Variables

var DefaultClient = &http.Client{
    Timeout: 10 * time.Second,
}

DefaultClient is the default Client and is used by Get, Head, and Post.

var TodoContext = &Context{
    Text:     "",
    Html:     &goquery.Document{},
    Json:     map[string]interface{}{},
    Request:  &Request{},
    Response: &Response{},
    Tasks:    []*Task{},
    Items:    []interface{}{},
    Meta:     map[string]interface{}{},
    drop:     false,
}

TodoContext -- if a task is created by `spider.NewTask` as a seed task, the OnTask handler will get TodoContext as its ctx param.

func GetRequestHash Uses

func GetRequestHash(r *Request) [md5.Size]byte

GetRequestHash returns a hash of the URL, header, cookie and body data of a request.

func HostFilter Uses

func HostFilter(h ...string) func(s *Spider)

HostFilter is an extension that filters new tasks by host.

func MaxReqLimiter Uses

func MaxReqLimiter(m uint64) func(s *Spider)

MaxReqLimiter is an extension that limits the number of new tasks.

func MustParseUrl Uses

func MustParseUrl(rawurl string) *url.URL

MustParseUrl parses a URL from a string and panics if parsing returns an error.

func RandomUserAgent Uses

func RandomUserAgent() func(s *Spider)

RandomUserAgent is an extension that sets a random User-Agent for each new task.

func RefererFiller Uses

func RefererFiller() func(s *Spider)

RefererFiller is an extension that adds a Referer to each new task.

func ReqDeduplicate Uses

func ReqDeduplicate() func(s *Spider)

ReqDeduplicate is an extension that deduplicates new tasks.

func RobotsTxt Uses

func RobotsTxt(baseUrl, ua string) func(s *Spider)

RobotsTxt is an extension that parses robots.txt and follows its rules.

func UrlFilter Uses

func UrlFilter(str string) func(s *Spider)

UrlFilter is an extension that filters the URLs of new tasks by a regular expression.

type CacheManger Uses

type CacheManger struct {
    // contains filtered or unexported fields
}

func NewCacheManger Uses

func NewCacheManger() *CacheManger

func (*CacheManger) Get Uses

func (cm *CacheManger) Get(k string) (interface{}, bool)

func (*CacheManger) GetAndSet Uses

func (cm *CacheManger) GetAndSet(k string, exp time.Duration, updateFunc func() interface{}) interface{}

func (*CacheManger) MustGet Uses

func (cm *CacheManger) MustGet(k string) interface{}

func (*CacheManger) Set Uses

func (cm *CacheManger) Set(k string, exp time.Duration, updateFunc func() interface{}) interface{}

type Context Uses

type Context struct {
    Text string                 // the response text
    Html *goquery.Document      // spider will try to parse the response as html
    Json map[string]interface{} // spider will try to parse the response as json

    Request  *Request  // origin request
    Response *Response // a response object

    Tasks []*Task                // the new request task which will send to the spider
    Items []interface{}          // the new result data which will send to the spider,use to store
    Meta  map[string]interface{} // the request task created by NewTaskWithMeta func will have a k-y pair
    // contains filtered or unexported fields
}

Context is a wrapper around the response, the original request, new tasks, etc.

func (*Context) AddItem Uses

func (c *Context) AddItem(i interface{})

AddItem adds an item to the new item list. After every handler func returns, the spider will collect these items and call the OnItem handler funcs.

func (*Context) AddTask Uses

func (c *Context) AddTask(r *Task)

AddTask adds a task to the new task list. After every handler func returns, the spider will collect these tasks.

func (*Context) Drop Uses

func (c *Context) Drop()

Drop this context to break the handler chain and stop handling

func (*Context) IsDrop Uses

func (c *Context) IsDrop() bool

IsDrop reports whether the context was dropped.

func (*Context) NewTask Uses

func (c *Context) NewTask(req *Request, RespHandler ...func(ctx *Context))

NewTask creates a task and adds it to the new task list. After every handler func returns, the spider will collect these tasks.

func (*Context) NewTaskWithMeta Uses

func (c *Context) NewTaskWithMeta(req *Request, meta map[string]interface{}, RespHandler ...func(ctx *Context))

NewTaskWithMeta creates a task with meta data and adds it to the new task list. After every handler func returns, the spider will collect these tasks.

type HttpErr Uses

type HttpErr struct {
    Request *Request
    // contains filtered or unexported fields
}

HttpErr is the type of a downloader error.

type PostDataType Uses

type PostDataType int

PostDataType is the type of Content-Type

const (
    // TextPostData  text/plain
    TextPostData PostDataType = iota
    // UrlencodedPostData  application/x-www-form-urlencoded
    UrlencodedPostData
    // JsonPostData  application/json
    JsonPostData
)

type Request Uses

type Request struct {
    Url    *url.URL
    Method string
    Cookie []*http.Cookie
    Header http.Header
    Body   []byte
    Proxy  string
}

Request is request struct

func MustNewGetReq Uses

func MustNewGetReq(rawurl string) *Request

MustNewGetReq creates a new GET request and panics if an error occurs.

func MustNewPostReq Uses

func MustNewPostReq(rawurl string, datatype PostDataType, rawdata interface{}) *Request

MustNewPostReq creates a new POST request and panics if an error occurs.

func NewGetReq Uses

func NewGetReq(rawurl string) (*Request, error)

NewGetReq creates a new GET request.

func NewPostReq Uses

func NewPostReq(rawurl string, datatype PostDataType, rawdata interface{}) (*Request, error)

NewPostReq creates a new POST request.

func NewRequest Uses

func NewRequest() *Request

NewRequest creates a new request.

func (*Request) AddCookie Uses

func (r *Request) AddCookie(k, v string) *Request

AddCookie adds a cookie to the request.

func (*Request) SetBody Uses

func (r *Request) SetBody(body []byte) *Request

SetBody sets the body data of the request.

func (*Request) SetHeader Uses

func (r *Request) SetHeader(k, v string) *Request

SetHeader sets a header of the request.

func (*Request) WithProxy Uses

func (r *Request) WithProxy(proxy string) *Request

WithProxy sets the proxy of the request.

type Response Uses

type Response struct {
    Url    *url.URL
    Status int
    Header http.Header
    Body   []byte

    Request      *Request
    HttpResponse *http.Response

    Text string
    Html *goquery.Document
    Json map[string]interface{}
}

Response is response struct

func Download Uses

func Download(r *Request) (*Response, error)

Download performs a request and returns the response and an error.

type Spider Uses

type Spider struct {
    ThreadPoolSize uint64
    DepthFirst     bool
    Downloader     func(r *Request) (*Response, error)

    Cache *CacheManger
    // contains filtered or unexported fields
}

Spider is the core spider struct

func NewSpider Uses

func NewSpider(exts ...func(s *Spider)) *Spider

NewSpider creates a new spider and runs the extension funcs to configure it.

func (*Spider) AddTask Uses

func (s *Spider) AddTask(ctx *Context, t *Task)

AddTask adds a task to the queue.

func (*Spider) NewTask Uses

func (s *Spider) NewTask(req *Request, RespHandler ...func(ctx *Context))

NewTask creates a task and adds it to the queue.

func (*Spider) NewTaskWithMeta Uses

func (s *Spider) NewTaskWithMeta(req *Request, meta map[string]interface{}, RespHandler ...func(ctx *Context))

NewTaskWithMeta creates a task with meta data and adds it to the queue.

func (*Spider) OnError Uses

func (s *Spider) OnError(h func(ctx *Context, err error))

OnError adds an on-error handler func to the spider.

func (*Spider) OnItem Uses

func (s *Spider) OnItem(h func(ctx *Context, i interface{}) interface{})

OnItem adds an on-new-item handler func to the spider, e.g. for storing items.

func (*Spider) OnResp Uses

func (s *Spider) OnResp(h func(ctx *Context))

OnResp adds an on-response handler func to the spider.

func (*Spider) OnTask Uses

func (s *Spider) OnTask(h func(ctx *Context, t *Task) *Task)

OnTask adds an on-new-task handler func to the spider.

func (*Spider) Run Uses

func (s *Spider) Run()

Run runs the spider and waits until all tasks are done.

type Task Uses

type Task struct {
    Request *Request

    Meta map[string]interface{}
    // contains filtered or unexported fields
}

Task is a wrapper around a request and its handler funcs.

func NewTask Uses

func NewTask(req *Request, RespHandler ...func(ctx *Context)) *Task

NewTask creates a new task.

type TaskQueue Uses

type TaskQueue struct {
    sync.Mutex
    // contains filtered or unexported fields
}

TaskQueue is a queue of tasks.

func NewTaskQueue Uses

func NewTaskQueue() *TaskQueue

NewTaskQueue creates a new queue.

func (*TaskQueue) IsEmpty Uses

func (s *TaskQueue) IsEmpty() bool

IsEmpty returns true if the queue is empty.

func (*TaskQueue) Pop Uses

func (s *TaskQueue) Pop() *Task

Pop a task from the queue

func (*TaskQueue) Push Uses

func (s *TaskQueue) Push(item *Task)

Push a task to the queue

func (*TaskQueue) PushInHead Uses

func (s *TaskQueue) PushInHead(item *Task)

PushInHead pushes a task to the head of the queue.

Package goribot imports 17 packages (graph). Updated 2019-11-13. Refresh now. Tools for package owners.