goribot: github.com/zhshch2002/goribot Index | Files | Directories

package goribot

import "github.com/zhshch2002/goribot"

Index

Package Files

context.go extensions.go goribot.go limiter.go manager.go net.go scheduler.go tools.go

Constants

❖
const DeduplicateSuffix = "_deduplicate"
❖
const ItemsSuffix = "_items"
❖
const TasksSuffix = "_tasks"

Variables

❖
var D = NewBaseDownloader()
❖
var Do = D.Do
❖
var ErrRunFinishedSpider = errors.New("running a spider which is finished,you could recreate this spider and run the new one")
❖
var GetReq = Get

Deprecated: will be removed in the next major version.

❖
var Log = logging.MustGetLogger("goribot")
❖
var PostReq = Post

Deprecated: will be removed in the next major version.

func AddCookieToJar Uses

❖
func AddCookieToJar(urlAddr string, cookies ...*http.Cookie) func(s *Spider)

AddCookieToJar is an extension that adds cookies to the downloader's cookie jar.

func GetRequestHash Uses

❖
func GetRequestHash(r *Request) [md5.Size]byte

GetRequestHash returns a hash of the URL, header, cookie and body data of a request.

func Limiter Uses

❖
func Limiter(WhiteList bool, rules ...*LimitRule) func(s *Spider)

func RandomProxy Uses

❖
func RandomProxy(p ...string) func(s *Spider)

RandomProxy is an extension that sets a random proxy URL for each new task.

func RandomUserAgent Uses

❖
func RandomUserAgent() func(s *Spider)

RandomUserAgent is an extension that sets a random User-Agent for each new task.

func RedisDistributed Uses

❖
func RedisDistributed(ro *redis.Options, sName string, useDeduplicate bool, onSeedHandler CtxHandlerFun) func(s *Spider)

func RedisReqDeduplicate Uses

❖
func RedisReqDeduplicate(r *redis.Client, sName string) func(s *Spider)

RedisReqDeduplicate is an extension that deduplicates new tasks using Redis, to support distributed crawling.

func RefererFiller Uses

❖
func RefererFiller() func(s *Spider)

RefererFiller is an extension that adds a Referer to each new task.

func ReqDeduplicate Uses

❖
func ReqDeduplicate() func(s *Spider)

ReqDeduplicate is an extension that deduplicates new tasks.

func Retry Uses

❖
func Retry(maxTimes int, okcode ...int) func(s *Spider)

Retry is an extension that makes a new request when a response comes back with an error.

func RobotsTxt Uses

❖
func RobotsTxt(baseUrl, ua string) func(s *Spider)

RobotsTxt is an extension that parses robots.txt and follows its rules.

func SaveItemsAsCSV Uses

❖
func SaveItemsAsCSV(f *os.File) func(s *Spider)

SaveItemsAsCSV is an extension that saves items to a CSV file.

func SaveItemsAsJSON Uses

❖
func SaveItemsAsJSON(f *os.File) func(s *Spider)

SaveItemsAsJSON is an extension that saves items to a JSON file.

func SetDepthFirst Uses

❖
func SetDepthFirst(d bool) func(s *Spider)

SetDepthFirst is an extension that changes the Scheduler's DepthFirst setting.

func SpiderLogError Uses

❖
func SpiderLogError(f *os.File) func(s *Spider)

SpiderLogError is an extension that logs special or error responses.

func SpiderLogPrint Uses

❖
func SpiderLogPrint() func(s *Spider)

SpiderLogPrint is an extension that prints the spider's working status.

type BaseDownloader Uses

❖
type BaseDownloader struct {
    Client *http.Client
    // contains filtered or unexported fields
}

BaseDownloader is default downloader of goribot

func NewBaseDownloader Uses

❖
func NewBaseDownloader() *BaseDownloader

func (*BaseDownloader) AddMiddleware Uses

❖
func (s *BaseDownloader) AddMiddleware(fn func(req *Request, next func(*Request) (*Response, error)) (*Response, error))

func (*BaseDownloader) Do Uses

❖
func (s *BaseDownloader) Do(req *Request) (resp *Response, err error)

type BaseScheduler Uses

❖
type BaseScheduler struct {

    // DepthFirst sets push new tasks to the top of the queue
    DepthFirst bool
    // contains filtered or unexported fields
}

BaseScheduler is the default scheduler of goribot.

func NewBaseScheduler Uses

❖
func NewBaseScheduler(depthFirst bool) *BaseScheduler

func (*BaseScheduler) AddItem Uses

❖
func (s *BaseScheduler) AddItem(i interface{})

func (*BaseScheduler) AddTask Uses

❖
func (s *BaseScheduler) AddTask(t *Task)

func (*BaseScheduler) GetItem Uses

❖
func (s *BaseScheduler) GetItem() interface{}

func (*BaseScheduler) GetTask Uses

❖
func (s *BaseScheduler) GetTask() *Task

func (*BaseScheduler) IsItemEmpty Uses

❖
func (s *BaseScheduler) IsItemEmpty() bool

func (*BaseScheduler) IsTaskEmpty Uses

❖
func (s *BaseScheduler) IsTaskEmpty() bool

type Context Uses

❖
type Context struct {
    // Req is the origin request
    Req *Request
    // Resp is the response object
    Resp *Response

    // Meta: the request task created by NewTaskWithMeta func will have a k-v pair
    Meta map[string]interface{}

    Handlers []CtxHandlerFun
    // contains filtered or unexported fields
}

Context is a wrap of response,origin request,new task,etc

func (*Context) Abort Uses

❖
func (c *Context) Abort()

Abort this context to break the handler chain and stop handling

func (*Context) AddItem Uses

❖
func (c *Context) AddItem(i interface{})

AddItem add an item to new item list. After every handler func return, spider will collect these items and call OnItem handler func

func (*Context) AddTask Uses

❖
func (c *Context) AddTask(request *Request, handlers ...CtxHandlerFun)

AddTask add a task to new task list. After every handler func return,spider will collect these tasks

func (*Context) IsAborted Uses

❖
func (c *Context) IsAborted() bool

IsAborted reports whether the context has been aborted.

type CsvItem Uses

❖
type CsvItem []string

type CtxHandlerFun Uses

❖
type CtxHandlerFun func(ctx *Context)

type Downloader Uses

❖
type Downloader interface {
    Do(req *Request) (resp *Response, err error)
    AddMiddleware(func(req *Request, next func(req *Request) (resp *Response, err error)) (resp *Response, err error))
}

Downloader downloads the response for a request.

type DownloaderErr Uses

❖
type DownloaderErr struct {

    // Request is the Request object when the error occurred
    Request *Request
    // Response is the Response object when the error occurred. It could be nil.
    Response *Response
    // contains filtered or unexported fields
}

DownloaderErr is an error created by a Downloader.

type ErrorItem Uses

❖
type ErrorItem struct {
    Ctx *Context
    Msg string
}

type JsonItem Uses

❖
type JsonItem struct {
    Data interface{}
}

type LimitRule Uses

❖
type LimitRule struct {
    Regexp, Glob string
    Allow        LimitRuleAllow
    Parallelism  int64

    Rate int64

    Delay       time.Duration
    RandomDelay time.Duration
    MaxReq      int64

    MaxDepth int64
    // contains filtered or unexported fields
}

func (*LimitRule) Match Uses

❖
func (s *LimitRule) Match(u *url.URL) bool

type LimitRuleAllow Uses

❖
type LimitRuleAllow uint8
❖
const (
    NotSet LimitRuleAllow = iota
    Allow
    Disallow
)

type Manager Uses

❖
type Manager struct {
    // contains filtered or unexported fields
}

func NewManager Uses

❖
func NewManager(redis *redis.Client, sName string) *Manager

func (*Manager) GetItem Uses

❖
func (s *Manager) GetItem() interface{}

func (*Manager) OnItem Uses

❖
func (s *Manager) OnItem(fn func(i interface{}) interface{})

func (*Manager) Run Uses

❖
func (s *Manager) Run()

func (*Manager) SendReq Uses

❖
func (s *Manager) SendReq(req *Request)

func (*Manager) SetItemPoolSize Uses

❖
func (s *Manager) SetItemPoolSize(i int)

type RedisScheduler Uses

❖
type RedisScheduler struct {
    // contains filtered or unexported fields
}

RedisScheduler is a scheduler of goribot that is constructed with a Redis client (see NewRedisScheduler).

func NewRedisScheduler Uses

❖
func NewRedisScheduler(redis *redis.Client, sName string, bs int, fn ...CtxHandlerFun) *RedisScheduler

func (*RedisScheduler) AddItem Uses

❖
func (s *RedisScheduler) AddItem(i interface{})

func (*RedisScheduler) AddTask Uses

❖
func (s *RedisScheduler) AddTask(t *Task)

func (*RedisScheduler) GetItem Uses

❖
func (s *RedisScheduler) GetItem() interface{}

func (*RedisScheduler) GetTask Uses

❖
func (s *RedisScheduler) GetTask() *Task

func (*RedisScheduler) IsItemEmpty Uses

❖
func (s *RedisScheduler) IsItemEmpty() bool

func (*RedisScheduler) IsTaskEmpty Uses

❖
func (s *RedisScheduler) IsTaskEmpty() bool

type Request Uses

❖
type Request struct {
    *http.Request
    Depth int
    // ResponseCharacterEncoding is the character encoding of the response body.
    // Leave it blank to allow automatic character encoding of the response body.
    // It is empty by default and it can be set in OnRequest callback.
    ResponseCharacterEncoding string
    // ProxyURL is the proxy address that handles the request
    ProxyURL string
    // Meta contains data between a Request and a Response
    Meta map[string]interface{}
    Err  error
    // contains filtered or unexported fields
}

Request is an object representing an HTTP request.

func Get Uses

❖
func Get(urladdr string) *Request

Get creates a get request

func Post Uses

❖
func Post(urladdr string, body io.Reader) *Request

Post creates a post request

func PostFormReq Uses

❖
func PostFormReq(urladdr string, requestData map[string]string) *Request

PostFormReq creates a post request with form data

func PostJsonReq Uses

❖
func PostJsonReq(urladdr string, requestData interface{}) *Request

PostJsonReq creates a post request with json data

func PostRawReq Uses

❖
func PostRawReq(urladdr string, body []byte) *Request

PostRawReq creates a post request with raw data

func (*Request) AddCookie Uses

❖
func (s *Request) AddCookie(c *http.Cookie) *Request

AddCookie adds a cookie to the request.

func (*Request) AddParam Uses

❖
func (s *Request) AddParam(k, v string) *Request

AddParam adds a query param of request url.

func (*Request) GetBody Uses

❖
func (s *Request) GetBody() []byte

GetBody returns the body as bytes of request

func (*Request) SetHeader Uses

❖
func (s *Request) SetHeader(key, value string) *Request

SetHeader sets the header entries associated with key to the single element value.

func (*Request) SetParam Uses

❖
func (s *Request) SetParam(p map[string]string) *Request

SetParam sets the query params of the request URL. Deprecated: will be removed in the next major version.

func (*Request) SetProxy Uses

❖
func (s *Request) SetProxy(p string) *Request

SetProxy sets proxy url of request.

func (*Request) SetUA Uses

❖
func (s *Request) SetUA(ua string) *Request

SetUA sets the User-Agent of the request header.

func (*Request) WithMeta Uses

❖
func (s *Request) WithMeta(k string, v interface{}) *Request

WithMeta sets a key-value entry in the meta data of the request.

type Response Uses

❖
type Response struct {
    *http.Response
    // Body is the content of the Response
    Body []byte
    // Text is the content of the Response parsed as string
    Text string
    // Req is the goribot Request object of the response. Tip: there is another Request attribute that comes from the embedded *http.Response
    Req *Request
    // Dom is the parsed html object
    Dom *goquery.Document
    // Meta contains data between a Request and a Response
    Meta map[string]interface{}
}

Response is an object representing an HTTP response.

func (*Response) DecodeAndParse Uses

❖
func (s *Response) DecodeAndParse() error

DecodeAndParse decodes the body to text and tries to parse it as HTML or JSON.

func (*Response) IsHTML Uses

❖
func (s *Response) IsHTML() bool

func (*Response) IsJSON Uses

❖
func (s *Response) IsJSON() bool

func (*Response) Json Uses

❖
func (s *Response) Json(q string) gjson.Result

Json returns json result parsed from response

type Scheduler Uses

❖
type Scheduler interface {
    // GetTask pops a task
    GetTask() *Task
    // GetItem pops a item
    GetItem() interface{}

    // AddTask push a task
    AddTask(t *Task)
    // AddItem push a item
    AddItem(i interface{})

    // IsTaskEmpty returns is tasks queue empty
    IsTaskEmpty() bool
    // IsItemEmpty returns is items queue empty
    IsItemEmpty() bool
}

Scheduler is a queue of tasks and items

type Spider Uses

❖
type Spider struct {
    Scheduler  Scheduler
    Downloader Downloader
    AutoStop   bool
    // contains filtered or unexported fields
}

func NewSpider Uses

❖
func NewSpider(exts ...func(s *Spider)) *Spider

func (*Spider) AddTask Uses

❖
func (s *Spider) AddTask(request *Request, handlers ...CtxHandlerFun)

func (*Spider) OnAdd Uses

❖
func (s *Spider) OnAdd(fn func(ctx *Context, t *Task) *Task)

***********************************************************************************

func (*Spider) OnError Uses

❖
func (s *Spider) OnError(fn func(ctx *Context, err error))

***********************************************************************************

func (*Spider) OnFinish Uses

❖
func (s *Spider) OnFinish(fn func(s *Spider))

***********************************************************************************

func (*Spider) OnHTML Uses

❖
func (s *Spider) OnHTML(selector string, fn func(ctx *Context, sel *goquery.Selection))

func (*Spider) OnItem Uses

❖
func (s *Spider) OnItem(fn func(i interface{}) interface{})

***********************************************************************************

func (*Spider) OnJSON Uses

❖
func (s *Spider) OnJSON(q string, fn func(ctx *Context, j gjson.Result))

func (*Spider) OnReq Uses

❖
func (s *Spider) OnReq(fn func(ctx *Context, req *Request) *Request)

***********************************************************************************

func (*Spider) OnResp Uses

❖
func (s *Spider) OnResp(fn CtxHandlerFun)

***********************************************************************************

func (*Spider) OnStart Uses

❖
func (s *Spider) OnStart(fn func(s *Spider))

***********************************************************************************

func (*Spider) Run Uses

❖
func (s *Spider) Run()

func (*Spider) SetItemPoolSize Uses

❖
func (s *Spider) SetItemPoolSize(i int)

func (*Spider) SetTaskPoolSize Uses

❖
func (s *Spider) SetTaskPoolSize(i int)

func (*Spider) Use Uses

❖
func (s *Spider) Use(fn ...func(s *Spider))

type Task Uses

❖
type Task struct {
    Request  *Request
    Handlers []CtxHandlerFun
}

func NewTask Uses

❖
func NewTask(request *Request, handlers ...CtxHandlerFun) *Task

Directories

PathSynopsis
_examples

Package goribot imports 32 packages (graph) and is imported by 1 packages. Updated 2020-07-04. Refresh now. Tools for package owners.