spider

package
v0.0.0-...-a4d71d1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 10, 2018 License: MIT Imports: 17 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// Sentinel errors concerning task rules. Each error's message states the
// condition it reports.
// NOTE(review): which function returns which error is not visible on this
// page — presumably Register and GetTaskRule; confirm against the source.
var (
	ErrTaskRuleNotExist        = errors.New("task rule not exist")
	ErrTaskRuleIsNil           = errors.New("task rule is nil")
	ErrTaskRuleNameIsEmpty     = errors.New("task rule name is empty")
	ErrTaskRuleNameDuplicated  = errors.New("task rule name is Duplicated")
	ErrTaskRuleHeadIsNil       = errors.New("task rule head is nil")
	ErrTaskRuleNodesLenInvalid = errors.New("task rule nodes len is invalid")
	ErrTaskRuleNodesKeyInvalid = errors.New("task rule nodes key should start from 0 and monotonically increasing")
)
View Source
// ErrOutputFieldsNotMatchOutputRow reports, per its message, that the output
// fields do not match an output row.
// NOTE(review): presumably returned by Context.Output when a row disagrees
// with TaskRule.OutputFields — confirm against the source.
var (
	ErrOutputFieldsNotMatchOutputRow = errors.New("output fields not match out put row")
)
View Source
// ErrTaskRunningTimeout reports, per its message, that a running task timed
// out.
// NOTE(review): presumably returned by Run; the timeout value is not visible
// on this page — confirm against the source.
var (
	ErrTaskRunningTimeout = errors.New("task running timeout")
)

Functions

func GetTaskRuleKeys

func GetTaskRuleKeys() []string

func Register

func Register(rule *TaskRule)

func Run

func Run(task *Task, retCh chan<- common.MTS) error

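TODO: Add key-value support to Context, along with the ability to terminate the request chain. TODO: Work out how to abort the subsequent crawling steps when an error occurs.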

Types

type CSVConf

// CSVConf holds the CSV settings carried by OutputConfig: a file path and a
// file name.
// NOTE(review): how the path and the name are combined is not shown here.
type CSVConf struct {
	CSVFilePath string
	CSVFileName string
}

type Context

// Context is the value passed to Rule.Head and to every Node callback. It
// provides Output together with the Visit/Post families of request methods;
// all of its fields are unexported.
type Context struct {
	// contains filtered or unexported fields
}

func (*Context) Output

func (ctx *Context) Output(row map[int]interface{}) error

func (*Context) Post

func (ctx *Context) Post(URL string, requestData map[string]string) error

func (*Context) PostForNext

func (ctx *Context) PostForNext(URL string, requestData map[string]string) error

func (*Context) PostMultipartForNext

func (ctx *Context) PostMultipartForNext(URL string, requestData map[string][]byte) error

func (*Context) PostRawForNext

func (ctx *Context) PostRawForNext(URL string, requestData []byte) error

func (*Context) Visit

func (ctx *Context) Visit(URL string) error

func (*Context) VisitForNext

func (ctx *Context) VisitForNext(URL string) error

type HTMLElement

// HTMLElement is the element type received by Node.OnHTML callbacks. It
// exposes a name, a text value, pointers to a Request and a Response, and a
// goquery selection in DOM.
// NOTE(review): presumably Request/Response are the exchange that produced
// the element — confirm against the source.
type HTMLElement struct {
	Name     string
	Text     string
	Request  *Request
	Response *Response
	DOM      *goquery.Selection
	// contains filtered or unexported fields
}

func (*HTMLElement) Attr

func (h *HTMLElement) Attr(k string) string

func (*HTMLElement) ChildAttr

func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string

func (*HTMLElement) ChildAttrs

func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string

func (*HTMLElement) ChildText

func (h *HTMLElement) ChildText(goquerySelector string) string

func (*HTMLElement) ForEach

func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement))

type Limit

// Limit describes request throttling for domains selected by DomainRegexp or
// DomainGlob: a delay between requests, an extra random delay, and a cap on
// concurrent requests.
type Limit struct {
	// NOTE(review): presumably toggles whether this limit is applied — confirm.
	Enable bool
	// DomainRegexp is a regular expression to match against domains
	DomainRegexp string
	// DomainGlob is a glob pattern to match against domains
	DomainGlob string
	// Delay is the duration to wait before creating a new request to the matching domains
	Delay time.Duration
	// RandomDelay is the extra randomized duration to wait added to Delay before creating a new request
	RandomDelay time.Duration
	// Parallelism is the number of the maximum allowed concurrent requests of the matching domains
	Parallelism int
}

type MySQLConf

// MySQLConf holds the MySQL settings carried by OutputConfig: host, port,
// user, password and database name.
type MySQLConf struct {
	Host     string
	Port     int
	User     string
	Password string
	DBName   string
}

type Node

// Node groups the callbacks for one entry of Rule.Nodes. Every callback
// receives the Context; all except OnRequest return an error.
// NOTE(review): the string keys of OnHTML and OnXML are presumably a goquery
// selector and an XPath query respectively — confirm against the source.
type Node struct {
	OnRequest  func(ctx *Context, req *Request)
	OnError    func(ctx *Context, res *Response, err error) error
	OnResponse func(ctx *Context, res *Response) error
	OnHTML     map[string]func(ctx *Context, el *HTMLElement) error
	OnXML      map[string]func(ctx *Context, el *XMLElement) error
	OnScraped  func(ctx *Context, res *Response) error
}

type Option

// Option holds the crawler settings stored in TaskConfig.Option.
// NOTE(review): DisableCookies, AllowURLRevisit, IgnoreRobotsTxt and
// ParseHTTPErrorResponse also exist on TaskRule; which value takes precedence
// is not visible here — confirm against the source.
type Option struct {
	UserAgent              string
	MaxDepth               int
	AllowedDomains         []string
	URLFilters             []*regexp.Regexp
	AllowURLRevisit        bool
	MaxBodySize            int
	IgnoreRobotsTxt        bool
	ParseHTTPErrorResponse bool
	DisableCookies         bool
}

type OutputConfig

// OutputConfig groups a Type string with the CSV and MySQL output settings.
// NOTE(review): presumably Type selects which of CSVConf/MySQLConf is used;
// the accepted values are not visible here — confirm against the source.
type OutputConfig struct {
	Type      string
	CSVConf   CSVConf
	MySQLConf MySQLConf
}

type Request

// Request describes an HTTP request by its URL, headers, method and body. It
// is the type passed to Node.OnRequest and referenced from Response,
// HTMLElement and XMLElement.
type Request struct {
	URL     *url.URL
	Headers *http.Header
	Method  string
	Body    io.Reader
	// contains filtered or unexported fields
}

func (*Request) Abort

func (r *Request) Abort()

func (*Request) AbsoluteURL

func (r *Request) AbsoluteURL(u string) string

func (*Request) GetAnyReqContextValue

func (r *Request) GetAnyReqContextValue(key string) interface{}

func (*Request) GetReqContextValue

func (r *Request) GetReqContextValue(key string) string

func (*Request) Post

func (r *Request) Post(URL string, requestData map[string]string) error

func (*Request) PostForNext

func (r *Request) PostForNext(URL string, requestData map[string]string) error

func (*Request) PostForNextWithContext

func (r *Request) PostForNextWithContext(URL string, requestData map[string]string) error

func (*Request) PostMultipart

func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error

func (*Request) PostMultipartForNext

func (r *Request) PostMultipartForNext(URL string, requestData map[string][]byte) error

func (*Request) PostRaw

func (r *Request) PostRaw(URL string, requestData []byte) error

func (*Request) PostRawForNext

func (r *Request) PostRawForNext(URL string, requestData []byte) error

func (*Request) PostRawForNextWithContext

func (r *Request) PostRawForNextWithContext(URL string, requestData []byte) error

func (*Request) PutReqContextValue

func (r *Request) PutReqContextValue(key string, value interface{})

func (*Request) Retry

func (r *Request) Retry() error

func (*Request) SetResponseCharacterEncoding

func (r *Request) SetResponseCharacterEncoding(encoding string)

func (*Request) Visit

func (r *Request) Visit(URL string) error

func (*Request) VisitForNext

func (r *Request) VisitForNext(URL string) error

func (*Request) VisitForNextWithContext

func (r *Request) VisitForNextWithContext(URL string) error

type Response

// Response carries a status code, a body as bytes, headers, and a pointer to
// a Request. It is the type passed to Node.OnError, Node.OnResponse and
// Node.OnScraped.
// NOTE(review): presumably Request is the request that produced this
// response — confirm against the source.
type Response struct {
	StatusCode int
	Body       []byte
	Request    *Request
	Headers    *http.Header
	// contains filtered or unexported fields
}

func (*Response) FileName

func (res *Response) FileName() string

func (*Response) Save

func (res *Response) Save(fileName string) error

type Rule

// Rule consists of a Head function that receives the Context and a set of
// Nodes keyed by int. Per the message of ErrTaskRuleNodesKeyInvalid, node keys
// should start from 0 and increase monotonically.
type Rule struct {
	Head  func(ctx *Context) error
	Nodes map[int]*Node
}

type Task

// Task pairs an ID with an embedded TaskRule and an embedded TaskConfig, so
// the fields of both are promoted onto Task. NewTask accepts the same three
// values, and Run takes a *Task.
type Task struct {
	ID uint64
	TaskRule
	TaskConfig
}

func NewTask

func NewTask(id uint64, rule TaskRule, config TaskConfig) *Task

type TaskConfig

// TaskConfig is the configuration embedded in Task: a cron spec string, the
// crawler Option, the request Limit, a list of proxy URLs and the
// OutputConfig.
// NOTE(review): the expected CronSpec syntax is not visible here — confirm
// against the source.
type TaskConfig struct {
	CronSpec     string
	Option       Option
	Limit        Limit
	ProxyURLs    []string
	OutputConfig OutputConfig
}

type TaskRule

// TaskRule is the type accepted by Register and returned by GetTaskRule. It
// carries a name, description and namespace, the output field names, four
// boolean crawler flags and the Rule itself.
// NOTE(review): the four flags share names with fields of Option; how the two
// are reconciled is not visible here — confirm against the source.
type TaskRule struct {
	Name                   string
	Description            string
	Namespace              string
	OutputFields           []string
	DisableCookies         bool
	AllowURLRevisit        bool
	IgnoreRobotsTxt        bool
	ParseHTTPErrorResponse bool
	Rule                   *Rule
}

func GetTaskRule

func GetTaskRule(ruleName string) (*TaskRule, error)

type XMLElement

// XMLElement is the element type received by Node.OnXML callbacks. It exposes
// a name, a text value, pointers to a Request and a Response, and an untyped
// DOM value.
// NOTE(review): the concrete type stored in DOM is not visible here — confirm
// against the source before type-asserting it.
type XMLElement struct {
	Name     string
	Text     string
	Request  *Request
	Response *Response
	DOM      interface{}
	// contains filtered or unexported fields
}

func (*XMLElement) Attr

func (x *XMLElement) Attr(k string) string

func (*XMLElement) ChildAttr

func (x *XMLElement) ChildAttr(xpathQuery, attrName string) string

func (*XMLElement) ChildAttrs

func (x *XMLElement) ChildAttrs(xpathQuery, attrName string) []string

func (*XMLElement) ChildText

func (x *XMLElement) ChildText(xpathQuery string) string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL