spider

package
v0.0.0-...-58e3b27 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 14, 2020 License: MIT Imports: 26 Imported by: 6

Documentation

Index

Constants

This section is empty.

Variables

View Source
// Sentinel errors related to task rules and task execution.
var (
	// ErrTaskRuleNotExist is the error for a task rule that does not exist.
	ErrTaskRuleNotExist = errors.New("task rule not exist")
	// ErrTaskRuleIsNil is the error thrown when a nil rule is registered.
	ErrTaskRuleIsNil = errors.New("task rule is nil")
	// ErrTaskRuleNameIsEmpty is the error thrown when the rule name is empty.
	ErrTaskRuleNameIsEmpty = errors.New("task rule name is empty")
	// ErrTaskRuleNameDuplicated is the error thrown if the rule name is duplicated.
	ErrTaskRuleNameDuplicated = errors.New("task rule name is Duplicated")
	// ErrTaskRuleHeadIsNil is the error thrown if the rule's head is nil.
	ErrTaskRuleHeadIsNil = errors.New("task rule head is nil")
	// ErrTaskRuleNodesLenInvalid is the error thrown if the length of the rule's nodes is invalid.
	ErrTaskRuleNodesLenInvalid = errors.New("task rule nodes len is invalid")
	// ErrTaskRuleNodesKeyInvalid is the error thrown if the rule's node keys do not
	// start from 0 and increase monotonically.
	ErrTaskRuleNodesKeyInvalid = errors.New("task rule nodes key should start from 0 and monotonically increasing")
	// ErrTaskRunningTimeout is the error for a task whose run timed out.
	ErrTaskRunningTimeout = errors.New("task running timeout")
)
View Source
// Sentinel errors related to task output.
var (
	// ErrOutputFieldsNotMatchOutputRow is the error for output fields that do not match the output row.
	ErrOutputFieldsNotMatchOutputRow = errors.New("output fields not match out put row")
	// ErrTooManyOutputNamespace is the error for too many output namespaces.
	ErrTooManyOutputNamespace = errors.New("too many output namespace")
	// ErrOutputToMultipleTableDisabled is the error thrown if "OutputToMultipleTable" is false.
	// NOTE(review): the flag visible on TaskRule is OutputToMultipleNamespace — confirm which one is meant.
	ErrOutputToMultipleTableDisabled = errors.New("output to multiple tables disabled")
	// ErrOutputTypeNotSupported is the error for an unknown output type.
	ErrOutputTypeNotSupported = errors.New("output type not supported")
	// ErrMultConfNamespaceNotFound is the error for a mult conf namespace that is not found.
	ErrMultConfNamespaceNotFound = errors.New("mult conf namespace not found")
	// ErrOutputParamNotSupported is the error for an unknown output param.
	// The message has no trailing whitespace so it reads cleanly when wrapped or logged.
	ErrOutputParamNotSupported = errors.New("output param not supported")
)

Functions

func AutoMigrateHack

func AutoMigrateHack(s *gorm.DB, rule *TaskRule) *gorm.DB

AutoMigrateHack auto create table of the rule

func CancelTask

func CancelTask(taskID uint64) bool

CancelTask cancel a task by taskID

func GetTaskRuleKeys

func GetTaskRuleKeys() []string

GetTaskRuleKeys return all keys of task rule

func NewConstraints

func NewConstraints(columns []string, sizeOrSQLConstraint ...interface{}) (constraints map[string]*OutputConstraint)

NewConstraints is the convenience func to return the custom constraints

func NewSQLString

func NewSQLString(size int, defaultValue ...string) (sql string)

NewSQLString is the convenience func to return varchar sql string

func NewStringsConstraints

func NewStringsConstraints(columns []string, size ...int) (constraints map[string]*OutputConstraint)

NewStringsConstraints is the convenience func to return varchar sql string of a batch columns

func Register

func Register(rule *TaskRule)

Register registers a task rule

Types

type CSVConf

// CSVConf is the CSV configuration of a task; it is carried by OutputConfig.
type CSVConf struct {
	// CSVFilePath is presumably the path of the CSV file used for output — confirm against the output code.
	CSVFilePath string
}

CSVConf is the csv conf of a task

type Context

// Context is the gospider context handed to each Rule/Node callback.
// All of its state is unexported; it is used through its methods.
type Context struct {
	// contains filtered or unexported fields
}

Context gospider context of each callback

func (*Context) Abort

func (ctx *Context) Abort()

Abort abort the current request

func (*Context) AbsoluteURL

func (ctx *Context) AbsoluteURL(u string) string

AbsoluteURL return the absolute URL of u

func (*Context) GetAnyReqContextValue

func (ctx *Context) GetAnyReqContextValue(key string) interface{}

GetAnyReqContextValue return the interface value for a key on ctx

func (*Context) GetOutputDB

func (ctx *Context) GetOutputDB() *sql.DB

GetOutputDB get database of current context

func (*Context) GetReqContextValue

func (ctx *Context) GetReqContextValue(key string) string

GetReqContextValue return the string value for a key on ctx

func (*Context) GetRequest

func (ctx *Context) GetRequest() *Request

GetRequest return the request on this context

func (*Context) Output

func (ctx *Context) Output(value interface{}, namespace ...string) error

func (*Context) OutputCustom

func (ctx *Context) OutputCustom(o Outputer, namespace ...string) error

OutputCustom outputs data through a user-supplied Outputer

func (*Context) OutputDefault

func (ctx *Context) OutputDefault(row map[int]interface{}, namespace ...string) error

OutputDefault outputs a row of data by default

func (*Context) Post

func (ctx *Context) Post(URL string, requestData map[string]string) error

Post issues a POST to the specified URL

func (*Context) PostForNext

func (ctx *Context) PostForNext(URL string, requestData map[string]string) error

PostForNext issues a POST to the specified URL for next step

func (*Context) PostForNextWithContext

func (ctx *Context) PostForNextWithContext(URL string, requestData map[string]string) error

PostForNextWithContext issues a POST to the specified URL for next step with previous context

func (*Context) PostMultipartForNext

func (ctx *Context) PostMultipartForNext(URL string, requestData map[string][]byte) error

PostMultipartForNext issues a multipart POST to the specified URL for next step

func (*Context) PostRawForNext

func (ctx *Context) PostRawForNext(URL string, requestData []byte) error

PostRawForNext issues a rawData POST to the specified URL

func (*Context) PostRawForNextWithContext

func (ctx *Context) PostRawForNextWithContext(URL string, requestData []byte) error

PostRawForNextWithContext issues a rawData POST to the specified URL for next step with previous context

func (*Context) PostWithContext

func (ctx *Context) PostWithContext(URL string, requestData map[string]string) error

PostWithContext issues a POST to the specified URL with current context

func (*Context) PutReqContextValue

func (ctx *Context) PutReqContextValue(key string, value interface{})

PutReqContextValue sets the value for a key

func (*Context) Request

func (ctx *Context) Request(method, URL string, requestData io.Reader, hdr http.Header) error

Request low level method to send HTTP request

func (*Context) RequestForNext

func (ctx *Context) RequestForNext(method, URL string, requestData io.Reader, hdr http.Header) error

RequestForNext low level method to send HTTP request for next step

func (*Context) RequestForNextWithContext

func (ctx *Context) RequestForNextWithContext(method, URL string, requestData io.Reader, hdr http.Header) error

RequestForNextWithContext low level method to send HTTP request for next step with previous context

func (*Context) RequestWithContext

func (ctx *Context) RequestWithContext(method, URL string, requestData io.Reader, hdr http.Header) error

RequestWithContext low level method to send HTTP request with context

func (*Context) Retry

func (ctx *Context) Retry() error

Retry retry current request again

func (*Context) SetResponseCharacterEncoding

func (ctx *Context) SetResponseCharacterEncoding(encoding string)

SetResponseCharacterEncoding sets the response character encoding on the request

func (*Context) Visit

func (ctx *Context) Visit(URL string) error

Visit issues a GET to the specified URL

func (*Context) VisitForNext

func (ctx *Context) VisitForNext(URL string) error

VisitForNext issues a GET to the specified URL for next step

func (*Context) VisitForNextWithContext

func (ctx *Context) VisitForNextWithContext(URL string) error

VisitForNextWithContext issues a GET to the specified URL for next step with previous context

func (*Context) VisitWithContext

func (ctx *Context) VisitWithContext(URL string) error

VisitWithContext issues a GET to the specified URL with current context

type HTMLElement

// HTMLElement is the HTML element object passed to Node.OnHTML callbacks.
type HTMLElement struct {
	// Name and Text presumably hold the element's tag name and text content — confirm.
	Name     string
	Text     string
	// Request and Response are presumably the request/response pair that produced this element — confirm.
	Request  *Request
	Response *Response
	// DOM is the goquery selection associated with this element.
	DOM      *goquery.Selection
	// contains filtered or unexported fields
}

HTMLElement the html element object

func (*HTMLElement) Attr

func (h *HTMLElement) Attr(k string) string

Attr return the html element attr value

func (*HTMLElement) ChildAttr

func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string

ChildAttr the child attr value of h

func (*HTMLElement) ChildAttrs

func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string

ChildAttrs the child attr list of h

func (*HTMLElement) ChildText

func (h *HTMLElement) ChildText(goquerySelector string) string

ChildText the child text content of h

func (*HTMLElement) ForEach

func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement))

ForEach calls callback on each goquerySelector element

type Limit

// Limit is the request limit configuration of a task.
type Limit struct {
	// Enable presumably switches the limit on or off — confirm where it is read.
	Enable bool
	// DomainRegexp is a regular expression to match against domains
	DomainRegexp string
	// DomainGlob is a glob pattern to match against domains
	DomainGlob string
	// Delay is the duration to wait before creating a new request to the matching domains
	Delay time.Duration
	// RandomDelay is the extra randomized duration to wait added to Delay before creating a new request
	RandomDelay time.Duration
	// Parallelism is the number of the maximum allowed concurrent requests of the matching domains
	Parallelism int
}

Limit is the limit of a task

type MultipleNamespaceConf

// MultipleNamespaceConf is the multiple namespace conf.
// Its fields have the same names and types as the OutputFields, OutputConstraints
// and OutputTableOpts fields of TaskRule, which holds a map of these values.
type MultipleNamespaceConf struct {
	OutputFields      []string
	// OutputConstraints is keyed by string — presumably the column name; confirm.
	OutputConstraints map[string]*OutputConstraint
	OutputTableOpts   string
}

MultipleNamespaceConf is the multiple namespace conf

type Node

// Node is one rule node of a task: a set of optional callbacks, each receiving
// the current Context.
type Node struct {
	// OnRequest receives the Request; it has no error return.
	OnRequest  func(ctx *Context, req *Request)
	// OnError receives the Response together with the error that occurred.
	OnError    func(ctx *Context, res *Response, err error) error
	OnResponse func(ctx *Context, res *Response) error
	// OnHTML maps a string key (presumably a goquery selector — confirm) to an element callback.
	OnHTML     map[string]func(ctx *Context, el *HTMLElement) error
	// OnXML maps a string key (presumably an xpath query — confirm) to an element callback.
	OnXML      map[string]func(ctx *Context, el *XMLElement) error
	OnScraped  func(ctx *Context, res *Response) error
}

Node the rule node of a task

type Option

// Option is the config option of a task.
//
// NOTE(review): AllowURLRevisit, IgnoreRobotsTxt, InsecureSkipVerify,
// ParseHTTPErrorResponse and DisableCookies also exist on TaskRule; which one
// takes precedence is not visible here — confirm.
type Option struct {
	UserAgent              string
	MaxDepth               int
	AllowedDomains         []string
	// URLFilters holds compiled regular expressions; how they are applied is not visible here.
	URLFilters             []*regexp.Regexp
	AllowURLRevisit        bool
	// MaxBodySize has no unit visible here — presumably bytes; confirm.
	MaxBodySize            int
	IgnoreRobotsTxt        bool
	InsecureSkipVerify     bool
	ParseHTTPErrorResponse bool
	DisableCookies         bool
	RequestTimeout         time.Duration
}

Option is the config option of a task

type OutputConfig

// OutputConfig is the output config of a task.
type OutputConfig struct {
	// Type names the output type; the accepted values are not visible here.
	// ErrOutputTypeNotSupported is declared for unknown output types.
	Type      string
	// CSVConf and MySQLConf carry the per-backend settings; presumably only the
	// one matching Type is used — confirm.
	CSVConf   CSVConf
	MySQLConf common.MySQLConf
}

OutputConfig is the output config of a task

type OutputConstraint

// OutputConstraint is the output constraint of db.
// Values are built by NewConstraints and NewStringsConstraints, which return
// map[string]*OutputConstraint.
type OutputConstraint struct {
	// SQL is presumably the column's SQL definition (NewSQLString returns a varchar SQL string) — confirm.
	SQL         string
	// Index and UniqueIndex are presumably index names for the column — confirm.
	Index       string
	UniqueIndex string
}

OutputConstraint is the output constraint of db

type Outputer

// Outputer is the interface accepted by Context.OutputCustom for user-defined output.
type Outputer interface {
	// Output performs the output and reports any failure as an error.
	Output() error
}

type Request

// Request is the object of each request.
type Request struct {
	URL     *url.URL
	// Headers is a pointer to the http.Header of the request.
	Headers *http.Header
	Method  string
	Body    io.Reader
	// ID is a numeric identifier; how it is assigned is not visible here.
	ID      uint32
	// contains filtered or unexported fields
}

Request the object of each request

type Response

// Response is the object of each response.
type Response struct {
	StatusCode int
	// Body holds the response body as raw bytes.
	Body       []byte
	// Request is presumably the request that produced this response — confirm.
	Request    *Request
	Headers    *http.Header
	// contains filtered or unexported fields
}

Response the object of each response

func (*Response) FileName

func (res *Response) FileName() string

FileName the filename of response

func (*Response) Save

func (res *Response) Save(fileName string) error

Save save the response to file

type Rule

// Rule is the rule definition: a Head callback plus a set of Nodes keyed by int.
type Rule struct {
	// Head is a callback taking the Context; ErrTaskRuleHeadIsNil is declared for a nil head.
	Head  func(ctx *Context) error
	// Nodes is keyed by int; per ErrTaskRuleNodesKeyInvalid the keys should start
	// from 0 and increase monotonically.
	Nodes map[int]*Node
}

Rule the rule define

type Spider

// Spider is the spider definition. It is created with New and driven through
// its Run and SetDB methods; all fields are unexported.
type Spider struct {
	// contains filtered or unexported fields
}

Spider the spider define

func New

func New(task *Task, retCh chan<- common.MTS) *Spider

New create a new spider object

func (*Spider) Run

func (s *Spider) Run() error

Run run a spider task

func (*Spider) SetDB

func (s *Spider) SetDB(db *sql.DB)

SetDB sets the underlying output db

type Task

// Task is a task definition: an ID combined with an embedded TaskRule and an
// embedded TaskConfig, whose fields are promoted onto Task.
type Task struct {
	ID uint64
	TaskRule
	TaskConfig
}

Task is a task define

func NewTask

func NewTask(id uint64, rule TaskRule, config TaskConfig) *Task

NewTask return a new task object

type TaskConfig

// TaskConfig is the config of a task.
type TaskConfig struct {
	// CronSpec is presumably a cron schedule expression — its format is not visible here; confirm.
	CronSpec     string
	Option       Option
	Limit        Limit
	// ProxyURLs is a list of proxy URLs; how a proxy is chosen is not visible here.
	ProxyURLs    []string
	OutputConfig OutputConfig
}

TaskConfig is the config of a task

type TaskRule

// TaskRule is the task rule definition. Rules are registered with Register and
// looked up by name with GetTaskRule.
type TaskRule struct {
	// Name is the rule name; ErrTaskRuleNameIsEmpty and ErrTaskRuleNameDuplicated
	// suggest it must be non-empty and unique — confirm in Register.
	Name                      string
	Description               string
	// OutputToMultipleNamespace and MultipleNamespaceConf go together; the map is
	// presumably keyed by namespace — confirm.
	OutputToMultipleNamespace bool
	MultipleNamespaceConf     map[string]*MultipleNamespaceConf
	// Namespace, OutputFields, OutputConstraints and OutputTableOpts are the
	// rule-level counterparts of the same-named MultipleNamespaceConf fields.
	Namespace                 string
	OutputFields              []string
	OutputConstraints         map[string]*OutputConstraint
	OutputTableOpts           string
	// The flags below share names with fields of Option; precedence is not visible here.
	DisableCookies            bool
	AllowURLRevisit           bool
	IgnoreRobotsTxt           bool
	InsecureSkipVerify        bool
	ParseHTTPErrorResponse    bool
	// Rule holds the Head callback and the Nodes of this task rule.
	Rule                      *Rule
}

TaskRule is the task rule define

func GetTaskRule

func GetTaskRule(ruleName string) (*TaskRule, error)

GetTaskRule get task rule by ruleName

type XMLElement

// XMLElement is the XML element object passed to Node.OnXML callbacks.
type XMLElement struct {
	// Name and Text presumably hold the element's name and text content — confirm.
	Name     string
	Text     string
	// Request and Response are presumably the request/response pair that produced this element — confirm.
	Request  *Request
	Response *Response
	// DOM is untyped (interface{}); its concrete type is not visible here.
	DOM      interface{}
	// contains filtered or unexported fields
}

XMLElement the xml element object

func (*XMLElement) Attr

func (x *XMLElement) Attr(k string) string

Attr return the xml element attr value

func (*XMLElement) ChildAttr

func (x *XMLElement) ChildAttr(xpathQuery, attrName string) string

ChildAttr the child attr value of x

func (*XMLElement) ChildAttrs

func (x *XMLElement) ChildAttrs(xpathQuery, attrName string) []string

ChildAttrs the child attr list of x

func (*XMLElement) ChildText

func (x *XMLElement) ChildText(xpathQuery string) string

ChildText the child text content of x

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL