xspider

package module
v1.0.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 4, 2022 License: MIT Imports: 27 Imported by: 0

README

xspider

golang版本的scrapy,仅实现了主要功能。

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AllowedDomains

func AllowedDomains(domains []string) func(*Spider)

func BasicAuthHeader

func BasicAuthHeader(username string, password string) string

func BytesToString

func BytesToString(b []byte) string

func CloseFunc

func CloseFunc(fc func(*Spider)) func(*Spider)

func ConcurrentItems

func ConcurrentItems(items int) func(*Setting)

func ConcurrentRequests

func ConcurrentRequests(requests int) func(*Setting)

func ConcurrentRequestsPerDomain

func ConcurrentRequestsPerDomain(requests int) func(*Setting)

func CopyNew

func CopyNew(old interface{}) interface{}

根据对象类型创建一个新对象

func DefaultParseFunc

func DefaultParseFunc(fc ParseFunc) func(*Spider)

func DefaultRequestHeaders

func DefaultRequestHeaders(headers http.Header) func(*Setting)

func DepthLimit

func DepthLimit(depth int) func(*Setting)

func DepthPriority

func DepthPriority(depth int) func(*Setting)

func DepthStatsVerbose

func DepthStatsVerbose(depth bool) func(*Setting)

func DownloadDelay

func DownloadDelay(t time.Duration) func(*Setting)

func DownloadMaxSize

func DownloadMaxSize(size int) func(*Setting)

func DownloadStats

func DownloadStats(stats bool) func(*Setting)

func DownloadTimeout

func DownloadTimeout(t time.Duration) func(*Setting)

func DownloadWarnSize

func DownloadWarnSize(size int) func(*Setting)

func DownloaderClass

func DownloaderClass(c Downloader) func(*Setting)

func DownloaderMiddlewares

func DownloaderMiddlewares(d map[int]DownloaderMiddlewarer) func(*Setting)

func DownloaderMiddlewaresBase

func DownloaderMiddlewaresBase(d map[int]DownloaderMiddlewarer) func(*Setting)

func DropItem

func DropItem(reason string) error

func ErrFile

func ErrFile(file string) func(*Setting)

func FilterClass

func FilterClass(f DupeFilter) func(*Setting)

func FilterEnabled

func FilterEnabled(filter bool) func(*Setting)

func FindScriptVar

func FindScriptVar(b []byte, v string) string

func GetHeaderSize

func GetHeaderSize(header http.Header) int

func GetRequestSize

func GetRequestSize(request *Request) int

func GetResponseSize

func GetResponseSize(response *Response) int

func HttpAuthDomain

func HttpAuthDomain(h []string) func(*Setting)

func HttpError

func HttpError(reason string) error

func HttpErrorAllowAll

func HttpErrorAllowAll(http bool) func(*Setting)

func HttpErrorAllowedCodes

func HttpErrorAllowedCodes(http []int) func(*Setting)

func HttpPass

func HttpPass(h string) func(*Setting)

func HttpUser

func HttpUser(h string) func(*Setting)

func IgnoreRequest

func IgnoreRequest(reason string) error

func InitLog

func InitLog(logPath, errPath string, logLevel zapcore.Level) *zap.SugaredLogger

初始化日志 logger

func ItemPipelines

func ItemPipelines(i map[int]ItemPipeliner) func(*Setting)

func ItemPipelinesBase

func ItemPipelinesBase(i map[int]ItemPipeliner) func(*Setting)

func LogEnabled

func LogEnabled(log bool) func(*Setting)

func LogFile

func LogFile(log string) func(*Setting)

func LogLevel

func LogLevel(log zapcore.Level) func(*Setting)

func MaxRetryTimes

func MaxRetryTimes(retry int) func(*Setting)

func Name

func Name(name string) func(*Spider)

func NewRequestLogger

func NewRequestLogger(log *zap.SugaredLogger, req *Request) *zap.SugaredLogger

func NewResponseLogger

func NewResponseLogger(log *zap.SugaredLogger, response *Response) *zap.SugaredLogger

func RandomizeDownloadDelay

func RandomizeDownloadDelay(delay bool) func(*Setting)

func ReadRequestBody

func ReadRequestBody(request *Request) []byte

func ResponseMaxActiveSize

func ResponseMaxActiveSize(size int) func(*Setting)

func RetryEnabled

func RetryEnabled(retry bool) func(*Setting)

func RetryHttpCodes

func RetryHttpCodes(retry []int) func(*Setting)

func RetryPriorityAdjust

func RetryPriorityAdjust(retry int) func(*Setting)

func SanitizeFileName

func SanitizeFileName(fileName string) string

SanitizeFileName replaces dangerous characters in a string so the return value can be used as a safe file name.

func SchedulerClass

func SchedulerClass(c Scheduler) func(*Setting)

func SchedulerPriorityQueue

func SchedulerPriorityQueue(pq PriorityQueuer) func(*Setting)

func Settings

func Settings(settings *Setting) func(*Spider)

func SpiderMiddlewares

func SpiderMiddlewares(m map[int]SpiderMiddlewarer) func(*Setting)

func SpiderMiddlewaresBase

func SpiderMiddlewaresBase(m map[int]SpiderMiddlewarer) func(*Setting)

func StartRequestsFunc

func StartRequestsFunc(fc func(*Spider) []*Request) func(*Spider)

func StartUrls

func StartUrls(urls []string) func(*Spider)

func StringToBytes

func StringToBytes(s string) []byte

func UrlLengthLimit

func UrlLengthLimit(limits int) func(*Setting)

func UrlSafeBase64Decode

func UrlSafeBase64Decode(source string) []byte

func UrlSafeBase64Encode

func UrlSafeBase64Encode(source []byte) string

func UserAgent

func UserAgent(ua string) func(*Setting)

Types

type BaseItemPipeline

// BaseItemPipeline bundles a module name, a logger and a stats collector.
// The package lists GetModuleName, FromSpider, OpenSpider, CloseSpider and
// ProcessItem methods on *BaseItemPipeline.
//
// NOTE(review): the listed ProcessItem takes (*Item, *Spider) and returns
// nothing, whereas ItemPipeliner.ProcessItem is (Item, *Spider) Item — confirm
// whether *BaseItemPipeline is meant to satisfy ItemPipeliner.
type BaseItemPipeline struct {
	ModuleName string
	Logger     *zap.SugaredLogger
	Stats      StatsCollector
}

func (*BaseItemPipeline) CloseSpider

func (i *BaseItemPipeline) CloseSpider(spider *Spider)

func (*BaseItemPipeline) FromSpider

func (i *BaseItemPipeline) FromSpider(spider *Spider)

func (*BaseItemPipeline) GetModuleName

func (i *BaseItemPipeline) GetModuleName() string

func (*BaseItemPipeline) OpenSpider

func (i *BaseItemPipeline) OpenSpider(spider *Spider)

func (*BaseItemPipeline) ProcessItem

func (i *BaseItemPipeline) ProcessItem(item *Item, spider *Spider)

type BaseMiddleware

// BaseMiddleware is the base middleware data structure shared by spider
// middlewares and downloader middlewares; it implements every method of both
// the SpiderMiddlewarer and DownloaderMiddlewarer interfaces.
type BaseMiddleware struct {
	// Name of the middleware module.
	ModuleName string
	Logger     *zap.SugaredLogger
	Stats      StatsCollector
}

爬虫中间件及下载器中间件共用的基础中间件数据结构,实现了两者的全部接口

func (*BaseMiddleware) FromSpider

func (mw *BaseMiddleware) FromSpider(spider *Spider)

func (*BaseMiddleware) GetModuleName

func (mw *BaseMiddleware) GetModuleName() string

func (*BaseMiddleware) ProcessException

func (mw *BaseMiddleware) ProcessException(request *Request, err error, spider *Spider) RequestResponse

func (*BaseMiddleware) ProcessRequest

func (mw *BaseMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse

func (*BaseMiddleware) ProcessResponse

func (mw *BaseMiddleware) ProcessResponse(request *Request, response *Response, spider *Spider) RequestResponse

func (*BaseMiddleware) ProcessSpiderException

func (mw *BaseMiddleware) ProcessSpiderException(response *Response, err error, spider *Spider) RequestItems

func (*BaseMiddleware) ProcessSpiderInput

func (mw *BaseMiddleware) ProcessSpiderInput(response *Response, spider *Spider)

func (*BaseMiddleware) ProcessSpiderOutput

func (mw *BaseMiddleware) ProcessSpiderOutput(response *Response, result RequestItems, spider *Spider) RequestItems

func (*BaseMiddleware) ProcessStartRequests

func (mw *BaseMiddleware) ProcessStartRequests(result []*Request, spider *Spider) []*Request

type CloseSpiderFunc

// CloseSpiderFunc is a function type that receives a *Spider and returns
// nothing; it has the same shape as ItemPipeliner.CloseSpider.
// NOTE(review): where the package calls this type is not visible in this listing.
type CloseSpiderFunc func(*Spider)

type Context

type Context struct {
	// contains filtered or unexported fields
}

Context provides a tiny layer for passing data between callbacks

func NewContext

func NewContext() *Context

NewContext initializes a new Context instance

func (*Context) ForEach

func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{}

ForEach iterate context

func (*Context) Get

func (c *Context) Get(key string) interface{}

func (*Context) GetBool

func (c *Context) GetBool(key string) bool

func (*Context) GetBoolWithDefault

func (c *Context) GetBoolWithDefault(key string, dft bool) bool

func (*Context) GetFloat64

func (c *Context) GetFloat64(key string) float64

func (*Context) GetFloat64WithDefault

func (c *Context) GetFloat64WithDefault(key string, dft float64) float64

func (*Context) GetInt

func (c *Context) GetInt(key string) int

func (*Context) GetIntWithDefault

func (c *Context) GetIntWithDefault(key string, dft int) int

func (*Context) GetString

func (c *Context) GetString(key string) string

func (*Context) GetStringWithDefault

func (c *Context) GetStringWithDefault(key string, dft string) string

func (*Context) GetWithDefault

func (c *Context) GetWithDefault(key string, dft interface{}) interface{}

func (*Context) Has

func (c *Context) Has(key string) bool

func (*Context) Put

func (c *Context) Put(key string, value interface{})

Put stores a value of any type in Context

type DefaultDownloader

type DefaultDownloader struct {
	// contains filtered or unexported fields
}

func (*DefaultDownloader) Close

func (d *DefaultDownloader) Close()

func (*DefaultDownloader) Fetch

func (d *DefaultDownloader) Fetch(request *Request) (*Response, error)

func (*DefaultDownloader) FromSpider

func (d *DefaultDownloader) FromSpider(spider *Spider)

func (*DefaultDownloader) IsEmpty

func (d *DefaultDownloader) IsEmpty() bool

func (*DefaultDownloader) IsFree

func (d *DefaultDownloader) IsFree() bool

func (*DefaultDownloader) NextRequestCircle

func (d *DefaultDownloader) NextRequestCircle(heartbeat time.Duration)

func (*DefaultDownloader) ProcessDownloader

func (d *DefaultDownloader) ProcessDownloader(signal *Signal, spider *Spider)

type DefaultDupeFilter

type DefaultDupeFilter struct {
	// contains filtered or unexported fields
}

func (*DefaultDupeFilter) Close

func (df *DefaultDupeFilter) Close()

func (*DefaultDupeFilter) FromSpider

func (df *DefaultDupeFilter) FromSpider(spider *Spider)

func (*DefaultDupeFilter) Log

func (df *DefaultDupeFilter) Log(request *Request)

func (*DefaultDupeFilter) RequestFingerprint

func (df *DefaultDupeFilter) RequestFingerprint(request *Request) string

func (*DefaultDupeFilter) RequestSeen

func (df *DefaultDupeFilter) RequestSeen(request *Request) bool

type DefaultScheduler

type DefaultScheduler struct {
	// contains filtered or unexported fields
}

func (*DefaultScheduler) Close

func (s *DefaultScheduler) Close()

func (*DefaultScheduler) EnqueueRequest

func (s *DefaultScheduler) EnqueueRequest(request *Request) bool

func (*DefaultScheduler) FromSpider

func (s *DefaultScheduler) FromSpider(spider *Spider)

func (*DefaultScheduler) HasPendingRequests

func (s *DefaultScheduler) HasPendingRequests() bool

func (*DefaultScheduler) NextRequest

func (s *DefaultScheduler) NextRequest() *Request

type DepthMiddleware

type DepthMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}

func (*DepthMiddleware) FromSpider

func (mw *DepthMiddleware) FromSpider(spider *Spider)

func (*DepthMiddleware) ProcessSpiderOutput

func (mw *DepthMiddleware) ProcessSpiderOutput(response *Response, result RequestItems, spider *Spider) RequestItems

type DownloadStatsMiddleware

type DownloadStatsMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}

func (*DownloadStatsMiddleware) FromSpider

func (mw *DownloadStatsMiddleware) FromSpider(spider *Spider)

func (*DownloadStatsMiddleware) ProcessException

func (mw *DownloadStatsMiddleware) ProcessException(request *Request, err error, spider *Spider) RequestResponse

func (*DownloadStatsMiddleware) ProcessRequest

func (mw *DownloadStatsMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse

func (*DownloadStatsMiddleware) ProcessResponse

func (mw *DownloadStatsMiddleware) ProcessResponse(request *Request, response *Response, spider *Spider) RequestResponse

type DownloadTimeoutMiddleware

// DownloadTimeoutMiddleware is the download-timeout middleware: it adds a
// default timeout to any Request that does not specify "download_timeout" in
// its Meta.
// NOTE(review): Request as listed has no Meta field (it has Ctx) — confirm
// where "download_timeout" is actually stored.
type DownloadTimeoutMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}

下载超时中间件,对没有在Meta中指定"download_timeout"的Request添加默认超时时间

func (*DownloadTimeoutMiddleware) FromSpider

func (mw *DownloadTimeoutMiddleware) FromSpider(spider *Spider)

func (*DownloadTimeoutMiddleware) ProcessRequest

func (mw *DownloadTimeoutMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse

type Downloader

// Downloader is the interface type of Setting.DownloaderClass.
// *DefaultDownloader is listed with methods of exactly these names and
// signatures.
type Downloader interface {
	FromSpider(spider *Spider)
	// Fetch takes a *Request and returns a *Response or an error.
	Fetch(request *Request) (*Response, error)
	IsFree() bool
	IsEmpty() bool
	Close()
	NextRequestCircle(heartbeat time.Duration)
	ProcessDownloader(signal *Signal, spider *Spider)
}

type DownloaderMiddlewareManager

// DownloaderMiddlewareManager is the downloader middleware manager.
type DownloaderMiddlewareManager struct {
	ModuleName string
	// contains filtered or unexported fields
}

下载器中间件管理器

func (*DownloaderMiddlewareManager) FromSpider

func (d *DownloaderMiddlewareManager) FromSpider(spider *Spider)

func (*DownloaderMiddlewareManager) ProcessException

func (d *DownloaderMiddlewareManager) ProcessException(signal *Signal, spider *Spider) (index int)

func (*DownloaderMiddlewareManager) ProcessRequest

func (d *DownloaderMiddlewareManager) ProcessRequest(signal *Signal, spider *Spider) (index int)

func (*DownloaderMiddlewareManager) ProcessResponse

func (d *DownloaderMiddlewareManager) ProcessResponse(signal *Signal, spider *Spider) (index int)

type DownloaderMiddlewarer

// DownloaderMiddlewarer is the downloader middleware interface; it is the
// value type of the Setting.DownloaderMiddlewares and
// Setting.DownloaderMiddlewaresBase maps. BaseMiddleware provides all of
// these methods.
type DownloaderMiddlewarer interface {
	GetModuleName() string

	FromSpider(spider *Spider)

	// The three Process* methods return a RequestResponse, which the package
	// describes as a Request, a Response, nil, or ErrIgnoreRequest.
	ProcessRequest(request *Request, spider *Spider) RequestResponse

	ProcessResponse(request *Request, response *Response, spider *Spider) RequestResponse

	ProcessException(request *Request, err error, spider *Spider) RequestResponse
}

type DownloaderSlot

type DownloaderSlot struct {
	// contains filtered or unexported fields
}

func (*DownloaderSlot) DownloadDelay

func (s *DownloaderSlot) DownloadDelay() int64

func (*DownloaderSlot) IsFree

func (s *DownloaderSlot) IsFree() bool

func (*DownloaderSlot) Len

func (s *DownloaderSlot) Len() int

type DupeFilter

// DupeFilter is the interface type of Setting.FilterClass (the request
// filter). *DefaultDupeFilter is listed with methods of exactly these names
// and signatures.
type DupeFilter interface {
	FromSpider(spider *Spider)
	// RequestSeen reports a bool for the given request.
	// NOTE(review): presumably true means "already seen" — confirm against the implementation.
	RequestSeen(request *Request) bool
	// RequestFingerprint returns a string derived from the request.
	RequestFingerprint(request *Request) string
	Close()
	Log(request *Request)
}

type ErrBase

// ErrBase carries the reason text for an error. Its Error method is declared
// on *ErrBase, and it is embedded by ErrDropItem, ErrHttpError and
// ErrIgnoreRequest.
type ErrBase struct {
	Reason string
}

func (*ErrBase) Error

func (err *ErrBase) Error() string

type ErrDropItem

// ErrDropItem is an error type that embeds ErrBase.
// NOTE(review): presumably the value produced by DropItem(reason) — confirm.
type ErrDropItem struct {
	ErrBase
}

type ErrHttpError

// ErrHttpError is an error type that embeds ErrBase.
// NOTE(review): presumably the value produced by HttpError(reason) — confirm.
type ErrHttpError struct {
	ErrBase
}

type ErrIgnoreRequest

// ErrIgnoreRequest is an error type that embeds ErrBase. The package lists it
// as one of the possible RequestResponse values returned by downloader
// middleware functions.
// NOTE(review): presumably the value produced by IgnoreRequest(reason) — confirm.
type ErrIgnoreRequest struct {
	ErrBase
}

type ErrorbackFunc

// ErrorbackFunc is the callback invoked after a Request fails. It receives
// the *Failure and returns RequestItems.
type ErrorbackFunc func(*Failure) RequestItems

ErrorbackFunc Request请求失败后调用的回调函数

type Failure

// Failure is the failure-information struct created when a Request fails. It
// is the argument passed to an ErrorbackFunc.
type Failure struct {
	Request  *Request
	Response *Response
	Spider   *Spider
	Error    error
}

Failure Request请求失败创建的失败信息结构体

type HttpAuthMiddleware

// HttpAuthMiddleware is the HTTP authentication middleware: if the spider's
// Setting specifies the "HttpUser" or "HttpPass" parameter and the Request's
// headers do not specify Authorization, it adds the configured credentials to
// that Request.
type HttpAuthMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}

Http认证中间件,如果爬虫的Setting中指定了"HttpUser"或"HttpPass"参数, 且Request的Header中未指定Authorization,则为该Request添加指定认证信息

func (*HttpAuthMiddleware) FromSpider

func (mw *HttpAuthMiddleware) FromSpider(spider *Spider)

func (*HttpAuthMiddleware) ProcessRequest

func (mw *HttpAuthMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse

type HttpErrorMiddleware

type HttpErrorMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}

func (*HttpErrorMiddleware) FromSpider

func (mw *HttpErrorMiddleware) FromSpider(spider *Spider)

func (*HttpErrorMiddleware) ProcessSpiderException

func (mw *HttpErrorMiddleware) ProcessSpiderException(response *Response, err error, spider *Spider) RequestItems

func (*HttpErrorMiddleware) ProcessSpiderInput

func (mw *HttpErrorMiddleware) ProcessSpiderInput(response *Response, spider *Spider)

type Item

// Item is a map from string keys to values of any type. It is the value
// passed to and returned from ItemPipeliner.ProcessItem.
type Item map[string]interface{}

type ItemPipelineManager

type ItemPipelineManager struct {
	ModuleName string
	// contains filtered or unexported fields
}

func (*ItemPipelineManager) CloseSpider

func (i *ItemPipelineManager) CloseSpider(spider *Spider) (index int)

func (*ItemPipelineManager) FromSpider

func (i *ItemPipelineManager) FromSpider(spider *Spider)

func (*ItemPipelineManager) OpenSpider

func (i *ItemPipelineManager) OpenSpider(spider *Spider) (index int)

func (*ItemPipelineManager) ProcessItem

func (i *ItemPipelineManager) ProcessItem(item Item, spider *Spider) (index int)

type ItemPipeliner

// ItemPipeliner is the item pipeline interface; it is the value type of the
// Setting.ItemPipelines and Setting.ItemPipelinesBase maps.
type ItemPipeliner interface {
	GetModuleName() string
	// ProcessItem receives an Item and returns an Item.
	ProcessItem(Item, *Spider) Item
	OpenSpider(*Spider)
	CloseSpider(*Spider)
}

type ItemSlot

type ItemSlot struct {
	// contains filtered or unexported fields
}

func (*ItemSlot) AddItem

func (s *ItemSlot) AddItem(item Item)

func (*ItemSlot) FinishItem

func (s *ItemSlot) FinishItem(item Item)

func (*ItemSlot) FromSpider

func (s *ItemSlot) FromSpider(spider *Spider)

func (*ItemSlot) IsFree

func (s *ItemSlot) IsFree() bool

type LifoPriorityQueue

type LifoPriorityQueue struct {
	// contains filtered or unexported fields
}

func (*LifoPriorityQueue) Close

func (q *LifoPriorityQueue) Close()

func (*LifoPriorityQueue) FromSpider

func (q *LifoPriorityQueue) FromSpider(spider *Spider)

func (*LifoPriorityQueue) IsEmpty

func (q *LifoPriorityQueue) IsEmpty() bool

func (*LifoPriorityQueue) Peek

func (q *LifoPriorityQueue) Peek() interface{}

func (*LifoPriorityQueue) Pop

func (q *LifoPriorityQueue) Pop() interface{}

func (*LifoPriorityQueue) Push

func (q *LifoPriorityQueue) Push(v interface{}, i int)

func (*LifoPriorityQueue) Size

func (q *LifoPriorityQueue) Size() int

type OpenSpiderFunc

// OpenSpiderFunc is a function type that receives a *Spider and returns
// nothing; it has the same shape as ItemPipeliner.OpenSpider.
// NOTE(review): where the package calls this type is not visible in this listing.
type OpenSpiderFunc func(*Spider)

type ParseFunc

// ParseFunc is a user-defined function for parsing a Response. It is the type
// of Request.Callback and Spider.DefaultParseFunc.
type ParseFunc func(*Response, *Spider) RequestItems

ParseFunc 用户自定义用于解析Response的函数

type PriorityQueuer

// PriorityQueuer is the interface type of Setting.SchedulerPriorityQueue (the
// scheduler's priority queue). *LifoPriorityQueue is listed with methods of
// exactly these names and signatures.
type PriorityQueuer interface {
	FromSpider(*Spider)
	// Push takes an item of any type together with an int priority.
	// NOTE(review): whether larger or smaller values pop first is not shown here.
	Push(item interface{}, priority int)
	Pop() interface{}
	Peek() interface{}
	Size() int
	IsEmpty() bool
	Close()
}

type ProcessExceptionFunc

// ProcessExceptionFunc has the same signature as
// DownloaderMiddlewarer.ProcessException.
type ProcessExceptionFunc func(request *Request, err error, spider *Spider) RequestResponse

type ProcessItemFunc

// ProcessItemFunc has the same signature as ItemPipeliner.ProcessItem.
type ProcessItemFunc func(Item, *Spider) Item

type ProcessRequestFunc

// ProcessRequestFunc has the same signature as
// DownloaderMiddlewarer.ProcessRequest.
type ProcessRequestFunc func(request *Request, spider *Spider) RequestResponse

type ProcessResponseFunc

// ProcessResponseFunc has the same signature as
// DownloaderMiddlewarer.ProcessResponse.
type ProcessResponseFunc func(request *Request, response *Response, spider *Spider) RequestResponse

type ProcessSpiderExceptionFunc

// ProcessSpiderExceptionFunc has the same signature as
// SpiderMiddlewarer.ProcessSpiderException.
type ProcessSpiderExceptionFunc func(response *Response, err error, spider *Spider) RequestItems

type ProcessSpiderInputFunc

// ProcessSpiderInputFunc has the same signature as
// SpiderMiddlewarer.ProcessSpiderInput.
type ProcessSpiderInputFunc func(response *Response, spider *Spider)

type ProcessSpiderOutputFunc

// ProcessSpiderOutputFunc has the same signature as
// SpiderMiddlewarer.ProcessSpiderOutput.
type ProcessSpiderOutputFunc func(response *Response, result RequestItems, spider *Spider) RequestItems

type ProcessStartRequestsFunc

// ProcessStartRequestsFunc has the same signature as
// SpiderMiddlewarer.ProcessStartRequests.
type ProcessStartRequestsFunc func(result []*Request, spider *Spider) []*Request

type Request

// Request describes a single request handled by the spider. The listed
// constructor is NewRequest(method, URL string, body io.Reader).
type Request struct {
	Url        *url.URL
	Method     string
	Headers    *http.Header
	Body       io.Reader
	Cookies    []http.Cookie
	Encoding   string
	Priority   int
	DontFilter bool
	Ctx        *Context
	// Callback invoked when the request raises an error; errors include 404,
	// request timeout, and so on.
	Errback ErrorbackFunc
	// Callback that processes the response once the request has finished
	// downloading.
	// By default Parse() is called.
	// NOTE(review): the listing shows Spider.DefaultParseFunc rather than a
	// Parse method — presumably that is what "Parse()" refers to; confirm.
	Callback ParseFunc
}

func DefaultStartRequests

func DefaultStartRequests(s *Spider) []*Request

func NewRequest

func NewRequest(method, URL string, body io.Reader) (*Request, error)

func (*Request) Domain

func (r *Request) Domain() string

type RequestItems

// RequestItems is a slice of values of any type; it is what ParseFunc,
// ErrorbackFunc and the spider-middleware output/exception methods return.
// NOTE(review): presumably the elements are *Request and Item values — confirm.
type RequestItems []interface{}

type RequestResponse

// RequestResponse is the return value of the downloader-middleware functions:
// a Request, a Response, nil, or ErrIgnoreRequest.
type RequestResponse interface{}

下载器中间件各功能函数的返回值,Request、Response、nil或者ErrIgnoreRequest

type Response

// Response holds a status code, a body, headers, a context and the Request it
// belongs to. The listed constructor is
// NewResponse(response *http.Response, request *Request).
type Response struct {
	StatusCode int
	Body       []byte
	Ctx        *Context
	Request    *Request
	Headers    *http.Header
}

func NewResponse

func NewResponse(response *http.Response, request *Request) (*Response, error)

func (*Response) FileName

func (r *Response) FileName() string

FileName returns the sanitized file name parsed from "Content-Disposition" header or from URL

func (*Response) Save

func (r *Response) Save(fileName string) error

Save writes response body to disk

type ResponseSlot

type ResponseSlot struct {
	// contains filtered or unexported fields
}

func (*ResponseSlot) AddResponse

func (s *ResponseSlot) AddResponse(response *Response)

func (*ResponseSlot) FinishResponse

func (s *ResponseSlot) FinishResponse(response *Response)

func (*ResponseSlot) FromSpider

func (s *ResponseSlot) FromSpider(spider *Spider)

func (*ResponseSlot) IsFree

func (s *ResponseSlot) IsFree() bool

type RetryMiddleware

// RetryMiddleware is the retry middleware: it retries the Request that
// corresponds to a qualifying Response or error.
type RetryMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}

重试中间件,对符合要求的Response或者错误对应的Request进行重试

func (*RetryMiddleware) FromSpider

func (mw *RetryMiddleware) FromSpider(spider *Spider)

func (*RetryMiddleware) ProcessException

func (mw *RetryMiddleware) ProcessException(request *Request, err error, spider *Spider) RequestResponse

func (*RetryMiddleware) ProcessResponse

func (mw *RetryMiddleware) ProcessResponse(request *Request, response *Response, spider *Spider) RequestResponse

type Scheduler

// Scheduler is the interface type of Setting.SchedulerClass.
// *DefaultScheduler is listed with methods of exactly these names and
// signatures.
type Scheduler interface {
	FromSpider(spider *Spider)
	HasPendingRequests() bool
	Close()
	// EnqueueRequest takes a request and returns a bool.
	// NOTE(review): presumably false means the request was rejected (e.g. filtered) — confirm.
	EnqueueRequest(request *Request) bool
	NextRequest() *Request
}

type Setter

// Setter is the interface returned by NewSet(size int). Its methods operate
// on elements of any type.
// NOTE(review): the name and method set suggest a set container; the
// difference between Remove (returns an error) and Discard (returns nothing)
// is not shown here — confirm against the implementation.
type Setter interface {
	Add(elems ...interface{})
	Clear()
	Pop() interface{}
	Remove(elems ...interface{}) error
	Discard(elems ...interface{})
	Has(elems ...interface{}) bool
	Len() int
}

func NewSet

func NewSet(size int) Setter

type Setting

// Setting holds a spider's configuration. It is built with
// NewSetting(options ...func(*Setting)); the package lists one option
// function per field (for example DownloadDelay, RetryEnabled, UserAgent),
// each returning a func(*Setting).
type Setting struct {
	// Name of the crawler bot.
	// BotName string
	// Whether download statistics recording is enabled.
	DownloadStats bool
	// Download timeout.
	DownloadTimeout time.Duration
	// Interval between Requests sent to a single domain.
	DownloadDelay time.Duration
	// When enabled, the actual interval is picked at random between 0.5x and
	// 1.5x DownloadDelay.
	RandomizeDownloadDelay bool
	// Maximum amount of data downloaded for a single Request.
	DownloadMaxSize int
	// Download size at which a single Request starts emitting warnings in the log.
	DownloadWarnSize int
	// Maximum crawl depth allowed for a single page; 0 means unlimited.
	DepthLimit int
	// Integer used to adjust a Request's Priority according to its depth.
	// The adjustment formula is:
	// Request.Priority = Request.Priority - ( Request.Depth * DepthPriority )
	// NOTE(review): Request as listed has no Depth field — presumably depth is
	// tracked elsewhere (e.g. in Ctx); confirm.
	DepthPriority int
	// Whether depth-related statistics for requests are recorded.
	DepthStatsVerbose bool
	// Whether to retry automatically.
	RetryEnabled bool
	// Maximum number of retries, not counting the first attempt.
	MaxRetryTimes int
	// Status codes that trigger an automatic retry.
	RetryHttpCodes []int
	// Adjustment applied to the priority of a retried request:
	// request.Priority += priorityAdjust
	RetryPriorityAdjust int

	// Whether Requests are de-duplicated by the filter.
	FilterEnabled bool
	// Maximum number of Items processed at the same time.
	ConcurrentItems int
	// Maximum number of Requests the downloader downloads at the same time.
	ConcurrentRequests int
	// Number of requests allowed to access a single domain at the same time.
	ConcurrentRequestsPerDomain int
	// Maximum memory that downloaded data may occupy.
	ResponseMaxActiveSize int
	// Whether logging is enabled.
	LogEnabled bool
	// In addition to the terminal, log output is also saved to this file.
	// When ErrFile is set, only entries below Warning are saved to this file.
	LogFile string
	// When set, entries at Warning level and above are saved to this file.
	// When only ErrFile is set, entries below Warning are shown on the
	// terminal only and are not saved.
	ErrFile string
	// Minimum level of log entries that are displayed and saved.
	LogLevel zapcore.Level
	// Maximum URL length.
	UrlLengthLimit int

	HttpErrorAllowAll     bool
	HttpErrorAllowedCodes []int
	// Default request headers.
	DefaultRequestHeaders http.Header
	// Default User-Agent.
	UserAgent string
	// Request filter.
	FilterClass DupeFilter
	// Priority queue used by the scheduler.
	SchedulerPriorityQueue PriorityQueuer
	// User-defined settings.
	ExtensionSettings *Context

	DownloaderMiddlewaresBase map[int]DownloaderMiddlewarer

	DownloaderMiddlewares map[int]DownloaderMiddlewarer

	SpiderMiddlewaresBase map[int]SpiderMiddlewarer

	SpiderMiddlewares map[int]SpiderMiddlewarer

	ItemPipelinesBase map[int]ItemPipeliner

	ItemPipelines map[int]ItemPipeliner

	SchedulerClass Scheduler

	DownloaderClass Downloader

	// HttpUser and HttpPass are the credentials that HttpAuthMiddleware is
	// described as adding to Requests lacking an Authorization header.
	HttpUser       string
	HttpPass       string
	HttpAuthDomain []string
}

func NewSetting

func NewSetting(options ...func(*Setting)) *Setting

func (*Setting) Init

func (s *Setting) Init()

type Signal

// Signal is the message type passed (as *Signal) to the middleware managers'
// Process* methods and to Downloader.ProcessDownloader.
// NOTE(review): the meaning of From, To and Index is not visible in this
// listing — presumably they identify source/destination components and a
// middleware position; confirm. Body carries a value of any type.
type Signal struct {
	From  int
	To    int
	Index int
	Body  interface{}
}

type Spider

// Spider is the crawler itself. It is built with
// NewSpider(options ...func(*Spider)); the package lists option functions
// such as Name, AllowedDomains, StartUrls and Settings, each returning a
// func(*Spider). Init and Run are listed as its methods.
type Spider struct {
	// Name of this spider, used in log records.
	Name string
	// Optional. List of domains the spider is allowed to crawl.
	// When OffsiteMiddleware is enabled, URLs whose domain is not in the list
	// are not followed.
	// NOTE(review): no OffsiteMiddleware type appears in this package listing — confirm.
	AllowedDomains []string
	// List of URLs. When no particular URLs are specified, the spider starts
	// crawling from this list, so the first pages fetched come from it.
	// Subsequent URLs are extracted from the data that was fetched.
	StartUrls []string
	// Generates the initial Requests for the crawl; by default Requests are
	// built from the links in StartUrls.
	StartRequestsFunc func(*Spider) []*Request
	// Default method xspider uses to process a downloaded response when no
	// callback was specified for it.
	DefaultParseFunc ParseFunc
	// Custom function executed after crawling ends.
	CloseFunc func(*Spider)
	// The spider's settings; several spiders may share the same Setting.
	Settings *Setting
	// Logger.
	Log *zap.SugaredLogger
	// Collector for the spider's statistics.
	Stats StatsCollector
	// contains filtered or unexported fields
}

func NewSpider

func NewSpider(options ...func(*Spider)) *Spider

func (*Spider) Init

func (s *Spider) Init()

func (*Spider) Run

func (s *Spider) Run()

type SpiderMiddlewareManager

// SpiderMiddlewareManager is the spider middleware manager.
type SpiderMiddlewareManager struct {
	ModuleName string
	// contains filtered or unexported fields
}

爬虫中间件管理器

func (*SpiderMiddlewareManager) FromSpider

func (s *SpiderMiddlewareManager) FromSpider(spider *Spider)

func (*SpiderMiddlewareManager) ProcessSpiderException

func (s *SpiderMiddlewareManager) ProcessSpiderException(signal *Signal, spider *Spider) (index int)

func (*SpiderMiddlewareManager) ProcessSpiderInput

func (s *SpiderMiddlewareManager) ProcessSpiderInput(signal *Signal, spider *Spider) (index int)

func (*SpiderMiddlewareManager) ProcessSpiderOutput

func (s *SpiderMiddlewareManager) ProcessSpiderOutput(signal *Signal, spider *Spider) (index int)

func (*SpiderMiddlewareManager) ProcessStartRequests

func (s *SpiderMiddlewareManager) ProcessStartRequests(signal *Signal, spider *Spider) (index int)

type SpiderMiddlewarer

// SpiderMiddlewarer is the spider middleware interface; it is the value type
// of the Setting.SpiderMiddlewares and Setting.SpiderMiddlewaresBase maps.
// BaseMiddleware provides all of these methods.
type SpiderMiddlewarer interface {
	GetModuleName() string

	FromSpider(spider *Spider)

	ProcessSpiderInput(response *Response, spider *Spider)

	ProcessSpiderOutput(response *Response, result RequestItems, spider *Spider) RequestItems

	ProcessSpiderException(response *Response, err error, spider *Spider) RequestItems

	ProcessStartRequests(result []*Request, spider *Spider) []*Request
}

爬虫中间件接口

type SpiderOutputData

// SpiderOutputData pairs a *Response with a RequestItems result.
// NOTE(review): presumably carried in Signal.Body for the spider-output stage — confirm.
type SpiderOutputData struct {
	Response *Response
	Result   RequestItems
}

type StatsCollect

type StatsCollect struct {
	// contains filtered or unexported fields
}

func (*StatsCollect) ClearStats

func (s *StatsCollect) ClearStats()

func (*StatsCollect) GetStats

func (s *StatsCollect) GetStats() StatsMap

func (*StatsCollect) GetValue

func (s *StatsCollect) GetValue(key string, dft int) int

func (*StatsCollect) IncValue

func (s *StatsCollect) IncValue(key string, count int, start int)

func (*StatsCollect) MaxValue

func (s *StatsCollect) MaxValue(key string, value int)

func (*StatsCollect) MinValue

func (s *StatsCollect) MinValue(key string, value int)

func (*StatsCollect) SetStats

func (s *StatsCollect) SetStats(stats StatsMap)

func (*StatsCollect) SetValue

func (s *StatsCollect) SetValue(key string, value int)

type StatsCollector

// StatsCollector is the interface for recording integer statistics under
// string keys. It is returned by NewStatsCollector and is the type of
// Spider.Stats; *StatsCollect is listed with methods of exactly these names
// and signatures.
type StatsCollector interface {
	// GetValue takes a key and a default (dft) and returns an int.
	GetValue(key string, dft int) int
	GetStats() StatsMap
	SetValue(key string, value int)
	SetStats(stats StatsMap)
	// NOTE(review): presumably adds count to the key's value, starting from
	// start when the key is absent — confirm against the implementation.
	IncValue(key string, count int, start int)
	MaxValue(key string, value int)
	MinValue(key string, value int)
	ClearStats()
}

func NewStatsCollector

func NewStatsCollector() StatsCollector

type StatsMap

// StatsMap maps string keys to int values; it is what
// StatsCollector.GetStats returns and StatsCollector.SetStats accepts.
type StatsMap map[string]int

type UrlLengthMiddleware

type UrlLengthMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}

func (*UrlLengthMiddleware) FromSpider

func (mw *UrlLengthMiddleware) FromSpider(spider *Spider)

func (*UrlLengthMiddleware) ProcessSpiderOutput

func (mw *UrlLengthMiddleware) ProcessSpiderOutput(response *Response, result RequestItems, spider *Spider) RequestItems

type UserAgentMiddleware

// UserAgentMiddleware is the User-Agent middleware: it adds a default value
// to any Request that does not specify "User-Agent" in its Headers.
type UserAgentMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}

UserAgent中间件,对没有在Headers中指定"User-Agent"参数的Request添加默认值

func (*UserAgentMiddleware) FromSpider

func (mw *UserAgentMiddleware) FromSpider(spider *Spider)

func (*UserAgentMiddleware) ProcessRequest

func (mw *UserAgentMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL