tegenaria

package module
v0.5.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 23, 2023 License: MIT Imports: 44 Imported by: 1

README

Tegenaria crawl framework

Go Report Card codecov go workflow CodeQL
tegenaria是一个基于golang开发的快速、高效率的网络爬虫框架

特性

  • 支持分布式

  • 支持自定义分布式组件,包括去重、request缓存队列和基本统计指标的分布式运行

  • 支持自定义的事件监控

  • 支持命令行控制

  • 支持gRPC和web api远程控制

  • 支持定时轮询启动爬虫

    安装

  1. go 版本要求>=1.19
go get -u github.com/wetrycode/tegenaria@latest
  2. 在您的项目中导入
import "github.com/wetrycode/tegenaria"

快速开始

查看实例demo example

文档

TODO

  • 管理WEB API

Contribution

Feel free to PR and raise issues.
Send me an email directly, vforfreedom96@gmail.com

License

MIT © wetrycode

Documentation

Overview

Package tegenaria is a crawler framework based on golang

tegenaria是一个基于golang开发的快速、高效率的网络爬虫框架

Index

Constants

View Source
const (
	// RequestStats 发起的请求总数
	RequestStats string = "requests"
	// ItemsStats 获取到的items总数
	ItemsStats string = "items"
	// DownloadFailStats 请求失败总数
	DownloadFailStats string = "download_fail"
	// ErrorStats 错误总数
	ErrorStats string = "errors"
)

Variables

View Source
var (
	// ErrSpiderMiddleware 下载中间件处理异常
	ErrSpiderMiddleware error = errors.New("handle spider middleware error")
	// ErrSpiderCrawls 抓取流程错误
	ErrSpiderCrawls error = errors.New("handle spider crawl error")
	// ErrDuplicateSpiderName 爬虫名重复错误
	ErrDuplicateSpiderName error = errors.New("register a duplicate spider name error")
	// ErrEmptySpiderName 爬虫名不能为空
	ErrEmptySpiderName error = errors.New("register a empty spider name error")
	// ErrSpiderNotExist 爬虫实例不存在
	ErrSpiderNotExist error = errors.New("not found spider")
	// ErrNotAllowStatusCode 不允许的状态码
	ErrNotAllowStatusCode error = errors.New("not allow handle status code")
	// ErrGetCacheItem 获取item 错误
	ErrGetCacheItem error = errors.New("getting item from cache error")
	// ErrGetHttpProxy 获取http代理错误
	ErrGetHttpProxy error = errors.New("getting http proxy ")
	// ErrGetHttpsProxy 获取https代理错误
	ErrGetHttpsProxy error = errors.New("getting https proxy ")
	// ErrParseSocksProxy 解析socks代理错误
	ErrParseSocksProxy error = errors.New("parse socks proxy ")
	// ErrResponseRead 响应读取失败
	ErrResponseRead error = errors.New("read response to buffer error")
	// ErrResponseParse 响应解析失败
	ErrResponseParse error = errors.New("parse response error")
	// ErrNoMaterNodeLive 找不到主节点
	ErrNoMaterNodeLive error = errors.New("no any master node is active")
)
View Source
var ProcessId string = uuid.New().String()

Functions

func AbsFilePathTest added in v0.5.0

func AbsFilePathTest(t *testing.T, path string) string

func DefaultWatcher added in v0.5.0

func DefaultWatcher(ch chan EventType, hooker EventHooksInterface) error

DefaultWatcher 默认的事件监听器 ch 用于接收事件 hooker 事件处理实例化接口,比如DefaultHooks

func GetEngineID added in v0.5.0

func GetEngineID() string

GetEngineID 获取当前进程的引擎实例id

func GetFunctionName added in v0.4.1

func GetFunctionName(fn Parser) string

GetFunctionName 提取解析函数名

func GetLogger

func GetLogger(Name string) *logrus.Entry

func GetMachineIP added in v0.5.0

func GetMachineIP() (string, error)

GetMachineIP 获取本机ip

func GetParserByName added in v0.4.1

func GetParserByName(spider SpiderInterface, name string) reflect.Value

GetParserByName 通过函数名从spider实例中获取解析函数

func GetUUID

func GetUUID() string

func GoRunner added in v0.5.0

func GoRunner(wg *conc.WaitGroup, funcs ...GoFunc) <-chan error

GoRunner 执行协程任务

func Interface2Uint added in v0.5.0

func Interface2Uint(value interface{}) uint

func MD5 added in v0.5.0

func MD5(s string) string

func Map2String added in v0.4.1

func Map2String(m interface{}) string

Map2String 将map转为string

func NewTestProxyServer added in v0.5.0

func NewTestProxyServer() *httptest.Server

func NewTestServer added in v0.5.0

func NewTestServer() *httptest.Server

func OptimalNumOfBits added in v0.4.1

func OptimalNumOfBits(n int, p float64) int

OptimalNumOfBits 计算位数组长度

func OptimalNumOfHashFunctions added in v0.4.1

func OptimalNumOfHashFunctions(n int, m int) int

OptimalNumOfHashFunctions 计算最优的布隆过滤器哈希函数个数

Types

type BaseSpider

type BaseSpider struct {
	// Name spider name
	Name string

	// FeedUrls feed urls
	FeedUrls []string
}

BaseSpider base spider

func NewBaseSpider

func NewBaseSpider(name string, feedUrls []string) *BaseSpider

NewBaseSpider 构建公共爬虫对象

type CacheInterface

type CacheInterface interface {
	// enqueue ctx写入缓存
	Enqueue(ctx *Context) error
	// dequeue ctx 从缓存出队列
	Dequeue() (interface{}, error)
	// isEmpty 缓存是否为空
	IsEmpty() bool
	// getSize 缓存大小
	GetSize() uint64
	// close 关闭缓存
	Close() error
	// SetCurrentSpider 设置当前的spider
	SetCurrentSpider(spider SpiderInterface)
}

CacheInterface request缓存组件

type CheckMasterLive added in v0.4.1

type CheckMasterLive func() (bool, error)

CheckMasterLive 检查所有的master节点是否都在线

type ComponentInterface added in v0.5.0

type ComponentInterface interface {
	// GetDupefilter 获取过滤器组件
	GetDupefilter() RFPDupeFilterInterface
	// GetQueue 获取请求队列接口
	GetQueue() CacheInterface
	// GetLimiter 限速器组件
	GetLimiter() LimitInterface
	// GetStats 指标统计组件
	GetStats() StatisticInterface
	// GetEventHooks 事件监控组件
	GetEventHooks() EventHooksInterface
	// CheckWorkersStop 爬虫停止的条件
	CheckWorkersStop() bool
	// SetCurrentSpider 当前正在运行的爬虫实例
	SetCurrentSpider(spider SpiderInterface)
	// SpiderBeforeStart 启动StartRequest之前的动作
	SpiderBeforeStart(engine *CrawlEngine, spider SpiderInterface) error
}

ComponentInterface 系统组件接口 包含了爬虫系统运行的必要组件

type Configuration

type Configuration struct {
	// Log *Logger `ymal:"log"`
	*viper.Viper
}
var Config *Configuration = nil

func (*Configuration) GetValue added in v0.4.1

func (c *Configuration) GetValue(key string) (interface{}, error)

type Context

type Context struct {
	// Request 请求对象
	Request *Request

	// Response 响应对象
	Response *Response

	// CtxID context 唯一id由uuid生成
	CtxID string

	// Error 处理过程中的错误信息
	Error error

	// Cancel context.CancelFunc
	Cancel context.CancelFunc

	// Items 读写item的管道
	Items chan *ItemMeta

	// Spider 爬虫实例
	Spider SpiderInterface
	// contains filtered or unexported fields
}

Context 在引擎中的数据流通载体,负责单个抓取任务的生命周期维护

func NewContext

func NewContext(request *Request, Spider SpiderInterface, opts ...ContextOption) *Context

NewContext 从内存池中构建context对象

func NewTestRequest added in v0.5.0

func NewTestRequest(spider SpiderInterface, opts ...RequestOption) *Context

func (*Context) Close added in v0.4.1

func (c *Context) Close()

Close 关闭context

func (*Context) Deadline

func (c *Context) Deadline() (deadline time.Time, ok bool)

Deadline context.Deadline implementation

func (*Context) Done

func (c *Context) Done() <-chan struct{}

Done context.Done implementation

func (*Context) Err

func (c *Context) Err() error

Err context.Err implementation

func (Context) GetCtxID added in v0.5.0

func (c Context) GetCtxID() string

GetCtxID get context id

func (*Context) Value

func (c *Context) Value(key interface{}) interface{}

Value context.WithValue implementation

type ContextOption

type ContextOption func(c *Context)

ContextOption 上下文选项

func WithContext

func WithContext(ctx context.Context) ContextOption

WithContext 设置父context

func WithContextID added in v0.5.0

func WithContextID(ctxID string) ContextOption

WithContextID 设置自定义的ctxId

func WithItemChannelSize added in v0.4.1

func WithItemChannelSize(size int) ContextOption

WithItemChannelSize 设置 items 管道的缓冲大小

type CrawlEngine added in v0.4.1

type CrawlEngine struct {
	// contains filtered or unexported fields
}

CrawlEngine 引擎是整个框架数据流调度核心

func NewEngine added in v0.4.1

func NewEngine(opts ...EngineOption) *CrawlEngine

NewEngine 构建新的引擎

func NewTestEngine added in v0.5.0

func NewTestEngine(spiderName string, opts ...EngineOption) *CrawlEngine

func (*CrawlEngine) EventsWatcherRunner added in v0.4.1

func (e *CrawlEngine) EventsWatcherRunner() error

EventsWatcherRunner 事件监听器运行组件

func (*CrawlEngine) Execute added in v0.4.1

func (e *CrawlEngine) Execute(spiderName string) StatisticInterface

func (*CrawlEngine) GetComponents added in v0.5.0

func (e *CrawlEngine) GetComponents() ComponentInterface

func (*CrawlEngine) GetCurrentSpider added in v0.5.0

func (e *CrawlEngine) GetCurrentSpider() SpiderInterface

GetCurrentSpider 获取当前正在运行的spider

func (*CrawlEngine) GetRuntimeStatus added in v0.5.0

func (e *CrawlEngine) GetRuntimeStatus() *RuntimeStatus

func (*CrawlEngine) GetSpiders added in v0.4.1

func (e *CrawlEngine) GetSpiders() *Spiders

GetSpiders 获取所有的已经注册到引擎的spider实例

func (*CrawlEngine) GetStatic added in v0.5.0

func (e *CrawlEngine) GetStatic() StatisticInterface

GetStatic 获取StatisticInterface 统计指标

func (*CrawlEngine) RegisterDownloadMiddlewares added in v0.4.1

func (e *CrawlEngine) RegisterDownloadMiddlewares(middlewares MiddlewaresInterface)

RegisterDownloadMiddlewares 注册下载中间件到引擎

func (*CrawlEngine) RegisterPipelines added in v0.4.1

func (e *CrawlEngine) RegisterPipelines(pipeline PipelinesInterface)

RegisterPipelines 注册pipelines到引擎

func (*CrawlEngine) RegisterSpiders added in v0.4.1

func (e *CrawlEngine) RegisterSpiders(spider SpiderInterface)

RegisterSpiders 将spider实例注册到引擎的 spiders

func (*CrawlEngine) Scheduler added in v0.4.1

func (e *CrawlEngine) Scheduler() error

Scheduler 调度器

type DefaultComponents added in v0.5.0

type DefaultComponents struct {
	// contains filtered or unexported fields
}

DefaultComponents 默认的组件

func NewDefaultComponents added in v0.5.0

func NewDefaultComponents(opts ...DefaultComponentsOption) *DefaultComponents

func (*DefaultComponents) CheckWorkersStop added in v0.5.0

func (d *DefaultComponents) CheckWorkersStop() bool

func (*DefaultComponents) GetDupefilter added in v0.5.0

func (d *DefaultComponents) GetDupefilter() RFPDupeFilterInterface

func (*DefaultComponents) GetEventHooks added in v0.5.0

func (d *DefaultComponents) GetEventHooks() EventHooksInterface

func (*DefaultComponents) GetLimiter added in v0.5.0

func (d *DefaultComponents) GetLimiter() LimitInterface

func (*DefaultComponents) GetQueue added in v0.5.0

func (d *DefaultComponents) GetQueue() CacheInterface

func (*DefaultComponents) GetStats added in v0.5.0

func (d *DefaultComponents) GetStats() StatisticInterface

func (*DefaultComponents) SetCurrentSpider added in v0.5.0

func (d *DefaultComponents) SetCurrentSpider(spider SpiderInterface)

func (*DefaultComponents) SpiderBeforeStart added in v0.5.0

func (d *DefaultComponents) SpiderBeforeStart(engine *CrawlEngine, spider SpiderInterface) error

type DefaultComponentsOption added in v0.5.0

type DefaultComponentsOption func(d *DefaultComponents)

func DefaultComponentsWithDefaultHooks added in v0.5.0

func DefaultComponentsWithDefaultHooks(events *DefaultHooks) DefaultComponentsOption

func DefaultComponentsWithDefaultLimiter added in v0.5.0

func DefaultComponentsWithDefaultLimiter(limiter *DefaultLimiter) DefaultComponentsOption

func DefaultComponentsWithDefaultQueue added in v0.5.0

func DefaultComponentsWithDefaultQueue(queue *DefaultQueue) DefaultComponentsOption

func DefaultComponentsWithDefaultStatistic added in v0.5.0

func DefaultComponentsWithDefaultStatistic(statistic *DefaultStatistic) DefaultComponentsOption

func DefaultComponentsWithDupefilter added in v0.5.0

func DefaultComponentsWithDupefilter(dupefilter *DefaultRFPDupeFilter) DefaultComponentsOption

type DefaultFieldHook

type DefaultFieldHook struct {
}

func (*DefaultFieldHook) Fire

func (hook *DefaultFieldHook) Fire(entry *logrus.Entry) error

func (*DefaultFieldHook) Levels

func (hook *DefaultFieldHook) Levels() []logrus.Level

type DefaultHooks added in v0.4.5

type DefaultHooks struct {
	// contains filtered or unexported fields
}

func NewDefaultHooks added in v0.4.5

func NewDefaultHooks() *DefaultHooks

NewDefaultHooks 构建新的默认事件监听器

func (*DefaultHooks) Error added in v0.4.5

func (d *DefaultHooks) Error(params ...interface{}) error

Error 处理ERROR事件

func (*DefaultHooks) EventsWatcher added in v0.4.5

func (d *DefaultHooks) EventsWatcher(ch chan EventType) error

EventsWatcher DefaultHooks 的事件监听器

func (*DefaultHooks) Exit added in v0.4.5

func (d *DefaultHooks) Exit(params ...interface{}) error

Exit 处理EXIT事件

func (*DefaultHooks) Heartbeat added in v0.4.5

func (d *DefaultHooks) Heartbeat(params ...interface{}) error

Heartbeat 处理HEARTBEAT事件

func (*DefaultHooks) Pause added in v0.5.0

func (d *DefaultHooks) Pause(params ...interface{}) error

Pause 处理PAUSE事件

func (*DefaultHooks) SetCurrentSpider added in v0.5.0

func (d *DefaultHooks) SetCurrentSpider(spider SpiderInterface)

func (*DefaultHooks) Start added in v0.4.5

func (d *DefaultHooks) Start(params ...interface{}) error

Start 处理START事件

type DefaultLimiter added in v0.5.0

type DefaultLimiter struct {
	// contains filtered or unexported fields
}

DefaultLimiter 默认的限速器

func NewDefaultLimiter added in v0.4.1

func NewDefaultLimiter(limitRate int) *DefaultLimiter

NewDefaultLimiter 创建一个新的限速器 limitRate 最大请求速率

func (*DefaultLimiter) CheckAndWaitLimiterPass added in v0.5.0

func (d *DefaultLimiter) CheckAndWaitLimiterPass() error

CheckAndWaitLimiterPass 检查当前并发量 如果并发量达到上限则等待

func (*DefaultLimiter) SetCurrentSpider added in v0.5.0

func (d *DefaultLimiter) SetCurrentSpider(spider SpiderInterface)

SetCurrentSpider 设置当前的spider名

type DefaultQueue added in v0.5.0

type DefaultQueue struct {
	// contains filtered or unexported fields
}

DefaultQueue request缓存队列

func NewDefaultQueue added in v0.5.0

func NewDefaultQueue(size int) *DefaultQueue

NewDefaultQueue get a new DefaultQueue

func (*DefaultQueue) Close added in v0.5.0

func (c *DefaultQueue) Close() error

Close 关闭缓存

func (*DefaultQueue) Dequeue added in v0.5.0

func (c *DefaultQueue) Dequeue() (interface{}, error)

Dequeue 从队列中获取request对象

func (*DefaultQueue) Enqueue added in v0.5.0

func (c *DefaultQueue) Enqueue(ctx *Context) error

Enqueue request对象入队列

func (*DefaultQueue) GetSize added in v0.5.0

func (c *DefaultQueue) GetSize() uint64

GetSize 缓存大小

func (*DefaultQueue) IsEmpty added in v0.5.0

func (c *DefaultQueue) IsEmpty() bool

IsEmpty 缓存是否为空

func (*DefaultQueue) SetCurrentSpider added in v0.5.0

func (c *DefaultQueue) SetCurrentSpider(spider SpiderInterface)

SetCurrentSpider 设置当前的spider

type DefaultRFPDupeFilter added in v0.5.0

type DefaultRFPDupeFilter struct {
	// contains filtered or unexported fields
}

DefaultRFPDupeFilter 去重组件

func NewRFPDupeFilter

func NewRFPDupeFilter(bloomP float64, bloomN int) *DefaultRFPDupeFilter

NewRFPDupeFilter 新建去重组件 bloomP容错率 bloomN数据规模

func (*DefaultRFPDupeFilter) DoDupeFilter added in v0.5.0

func (f *DefaultRFPDupeFilter) DoDupeFilter(ctx *Context) (bool, error)

DoDupeFilter 通过布隆过滤器对request对象进行去重处理

func (*DefaultRFPDupeFilter) Fingerprint added in v0.5.0

func (f *DefaultRFPDupeFilter) Fingerprint(ctx *Context) ([]byte, error)

Fingerprint 计算指纹

func (*DefaultRFPDupeFilter) SetCurrentSpider added in v0.5.0

func (f *DefaultRFPDupeFilter) SetCurrentSpider(spider SpiderInterface)

type DefaultStatistic added in v0.5.0

type DefaultStatistic struct {

	// Metrics 指标-数值缓存
	Metrics map[string]*uint64
	// contains filtered or unexported fields
}

DefaultStatistic 数据统计指标

func NewDefaultStatistic added in v0.5.0

func NewDefaultStatistic() *DefaultStatistic

NewDefaultStatistic 默认统计数据组件构造函数

func (*DefaultStatistic) Get added in v0.5.0

func (s *DefaultStatistic) Get(metric string) uint64

Get 获取某个指标的数值

func (*DefaultStatistic) GetAllStats added in v0.5.0

func (s *DefaultStatistic) GetAllStats() map[string]uint64

GetAllStats 格式化统计数据

func (*DefaultStatistic) Incr added in v0.5.0

func (s *DefaultStatistic) Incr(metrics string)

Incr 新增一个指标值

func (*DefaultStatistic) SetCurrentSpider added in v0.5.0

func (s *DefaultStatistic) SetCurrentSpider(spider SpiderInterface)

SetCurrentSpider 设置当前的spider

type DistributedWorkerInterface added in v0.4.1

type DistributedWorkerInterface interface {

	// AddNode 新增一个节点
	AddNode() error
	// DelNode 删除当前的节点
	DelNode() error
	// PauseNode 停止当前的节点
	PauseNode() error
	// Heartbeat 心跳
	Heartbeat() error
	// CheckAllNodesStop 检查所有的节点是否都已经停止
	CheckAllNodesStop() (bool, error)
	// CheckMasterLive 检测主节点是否还在线
	CheckMasterLive() (bool, error)
	// SetMaster 是否将当前的节点设置为主节点
	SetMaster(flag bool)
	// SetCurrentSpider 设置当前的spider
	SetCurrentSpider(spider SpiderInterface)
	// GetWorkerID 当前工作节点的id
	GetWorkerID() string
	// IsMaster 是否是主节点
	IsMaster() bool
}

DistributedWorkerInterface 分布式组件接口

type Downloader

type Downloader interface {
	// Download 下载函数
	Download(ctx *Context) (*Response, error)

	// CheckStatus 检查响应状态码的合法性
	CheckStatus(statusCode uint64, allowStatus []uint64) bool
}

Downloader 下载器接口

func NewDownloader

func NewDownloader(opts ...DownloaderOption) Downloader

NewDownloader 构建新的下载器

type DownloaderOption

type DownloaderOption func(d *SpiderDownloader)

DownloaderOption 下载器可选参数函数

func DownloadWithClient

func DownloadWithClient(client http.Client) DownloaderOption

DownloadWithClient 设置下载器的http.Client客户端

func DownloadWithH2 added in v0.4.1

func DownloadWithH2(h2 bool) DownloaderOption

DownloadWithH2 下载器是否开启http2

func DownloadWithTLSConfig added in v0.5.0

func DownloadWithTLSConfig(tls *tls.Config) DownloaderOption

DownloadWithTLSConfig 设置下载器的tls

func DownloadWithTimeout

func DownloadWithTimeout(timeout time.Duration) DownloaderOption

DownloadWithTimeout 设置下载器的网络请求超时时间

func DownloaderWithtransport

func DownloaderWithtransport(transport *http.Transport) DownloaderOption

DownloaderWithtransport 为下载器设置 http.Transport

type EngineOption

type EngineOption func(r *CrawlEngine)

EngineOption 引擎构造过程中的可选参数

func EngineWithComponents added in v0.5.0

func EngineWithComponents(components ComponentInterface) EngineOption

func EngineWithDownloader

func EngineWithDownloader(downloader Downloader) EngineOption

EngineWithDownloader 引擎使用的下载器组件

func EngineWithReqChannelSize added in v0.5.0

func EngineWithReqChannelSize(size int) EngineOption

EngineWithReqChannelSize

func EngineWithUniqueReq

func EngineWithUniqueReq(uniqueReq bool) EngineOption

EngineWithUniqueReq 是否进行去重处理, true则进行去重处理,默认值为true

type ErrorOption

type ErrorOption func(e *HandleError)

ErrorOption HandleError 可选参数

func ErrorWithExtras added in v0.4.1

func ErrorWithExtras(extras map[string]interface{}) ErrorOption

ErrorWithExtras HandleError 添加额外的数据

type EventHooksInterface added in v0.4.1

type EventHooksInterface interface {
	// Start 处理引擎启动事件
	Start(params ...interface{}) error
	// Pause 处理引擎暂停事件
	Pause(params ...interface{}) error
	// Error处理错误事件
	Error(params ...interface{}) error
	// Exit 退出引擎事件
	Exit(params ...interface{}) error
	// Heartbeat 心跳检查事件
	Heartbeat(params ...interface{}) error
	// EventsWatcher 事件监听器
	EventsWatcher(ch chan EventType) error

	SetCurrentSpider(spider SpiderInterface)
}

EventHooksInterface 事件处理函数接口

type EventType added in v0.4.1

type EventType int

EventType 事件类型

const (
	// START 启动
	START EventType = iota
	// HEARTBEAT 心跳
	HEARTBEAT
	// PAUSE 暂停
	PAUSE
	// ERROR 错误
	ERROR
	// EXIT 退出
	EXIT
)

type EventsWatcher added in v0.4.1

type EventsWatcher func(ch chan EventType) error

EventsWatcher 事件监听器

type GoFunc added in v0.4.1

type GoFunc func() error

GoFunc 协程函数

type HandleError

type HandleError struct {
	// CtxID 上下文id
	CtxID string
	// Err 处理过程的错误
	Err error
	// Extras 携带的额外信息
	Extras map[string]interface{}
}

HandleError 错误处理接口

func NewError

func NewError(ctx *Context, err error, opts ...ErrorOption) *HandleError

NewError 构建新的HandleError实例

func (*HandleError) Error

func (e *HandleError) Error() string

Error 获取HandleError错误信息

type Hook added in v0.4.1

type Hook func(params ...interface{}) error

Hook 事件处理函数类型

type ItemInterface

type ItemInterface interface {
}

ItemInterface item实例接口

type ItemMeta

type ItemMeta struct {
	// CtxID 对应的context id
	CtxID string
	// Item item对象
	Item ItemInterface
}

ItemMeta item元数据结构

func NewItem

func NewItem(ctx *Context, item ItemInterface) *ItemMeta

NewItem 构建新的ItemMeta对象

type ItemPipelines

type ItemPipelines []PipelinesInterface

func (ItemPipelines) Len

func (p ItemPipelines) Len() int

func (ItemPipelines) Less

func (p ItemPipelines) Less(i, j int) bool

func (ItemPipelines) Swap

func (p ItemPipelines) Swap(i, j int)

type LimitInterface added in v0.4.1

type LimitInterface interface {
	// CheckAndWaitLimiterPass 检查当前并发量
	// 如果并发量达到上限则等待
	CheckAndWaitLimiterPass() error
	// SetCurrentSpider 设置当前正在运行的spider
	SetCurrentSpider(spider SpiderInterface)
}

LimitInterface 限速器接口

type Middlewares

type Middlewares []MiddlewaresInterface

Middlewares 下载中间件队列

func (Middlewares) Len

func (p Middlewares) Len() int

实现sort接口

func (Middlewares) Less

func (p Middlewares) Less(i, j int) bool

func (Middlewares) Swap

func (p Middlewares) Swap(i, j int)

type MiddlewaresBase

type MiddlewaresBase struct {
	Priority int
}

type MiddlewaresInterface

type MiddlewaresInterface interface {
	// GetPriority 获取优先级,数字越小优先级越高
	GetPriority() int

	// ProcessRequest 处理request请求对象
	// 此处用于增加请求头
	// 按优先级执行
	ProcessRequest(ctx *Context) error

	// ProcessResponse 用于处理请求成功之后的response
	// 执行顺序为逆优先级,即优先级越高执行顺序越晚
	ProcessResponse(ctx *Context, req chan<- *Context) error

	// GetName 获取中间件的名称
	GetName() string
}

MiddlewaresInterface 下载中间件的接口用于处理进入下载器之前的request对象 和下载之后的response

type Parser

type Parser func(resp *Context, req chan<- *Context) error

Parser 响应解析函数结构

type PipelinesBase

type PipelinesBase struct {
	Priority int
}

type PipelinesInterface

type PipelinesInterface interface {
	// GetPriority 获取当前pipeline的优先级
	GetPriority() int
	// ProcessItem item处理单元
	ProcessItem(spider SpiderInterface, item *ItemMeta) error
}

PipelinesInterface pipeline 接口 pipeline 主要用于处理item,例如数据存储、数据清洗 将多个pipeline注册到引擎可以实现责任链模式的数据处理

type ProcessResponse

type ProcessResponse func(ctx *Context) error

ProcessResponse 处理下载之后的response函数

type Proxy

type Proxy struct {
	// ProxyUrl 代理链接
	ProxyUrl string
}

Proxy 代理数据结构

type RFPDupeFilterInterface

type RFPDupeFilterInterface interface {
	// Fingerprint request指纹计算
	Fingerprint(ctx *Context) ([]byte, error)

	// DoDupeFilter request去重
	DoDupeFilter(ctx *Context) (bool, error)

	SetCurrentSpider(spider SpiderInterface)
}

RFPDupeFilterInterface request 对象指纹计算和布隆过滤器去重

type RedirectError

type RedirectError struct {
	RedirectNum int
}

RedirectError 重定向错误

func (*RedirectError) Error

func (e *RedirectError) Error() string

Error 获取RedirectError错误

type Request

type Request struct {
	// Url 请求Url
	Url string `json:"url"`
	// Headers 请求头
	Headers map[string]string `json:"headers"`
	// Method 请求方式
	Method RequestMethod `json:"method"`

	// Params 请求url的参数
	Params map[string]string `json:"params"`
	// Proxy 代理实例
	Proxy *Proxy `json:"-"`
	// Cookies 请求携带的cookies
	Cookies map[string]string `json:"cookies"`
	// Meta 请求携带的额外的信息
	Meta map[string]interface{} `json:"meta"`
	// AllowRedirects 是否允许跳转默认允许
	AllowRedirects bool `json:"allowRedirects"`
	// MaxRedirects 最大的跳转次数
	MaxRedirects int `json:"maxRedirects"`
	// Parser 该请求绑定的响应解析函数名,必须是某个spider实例的解析函数
	Parser string `json:"parser"`
	// MaxConnsPerHost 单个域名最大的连接数
	MaxConnsPerHost int `json:"maxConnsPerHost"`

	// AllowStatusCode 允许的状态码
	AllowStatusCode []uint64 `json:"allowStatusCode"`
	// Timeout 请求超时时间
	Timeout time.Duration `json:"timeout"`
	// DoNotFilter
	DoNotFilter bool
	// contains filtered or unexported fields
}

Request 请求对象的结构

func NewRequest

func NewRequest(url string, method RequestMethod, parser Parser, opts ...RequestOption) *Request

请注意parser函数必须是某一个spiderinterface实例的解析函数 否则无法正常调用该解析函数

func RequestFromMap added in v0.4.1

func RequestFromMap(src map[string]interface{}, opts ...RequestOption) *Request

RequestFromMap 从map创建requests

func (*Request) ToMap added in v0.4.1

func (r *Request) ToMap() (map[string]interface{}, error)

ToMap 将request对象转为map

type RequestMethod added in v0.4.1

type RequestMethod string

RequestMethod 请求方式

const (
	// GET 请求
	GET RequestMethod = "GET"
	// POST 请求
	POST RequestMethod = "POST"
	// PUT 请求
	PUT RequestMethod = "PUT"
	// DELETE 请求
	DELETE RequestMethod = "DELETE"
	// OPTIONS 请求
	OPTIONS RequestMethod = "OPTIONS"
	// HEAD 请求
	HEAD RequestMethod = "HEAD"
)

type RequestOption added in v0.4.1

type RequestOption func(r *Request)

RequestOption NewRequest 可选参数

func RequestWithAllowRedirects

func RequestWithAllowRedirects(allowRedirects bool) RequestOption

RequestWithAllowRedirects 设置是否允许跳转 如果不允许则MaxRedirects=0

func RequestWithAllowedStatusCode added in v0.4.1

func RequestWithAllowedStatusCode(allowStatusCode []uint64) RequestOption

RequestWithAllowedStatusCode 设置AllowStatusCode

func RequestWithBodyReader added in v0.5.0

func RequestWithBodyReader(body io.Reader) RequestOption

RequestWithBodyReader set request body io.Reader

func RequestWithDoNotFilter added in v0.4.6

func RequestWithDoNotFilter(doNotFilter bool) RequestOption

RequestWithDoNotFilter 设置当前请求是否进行过滤处理, true则认为该条请求无需进入去重流程,默认值为false

func RequestWithMaxConnsPerHost

func RequestWithMaxConnsPerHost(maxConnsPerHost int) RequestOption

RequestWithMaxConnsPerHost 设置MaxConnsPerHost

func RequestWithMaxRedirects

func RequestWithMaxRedirects(maxRedirects int) RequestOption

RequestWithMaxRedirects 设置最大的跳转次数 若maxRedirects <= 0则认为不允许跳转AllowRedirects = false

func RequestWithParser added in v0.4.1

func RequestWithParser(parser Parser) RequestOption

RequestWithParser 设置Parser

func RequestWithPostForm added in v0.5.0

func RequestWithPostForm(payload url.Values) RequestOption

RequestWithPostForm set application/x-www-form-urlencoded request body reader

func RequestWithRequestBody

func RequestWithRequestBody(body map[string]interface{}) RequestOption

RequestWithRequestBody 传入请求体到request

func RequestWithRequestBytesBody added in v0.4.1

func RequestWithRequestBytesBody(body []byte) RequestOption

RequestWithRequestBytesBody request绑定bytes body

func RequestWithRequestCookies

func RequestWithRequestCookies(cookies map[string]string) RequestOption

RequestWithRequestCookies 设置cookie

func RequestWithRequestHeader

func RequestWithRequestHeader(headers map[string]string) RequestOption

RequestWithRequestHeader 设置请求头

func RequestWithRequestMeta

func RequestWithRequestMeta(meta map[string]interface{}) RequestOption

RequestWithRequestMeta 设置 meta

func RequestWithRequestParams

func RequestWithRequestParams(params map[string]string) RequestOption

RequestWithRequestParams 设置请求的url参数

func RequestWithRequestProxy

func RequestWithRequestProxy(proxy Proxy) RequestOption

RequestWithRequestProxy 设置代理

func RequestWithTimeout added in v0.4.1

func RequestWithTimeout(timeout time.Duration) RequestOption

RequestWithTimeout 设置请求超时时间 若timeout<=0则认为没有超时时间

type Response

type Response struct {
	// Status状态码
	Status int
	// Headers 响应头
	Headers map[string][]string // Header response header
	// Delay 请求延迟
	Delay float64 // Delay the time of handle download request
	// ContentLength 响应体大小
	ContentLength uint64 // ContentLength response content length
	// URL 请求url
	URL string // URL of request url
	// Buffer 响应体缓存
	Buffer *bytes.Buffer // buffer read response buffer

	Body io.ReadCloser
	// contains filtered or unexported fields
}

Response 请求响应体的结构

func NewResponse

func NewResponse() *Response

NewResponse 从内存池中创建新的response对象

func (*Response) Json

func (r *Response) Json() (map[string]interface{}, error)

Json 将响应数据转为json

func (*Response) String

func (r *Response) String() (string, error)

String 将响应数据转为string

func (*Response) WriteTo added in v0.5.0

func (r *Response) WriteTo(writer io.Writer) (int64, error)

type RuntimeStatus added in v0.5.0

type RuntimeStatus struct {
	StartAt   int64
	Duration  float64
	StopAt    int64
	RestartAt int64
	// StatusOn 当前引擎的状态
	StatusOn StatusType
}

func NewRuntimeStatus added in v0.5.0

func NewRuntimeStatus() *RuntimeStatus

func (*RuntimeStatus) GetDuration added in v0.5.0

func (r *RuntimeStatus) GetDuration() float64

GetDuration 爬虫运行时长

func (*RuntimeStatus) GetRestartAt added in v0.5.0

func (r *RuntimeStatus) GetRestartAt() int64

GetRestartAt 获取引擎重启的时间戳

func (*RuntimeStatus) GetStartAt added in v0.5.0

func (r *RuntimeStatus) GetStartAt() int64

GetStartAt 获取引擎启动的时间戳

func (*RuntimeStatus) GetStatusOn added in v0.5.0

func (r *RuntimeStatus) GetStatusOn() StatusType

GetStatusOn 获取引擎的状态

func (*RuntimeStatus) GetStopAt added in v0.5.0

func (r *RuntimeStatus) GetStopAt() int64

GetStopAt 爬虫停止的时间戳

func (*RuntimeStatus) SetDuration added in v0.5.0

func (r *RuntimeStatus) SetDuration(duration float64)

func (*RuntimeStatus) SetRestartAt added in v0.5.0

func (r *RuntimeStatus) SetRestartAt(startAt int64)

func (*RuntimeStatus) SetStartAt added in v0.5.0

func (r *RuntimeStatus) SetStartAt(startAt int64)

func (*RuntimeStatus) SetStatus added in v0.5.0

func (r *RuntimeStatus) SetStatus(status StatusType)

SetStatus 设置引擎状态 用于控制引擎的启停

func (*RuntimeStatus) SetStopAt added in v0.5.0

func (r *RuntimeStatus) SetStopAt(stopAt int64)

type Settings

type Settings interface {
	// GetValue 获取指定的参数值
	GetValue(key string) (interface{}, error)
}

type SpiderDownloader

type SpiderDownloader struct {

	// ProxyFunc 对单个请求进行代理设置
	ProxyFunc func(req *http.Request) (*url.URL, error)
	// contains filtered or unexported fields
}

SpiderDownloader tegenaria 爬虫下载器

func (*SpiderDownloader) CheckStatus

func (d *SpiderDownloader) CheckStatus(statusCode uint64, allowStatus []uint64) bool

CheckStatus 检查状态码是否合法

func (*SpiderDownloader) Download

func (d *SpiderDownloader) Download(ctx *Context) (*Response, error)

Download 处理request请求

type SpiderInterface

type SpiderInterface interface {
	// StartRequest 通过GetFeedUrls()获取种子
	// urls并构建初始请求
	StartRequest(req chan<- *Context)

	// Parser 默认的请求响应解析函数
	// 在解析过程中生成的新的请求可以推送到req channel
	Parser(resp *Context, req chan<- *Context) error

	// ErrorHandler 错误处理函数,允许在此过程中生成新的请求
	// 并推送到req channel
	ErrorHandler(err *Context, req chan<- *Context)

	// GetName 获取spider名称
	GetName() string
	// GetFeedUrls 获取种子urls
	GetFeedUrls() []string
}

SpiderInterface is the Tegenaria spider interface; developers must implement this interface to build a custom spider.

type Spiders

type Spiders struct {
	// SpidersModules spider名称和spider实例的映射
	SpidersModules map[string]SpiderInterface
	// Parsers parser函数名和函数的映射
	// 用于序列化和反序列化
	Parsers map[string]Parser
}

Spiders 全局spiders管理器 用于接收注册的SpiderInterface实例

var SpidersList *Spiders

SpidersList 注册到引擎的爬虫列表

func NewSpiders

func NewSpiders() *Spiders

NewSpiders 构建Spiders实例

func (*Spiders) GetAllSpidersName added in v0.5.0

func (s *Spiders) GetAllSpidersName() []string

func (*Spiders) GetSpider

func (s *Spiders) GetSpider(name string) (SpiderInterface, error)

GetSpider 通过爬虫名获取spider实例

func (*Spiders) Register

func (s *Spiders) Register(spider SpiderInterface) error

Register spider实例注册到Spiders.SpidersModules

type StatisticInterface added in v0.4.1

type StatisticInterface interface {
	// GetAllStats 获取所有的指标数据
	GetAllStats() map[string]uint64
	// Incr 指定的指标计数器自增1
	Incr(metric string)
	// Get 获取指标的数值
	Get(metric string) uint64
	// SetCurrentSpider 设置当前的爬虫实例
	SetCurrentSpider(spider SpiderInterface)
}

StatisticInterface 数据统计组件接口

type StatsFieldType added in v0.4.1

type StatsFieldType string

StatsFieldType 统计指标的数据类型

type StatusType added in v0.5.0

type StatusType uint

StatusType 当前引擎的状态

const (
	// ON_START 启动状态
	ON_START StatusType = iota
	// ON_STOP 停止状态
	ON_STOP
	// ON_PAUSE 暂停状态
	ON_PAUSE
)

func (StatusType) GetTypeName added in v0.5.0

func (p StatusType) GetTypeName() string

GetTypeName 获取引擎状态的字符串形式

type TestDownloadMiddler added in v0.5.0

type TestDownloadMiddler struct {
	Priority int
	Name     string
}

func (TestDownloadMiddler) GetName added in v0.5.0

func (m TestDownloadMiddler) GetName() string

func (TestDownloadMiddler) GetPriority added in v0.5.0

func (m TestDownloadMiddler) GetPriority() int

func (TestDownloadMiddler) ProcessRequest added in v0.5.0

func (m TestDownloadMiddler) ProcessRequest(ctx *Context) error

func (TestDownloadMiddler) ProcessResponse added in v0.5.0

func (m TestDownloadMiddler) ProcessResponse(ctx *Context, req chan<- *Context) error

type TestDownloadMiddler2 added in v0.5.0

type TestDownloadMiddler2 struct {
	Priority int
	Name     string
}

func (TestDownloadMiddler2) GetName added in v0.5.0

func (m TestDownloadMiddler2) GetName() string

func (TestDownloadMiddler2) GetPriority added in v0.5.0

func (m TestDownloadMiddler2) GetPriority() int

func (TestDownloadMiddler2) ProcessRequest added in v0.5.0

func (m TestDownloadMiddler2) ProcessRequest(ctx *Context) error

func (TestDownloadMiddler2) ProcessResponse added in v0.5.0

func (m TestDownloadMiddler2) ProcessResponse(ctx *Context, req chan<- *Context) error

type TestItemPipeline added in v0.5.0

type TestItemPipeline struct {
	Priority int
}

func (*TestItemPipeline) GetPriority added in v0.5.0

func (p *TestItemPipeline) GetPriority() int

func (*TestItemPipeline) ProcessItem added in v0.5.0

func (p *TestItemPipeline) ProcessItem(spider SpiderInterface, item *ItemMeta) error

type TestItemPipeline2 added in v0.5.0

type TestItemPipeline2 struct {
	Priority int
}

func (*TestItemPipeline2) GetPriority added in v0.5.0

func (p *TestItemPipeline2) GetPriority() int

func (*TestItemPipeline2) ProcessItem added in v0.5.0

func (p *TestItemPipeline2) ProcessItem(spider SpiderInterface, item *ItemMeta) error

type TestItemPipeline3 added in v0.5.0

type TestItemPipeline3 struct {
	Priority int
}

func (*TestItemPipeline3) GetPriority added in v0.5.0

func (p *TestItemPipeline3) GetPriority() int

func (*TestItemPipeline3) ProcessItem added in v0.5.0

func (p *TestItemPipeline3) ProcessItem(spider SpiderInterface, item *ItemMeta) error

type TestItemPipeline4 added in v0.5.0

type TestItemPipeline4 struct {
	Priority int
}

func (*TestItemPipeline4) GetPriority added in v0.5.0

func (p *TestItemPipeline4) GetPriority() int

func (*TestItemPipeline4) ProcessItem added in v0.5.0

func (p *TestItemPipeline4) ProcessItem(spider SpiderInterface, item *ItemMeta) error

type TestSpider added in v0.5.0

type TestSpider struct {
	*BaseSpider
}

func (*TestSpider) ErrorHandler added in v0.5.0

func (s *TestSpider) ErrorHandler(err *Context, req chan<- *Context)

func (*TestSpider) GetFeedUrls added in v0.5.0

func (s *TestSpider) GetFeedUrls() []string

func (*TestSpider) GetName added in v0.5.0

func (s *TestSpider) GetName() string

func (*TestSpider) Parser added in v0.5.0

func (s *TestSpider) Parser(resp *Context, req chan<- *Context) error

func (*TestSpider) StartRequest added in v0.5.0

func (s *TestSpider) StartRequest(req chan<- *Context)

Directories

Path Synopsis
example
rpc
pb
Package pb is a reverse proxy.
Package pb is a reverse proxy.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL