spider

package
v0.0.0-...-caa0154 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 23, 2023 License: MIT Imports: 10 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Context

// Context pairs a Request with a byte-slice body. It is the argument type
// of Rule.ParseFunc.
type Context struct {
	// Req is the request associated with this context.
	Req  *Request
	// Body is a raw byte payload.
	// NOTE(review): presumably the fetched response for Req — confirm.
	Body []byte
}

func (*Context) GetRule

func (ctx *Context) GetRule(ruleName string) *Rule

func (*Context) Output

func (ctx *Context) Output(data interface{}) *DataCell

func (*Context) OutputJs

func (ctx *Context) OutputJs(reg string) ParseResult

func (*Context) ParseJsReg

func (ctx *Context) ParseJsReg(name string, reg string) ParseResult

ParseJsReg is a parsing rule.

type DataCell

// DataCell pairs a map of named values with a Task. Storage.Save accepts
// DataCell pointers.
type DataCell struct {
	// Data maps string keys to arbitrary values.
	Data map[string]interface{}
	// Task is the task associated with this cell.
	Task *Task
}

func (*DataCell) GetTableName

func (d *DataCell) GetTableName() string

func (*DataCell) GetTaskName

func (d *DataCell) GetTaskName() string

type Fetcher

// Fetcher is implemented by types that can turn a Request into raw bytes.
type Fetcher interface {
	// Get takes a Request and returns a byte slice or an error.
	// NOTE(review): presumably the bytes are the response body — confirm.
	Get(url *Request) ([]byte, error)
}

type LimitConfig

// LimitConfig holds three integer settings: a count, a duration in seconds,
// and a bucket size. TaskConfig carries a slice of these.
// NOTE(review): presumably EventCount events are allowed per EventDur
// seconds, with Bucket as the burst size — confirm against the limiter setup.
type LimitConfig struct {
	EventCount int // count
	EventDur   int // seconds
	Bucket     int // bucket size
}

type Option

// Option is a function that receives a pointer to Options and may modify it.
// NewTask accepts a variadic list of Option values.
type Option func(opts *Options)

func WithCookie

func WithCookie(cookie string) Option

func WithFetcher

func WithFetcher(f Fetcher) Option

func WithLogger

func WithLogger(logger *zap.Logger) Option

func WithMaxDepth

func WithMaxDepth(maxDepth int64) Option

func WithName

func WithName(name string) Option

func WithReload

func WithReload(reload bool) Option

func WithStorage

func WithStorage(s Storage) Option

func WithURL

func WithURL(url string) Option

func WithWaitTime

func WithWaitTime(waitTime int64) Option

type Options

// Options collects the settings of a task. Task embeds it, and each Option
// function receives a pointer to it.
type Options struct {
	Name     string `json:"name"` // task name; should be kept unique
	URL      string `json:"url"`
	Cookie   string `json:"cookie"`
	WaitTime int64  `json:"wait_time"` // random sleep time, in seconds
	Reload   bool   `json:"reload"`    // whether the site may be crawled repeatedly
	MaxDepth int64  `json:"max_depth"`
	Fetcher  Fetcher
	Storage  Storage
	Limiter  limiter.RateLimiter
	// contains filtered or unexported fields
}

type ParseResult

// ParseResult holds a list of Requests and a list of arbitrary items. It is
// the result type of Rule.ParseFunc.
type ParseResult struct {
	// Requests is a list of Request pointers.
	// NOTE(review): presumably follow-up requests to schedule — confirm.
	Requests []*Request
	// Items is a list of arbitrary values.
	// NOTE(review): presumably the extracted data — confirm.
	Items    []interface{}
}

type Property

// Property holds the JSON-tagged basic settings of a task. TaskModule
// embeds it.
type Property struct {
	Name     string `json:"name"` // task name; globally unique
	Url      string `json:"url"`
	Cookie   string `json:"cookie"`
	WaitTime int64  `json:"wait_time"` // random sleep time, in seconds
	MaxDepth int64  `json:"max_depth"` // maximum crawl depth
	Reload   bool   `json:"reload"`    // whether the site may be crawled repeatedly
}

type Request

// Request is a single task request.
type Request struct {
	Method   string
	// Task is the task associated with this request.
	Task     *Task
	URL      string
	Depth    int64
	Priority int64
	// RuleName names a rule.
	// NOTE(review): presumably a key into RuleTree.Trunk — confirm.
	RuleName string
	// TmpData points to a Temp, which exposes Get and Set by string key.
	TmpData  *Temp
}

Request is a single task request.

func (*Request) Check

func (r *Request) Check() error

func (*Request) Fetch

func (r *Request) Fetch() ([]byte, error)

func (*Request) Unique

func (r *Request) Unique() string

Unique generates the unique identifier of the request.

type Rule

// Rule is a collection rule node.
type Rule struct {
	// ItemFields is a list of field names.
	ItemFields []string
	ParseFunc  func(*Context) (ParseResult, error) // content parsing function
}

Rule is a collection rule node.

type RuleModule

// RuleModule is a JSON-tagged description of a rule: a name plus a string
// stored under the "parse_script" key. TaskModule carries a slice of these.
type RuleModule struct {
	Name      string `json:"name"`
	// ParseFunc is a string here, unlike the function-typed Rule.ParseFunc.
	// NOTE(review): presumably script source for the parse step — confirm.
	ParseFunc string `json:"parse_script"`
}

type RuleTree

// RuleTree is the collection rule tree: a root function plus a map of
// Rules keyed by string.
type RuleTree struct {
	Root  func() ([]*Request, error) // root node (execution entry point)
	Trunk map[string]*Rule           // rule hash table
}

RuleTree is the collection rule tree.

type Storage

// Storage is implemented by types that can save DataCells.
type Storage interface {
	// Save takes zero or more DataCell pointers and returns an error.
	Save(ds ...*DataCell) error
}

type Task

// Task is a task instance. It embeds Options, so the option fields are
// accessible directly on the Task.
type Task struct {
	Visited     map[string]bool // whether the site has been crawled
	// NOTE(review): presumably guards Visited — confirm. Because of this
	// mutex field, a Task should not be copied by value.
	VisitedLock sync.Mutex
	Rule        RuleTree
	Closed      bool // marks that the task has been deleted

	Options
}

Task is a task instance.

func NewTask

func NewTask(opts ...Option) *Task

type TaskConfig

// TaskConfig is a plain-value description of a task. Unlike Options, its
// Fetcher field is a string and it carries a list of LimitConfig values.
type TaskConfig struct {
	Name     string
	Cookie   string
	WaitTime int64
	Reload   bool
	MaxDepth int64
	// NOTE(review): presumably the name of a Fetcher implementation to
	// select — confirm.
	Fetcher  string
	Limits   []LimitConfig
}

type TaskModule

// TaskModule is a JSON-tagged description of a task: the embedded Property
// settings, a string stored under "root_script", and a list of RuleModules
// stored under "rule".
type TaskModule struct {
	Property
	// NOTE(review): presumably script source for the root step — confirm.
	Root  string       `json:"root_script"`
	Rules []RuleModule `json:"rule"`
}

type Temp

// Temp has only unexported fields; it is used through its Get and Set
// methods, which are keyed by string. Request.TmpData points to one.
type Temp struct {
	// contains filtered or unexported fields
}

func (*Temp) Get

func (t *Temp) Get(key string) interface{}

func (*Temp) Set

func (t *Temp) Set(key string, value interface{}) error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL