gospider

package module
v0.1.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 2, 2021 License: MIT Imports: 25 Imported by: 0

README

gospider

用go写的简单爬虫

程序由以下几部分组成

下载器(Downloader)    负责下载网页           
处理器(Handler)       负责解析网页           
调度器(Scheduler)     负责调度待处理请求            
结果处理器(Pipeline)  负责持久化数据或者其他的任务         
监听器(Listener)      监听下载情况           
代理提供者(ProxyProvider)     代理ip提供         
客户端生成器(ClientGenerator)  客户端生成          
存储器(Store)      提供数据存储,默认实现是基于goleveldb,也可自行实现基于其他数据库的          

这些都有默认的实现,也可以自己根据接口实现相应的组件

一般使用只需要自己实现处理器(Handler)和结果处理器(Pipeline)

DEMO1:

// TestExample_1 is the minimal demo: it creates a spider seeded with one URL
// and runs it without configuring any component explicitly.
func TestExample_1(t *testing.T) {
    fmt.Println("start spider....")
    // No further configuration is applied, so the spider is run directly.
    NewSpider("https://studygolang.com/pkgdoc").Run()
}

DEMO2:

// TestExample_2 is the configured demo: it creates a spider, sets its timeout,
// sleep time, request header and goroutine count, and then runs it. The
// commented-out calls show where custom components can be plugged in.
func TestExample_2(t *testing.T) {
  fmt.Println("start spider....")
  spider := NewSpider("https://studygolang.com/pkgdoc")
  spider.SetTimeOut(10 * time.Second) // exit after 10s (per SetTimeOut: the time after which the program exits once there is no more data)
  // NOTE(review): this is the same URL already passed to NewSpider; presumably harmless — confirm whether duplicates are filtered.
  spider.AddSeedUrl("https://studygolang.com/pkgdoc")
  spider.SetSleepTime(1 * time.Second)
  spider.AddHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36")
  spider.SetGoroutines(1)
  //spider.SetDownloader() // set a custom downloader; it must implement the Downloader interface; the default downloader is used when unset
  //spider.AddHandler() // add a custom handler for the HTML text; it must implement the Handler interface; the default is used when unset; several can be added
  //spider.AddPipeline() // add a custom result processor; it must implement the Pipeline interface and processes the data returned by the Handler; the default is used when unset; several can be added
  //spider.SetScheduler() // set a custom scheduler; it must implement the Scheduler interface; the default is used when unset
  spider.Run()
}

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ErrorSkip is a sentinel error whose message is "skip".
// NOTE(review): presumably returned to signal that an item should be skipped
// rather than treated as a failure — confirm against the callers.
var ErrorSkip = errors.New("skip")

Functions

func CreateDataDB

func CreateDataDB(path string) *leveldb.DB

CreateDataDB 创建存储库

func DecodeGBK

func DecodeGBK(s []byte) ([]byte, error)

DecodeGBK 将gbk编码转换为utf8

func RequestStringify

func RequestStringify(req Request) (string, error)

Types

type ByteHandler

// ByteHandler transforms a byte slice. Spider.SetByteHandler describes it as
// the processor applied to downloaded bytes.
type ByteHandler interface {
	// Handle receives the bytes s and returns the processed bytes or an error.
	Handle(s []byte) ([]byte, error)
}

type ClientGenerator

// ClientGenerator is the client generator component: it hands out the
// *http.Client values.
type ClientGenerator interface {
	// Generate returns an *http.Client.
	Generate() *http.Client
	// SetProxyProvider sets the ProxyProvider.
	// NOTE(review): presumably Generate consults it when building clients — confirm.
	SetProxyProvider(pxyProvider ProxyProvider)
}

ClientGenerator 客户端生成器

type ConsoleHandler

// ConsoleHandler is the default implementation of Handler. It has no fields.
type ConsoleHandler struct {
}

Handler的默认实现

func (*ConsoleHandler) Handle

func (hh *ConsoleHandler) Handle(resp Response, handleResult *Result, ctx context.Context) error

处理结果写入handleResult 返回 false则不处理

type ConsolePipeline

// ConsolePipeline is the default implementation of Pipeline. It has no fields.
type ConsolePipeline struct {
}

Pipeline的默认实现

func (*ConsolePipeline) Process

func (c *ConsolePipeline) Process(handleResult *Result, ctx context.Context) error

type DataStore

type DataStore struct {
	// contains filtered or unexported fields
}

Store的默认实现

func (*DataStore) Add

func (lvdb *DataStore) Add(key string, value string) error

func (*DataStore) BatchAdd

func (lvdb *DataStore) BatchAdd(m map[string]string) error

func (*DataStore) Clear

func (lvdb *DataStore) Clear(prefix string, limit ...string)

func (*DataStore) Del

func (lvdb *DataStore) Del(key string) error

func (*DataStore) Get

func (lvdb *DataStore) Get(key string) (string, error)

func (*DataStore) List

func (lvdb *DataStore) List(prefix string, limit ...string) ([]string, error)

type DefaultListener

type DefaultListener struct {
	// contains filtered or unexported fields
}

Listener的默认实现

func (*DefaultListener) OnError

func (listen *DefaultListener) OnError(req Request, e error, ctx context.Context)

func (*DefaultListener) OnSuccess

func (listen *DefaultListener) OnSuccess(req Request, ctx context.Context)

type Downloader

// Downloader is the component responsible for downloading web pages.
type Downloader interface {
	// Download takes the request req and returns a Response or an error.
	// NOTE(review): ctx is the second parameter here, not the first as Go
	// convention suggests; reordering it would break existing implementations.
	Download(req *Request, ctx context.Context) (resp *Response, err error)
	// SetClientGenerator sets the ClientGenerator.
	// NOTE(review): presumably the source of the HTTP client used by Download — confirm.
	SetClientGenerator(generator ClientGenerator)
}

type GBKByteHandler

// GBKByteHandler is a field-less type whose Handle method has the ByteHandler
// signature.
// NOTE(review): presumably it converts GBK-encoded bytes to UTF-8 (compare
// DecodeGBK) — confirm against the implementation.
type GBKByteHandler struct {
}

func (*GBKByteHandler) Handle

func (h *GBKByteHandler) Handle(s []byte) ([]byte, error)

type Handler

// Handler is the component responsible for parsing web pages.
type Handler interface {
	// Handle writes its processing result into handleResult.
	// NOTE(review): the original comment said "returning false means no further
	// processing", but the method returns an error, not a bool; presumably a
	// non-nil error is the skip signal — confirm against the caller.
	Handle(resp Response, handleResult *Result, ctx context.Context) error
}

type HttpDownloader

type HttpDownloader struct {
	// contains filtered or unexported fields
}

Downloader的默认实现

func (*HttpDownloader) Download

func (d *HttpDownloader) Download(request *Request, ctx context.Context) (r *Response, err error)

func (*HttpDownloader) SetClientGenerator

func (d *HttpDownloader) SetClientGenerator(generator ClientGenerator)

type Listener

// Listener is the listening interface used to observe how downloads go.
type Listener interface {
	// OnError receives the request req together with the error e.
	OnError(req Request, e error, ctx context.Context)
	// OnSuccess receives the request req.
	// NOTE(review): presumably called after a successful download — confirm.
	OnSuccess(req Request, ctx context.Context)
}

监听接口

type Pipeline

// Pipeline is the result processor: it is responsible for persisting data or
// other follow-up tasks, working on the data returned by a Handler.
type Pipeline interface {
	// Process receives the handler result handleResult and returns an error on failure.
	Process(handleResult *Result, ctx context.Context) error
}

type Proxy

// Proxy is the proxy object. All of its parts, including Port, are strings.
type Proxy struct {
	Scheme   string // NOTE(review): presumably the URL scheme, e.g. "http" — confirm
	Host     string
	Port     string
	Username string // NOTE(review): presumably optional proxy credentials — confirm
	Password string
}

Proxy 代理对象

func CreateProxy

func CreateProxy(url url.URL) Proxy

func (Proxy) String

func (pxy Proxy) String() string

type ProxyProvider

// ProxyProvider is the proxy provider: it supplies proxies.
type ProxyProvider interface {
	// GetProxy returns a *Proxy.
	// NOTE(review): whether nil can be returned when no proxy is available is
	// not visible here — confirm.
	GetProxy() *Proxy
	// AddProxy adds one or more proxies.
	AddProxy(pxy ...Proxy)
}

ProxyProvider 代理提供器

type Request

// Request describes a single request: its target, method, headers, the
// downloader to use, extra data, and bookkeeping such as state and retry count.
type Request struct {
	Id         string
	Url        string                 // address of the requested resource
	Method     string                 // request method
	Header     map[string][]string    // request headers
	Downloader Downloader             // downloader
	Extras     map[string]interface{} // extra information
	Skip       bool                   // skip the request without processing it
	State      RequestState           // state of the request
	CycleTime  int                    // number of times the request is repeated after it fails
}

func NewRequest

func NewRequest() Request

func ParseRequest

func ParseRequest(str string) (*Request, error)

func (*Request) AddExtras

func (req *Request) AddExtras(key string, value interface{})

func (*Request) GetExtras

func (req *Request) GetExtras(key string) interface{}

type RequestFilter

// RequestFilter is the request filter; the package documents it as removing
// duplicate URLs.
type RequestFilter interface {
	// Filter takes requests and returns a slice of Request.
	// NOTE(review): presumably the subset that should still be processed — confirm.
	Filter(requests ...Request) []Request
}

RequestFilter 去除重复的url

type RequestHandle

// RequestHandle is a function that receives a *Request. Spider.PreHandleRequest
// has this type and is documented as the request processing done before a
// request is executed.
type RequestHandle func(req *Request)

type RequestScheduler

type RequestScheduler struct {
	// contains filtered or unexported fields
}

Scheduler的默认实现

func (*RequestScheduler) Len

func (s *RequestScheduler) Len() int

func (*RequestScheduler) Poll

func (s *RequestScheduler) Poll() Request

func (*RequestScheduler) PollN

func (s *RequestScheduler) PollN(n int) ([]Request, int)

func (*RequestScheduler) Push

func (s *RequestScheduler) Push(reqs ...Request)

type RequestState

// RequestState is the string-typed state of a request (see Request.State).
type RequestState string
// The defined request states.
// NOTE(review): presumably RequestNormal is the initial state, and the other
// two record the outcome of a download — confirm.
const (
	RequestNormal  RequestState = "normal"
	RequestSuccess RequestState = "success"
	RequestError   RequestState = "error"
)

type Response

// Response is what Downloader.Download returns and what Handler.Handle receives.
type Response struct {
	Body          []byte // NOTE(review): presumably the full downloaded body — confirm
	Status        string // e.g. "200 OK"
	StatusCode    int    // e.g. 200
	Header        map[string][]string
	ContentLength int64
	Request       *Request // NOTE(review): presumably the request that produced this response — confirm
}

type Result

// Result is the value a Handler writes its output into and a Pipeline later
// processes.
type Result struct {
	// TargetRequests holds requests; AddTargetRequest and AddTargetUrl exist as helpers.
	// NOTE(review): presumably follow-up requests to be scheduled — confirm.
	TargetRequests []Request
	// TargetItems holds keyed values; AddItem exists as a helper.
	// NOTE(review): presumably the extracted data handed to pipelines — confirm.
	TargetItems    map[string]interface{}
}

func (*Result) AddItem

func (hdl *Result) AddItem(key string, val interface{})

func (*Result) AddTargetRequest

func (hdl *Result) AddTargetRequest(target Request)

func (*Result) AddTargetUrl

func (hdl *Result) AddTargetUrl(target string)

type Scheduler

// Scheduler is the component responsible for scheduling the requests that are
// waiting to be processed.
type Scheduler interface {
	// Push adds one or more requests.
	Push(requests ...Request)
	// Poll returns one Request.
	// NOTE(review): what is returned when the scheduler is empty is not visible here — confirm.
	Poll() Request
	// PollN takes n requests and returns the slice of requests together with
	// the number actually taken.
	PollN(n int) ([]Request, int)
	// Len returns an int.
	// NOTE(review): presumably the number of pending requests — confirm.
	Len() int
}

type SimpleClientGenerator

type SimpleClientGenerator struct {
	// contains filtered or unexported fields
}

SimpleClientGenerator ClientGenerator的默认实现

func (*SimpleClientGenerator) Generate

func (sg *SimpleClientGenerator) Generate() *http.Client

func (*SimpleClientGenerator) SetProxyProvider

func (sg *SimpleClientGenerator) SetProxyProvider(pxyProvider ProxyProvider)

type SimpleProxyProvider

type SimpleProxyProvider struct {
	// contains filtered or unexported fields
}

SimpleProxyProvider ProxyProvider默认实现

func (*SimpleProxyProvider) AddProxy

func (sp *SimpleProxyProvider) AddProxy(pxy ...Proxy)

func (*SimpleProxyProvider) GetProxy

func (sp *SimpleProxyProvider) GetProxy() *Proxy

GetProxy 实现ProxyProvider接口

type Spider

// Spider is the crawler itself. It is created with NewSpider, configured via
// its Set*/Add* methods, and started with Run.
type Spider struct {
	RequestsStore    []Store       // stores the request object data
	PreHandleRequest RequestHandle // request processing performed before a request is executed
	// contains filtered or unexported fields
}

func NewSpider

func NewSpider(seedUrl ...string) *Spider

NewSpider 创建一个爬虫程序 seedUrl 种子Url

func (*Spider) AddHandler

func (s *Spider) AddHandler(handler Handler)

AddHandler 添加处理器

func (*Spider) AddHeader

func (s *Spider) AddHeader(key, value string)

AddHeader 添加请求头

func (*Spider) AddListener

func (s *Spider) AddListener(listener Listener)

AddListener 添加监听器

func (*Spider) AddPipeline

func (s *Spider) AddPipeline(pipeline Pipeline)

AddPipeline 添加结果处理器

func (*Spider) AddProxy

func (s *Spider) AddProxy(pxy ...Proxy)

AddProxy 添加代理

func (*Spider) AddRequestStore

func (s *Spider) AddRequestStore(store Store)

AddRequestStore 添加存储器 存储请求数据

func (*Spider) AddSeedUrl

func (s *Spider) AddSeedUrl(seedUrls ...string)

AddSeedUrl 添加种子链接

func (*Spider) ClearRequestStore

func (s *Spider) ClearRequestStore()

ClearRequestStore 清除存储的数据

func (*Spider) Run

func (s *Spider) Run()

Run 运行

func (*Spider) SaveHtml

func (s *Spider) SaveHtml(savepath string, suffixGenerate func() string)

SaveHtml 是否保存html 默认false不保存 savepath保存地址 也可以在自定义的Handler处理器中自行实现保存逻辑 suffixGenerate 名字后缀函数,html存储名字和生成的后缀拼接

func (*Spider) SetByteHandler

func (s *Spider) SetByteHandler(handler ByteHandler)

SetByteHandler 设置字节处理器 对下载的字节进行处理

func (*Spider) SetClientGenerator

func (s *Spider) SetClientGenerator(clientGenerator ClientGenerator)

SetClientGenerator 客户端生成器

func (*Spider) SetCycleTime

func (s *Spider) SetCycleTime(time int)

SetCycleTime 设置请求失败后重复请求次数

func (*Spider) SetDownloader

func (s *Spider) SetDownloader(downloader Downloader)

SetDownloader 设置下载器

func (*Spider) SetGoroutines

func (s *Spider) SetGoroutines(n int)

SetGoroutines 协程数

func (*Spider) SetProxyProvider

func (s *Spider) SetProxyProvider(proxyProvider ProxyProvider)

SetProxyProvider 代理提供者

func (*Spider) SetRequestFilter

func (s *Spider) SetRequestFilter(filter RequestFilter)

SetRequestFilter 设置请求过滤器

func (*Spider) SetScheduler

func (s *Spider) SetScheduler(scheduler Scheduler)

SetScheduler 设置调度器

func (*Spider) SetSleepTime

func (s *Spider) SetSleepTime(t time.Duration)

SetSleepTime 睡眠时间

func (*Spider) SetTimeOut

func (s *Spider) SetTimeOut(t time.Duration)

SetTimeOut 设置程序在没有数据之后退出的时间 当t<0时 程序一直运行不退出

type Store

// Store is the key-value data store. The package's default implementation is
// based on goleveldb; other databases can be supported by implementing this
// interface.
type Store interface {
	// Add stores value under key.
	Add(key string, value string) error
	// BatchAdd stores every key/value pair of m.
	BatchAdd(m map[string]string) error
	// Get returns the value stored under key, or an error.
	Get(key string) (string, error)
	// Del removes key.
	Del(key string) error
	// List returns values selected by prefix.
	// NOTE(review): the meaning of the optional limit arguments is not visible here — confirm.
	List(prefix string, limit ...string) ([]string, error)
	// Clear removes entries selected by prefix; it reports no error.
	// NOTE(review): the meaning of the optional limit arguments is not visible here — confirm.
	Clear(prefix string, limit ...string)
}

key-value存储数据

type StoreRequestFilter

type StoreRequestFilter struct {
	// contains filtered or unexported fields
}

RequestFilter的默认实现

func (*StoreRequestFilter) Filter

func (filter *StoreRequestFilter) Filter(requests ...Request) []Request

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL