go_spider

package module
v1.0.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 3, 2019 License: MIT Imports: 15 Imported by: 0

README

A lightweight Go crawling framework

A lightweight web-crawling framework written in Go.

Install

go get github.com/bennya8/go-spider

Usage

    // Configure the crawl task options
    job51Opts := []TaskOpt{
        TaskOptEnableCookie(true),
        TaskOptGapLimit(5000),
        TaskOptCache("cache"),
        TaskOptProxy([]string{"127.0.0.1:8700"}),
        TaskOptSrcCharset("gbk"),
        TaskOptDomains([]string{"www.51job.com", "search.51job.com", "jobs.51job.com"}),
    }
    // Create a new task handler and pass in the options: NewTaskHandler(name string, entry string, opts ...TaskOpt)
    job51 := NewTaskHandler("job51", "https://www.51job.com", job51Opts...)

    // Before request event
    job51.OnRequest(func(url string, header *req.Header, param *req.Param, err error) {
        fmt.Println(url, header, param, err)
    })

    // After request event
    job51.OnResponse(func(url string, resp *req.Resp, err error) {
        fmt.Println(resp, err)
    })
    
    // DOM query
    // Nested selections are allowed; see github.com/PuerkitoBio/goquery for more examples
    job51.OnQuery(".cn.hlist a", func(url string, selection *goquery.Selection) {
        selection.Each(func(i int, selection *goquery.Selection) {
            href, exists := selection.Attr("href")
            if exists {
                job51.Visit(href)
            }
        })
    })

    job51.OnQuery(".dw_table .el", func(url string, selection *goquery.Selection) {
        selection.Each(func(i int, selection *goquery.Selection) {
            selection.Find("p.t1 a").Each(func(i int, selection *goquery.Selection) {
                href, exists := selection.Attr("href")
                if exists {
                    job51.Visit(href)
                }
            })
        })
    })
    
    // create main spider thread
    spider := NewGoSpider()
    
    // Register the current task with the main spider thread
    // (multiple tasks are supported)
    spider.AddTask(job51)

    // execution 
    spider.Run()

Change log

v1.0.1

  • [ADDED] url cache feature
  • [ADDED] page encoding conversion feature

v1.0.0.alpha (2019-10-05 22:22 UTC+8:00)

  • build framework skeleton
  • [TODO] mock User-Agent / URL caching

Third-party dependencies

github.com/imroc/req

effective go http request library

github.com/PuerkitoBio/goquery

DOM parser

github.com/axgle/mahonia

character set converter

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type GoSpider

type GoSpider struct {
	// contains filtered or unexported fields
}

func NewGoSpider

func NewGoSpider() *GoSpider

func (*GoSpider) AddTask

func (g *GoSpider) AddTask(handler *TaskHandler)

func (*GoSpider) Run

func (g *GoSpider) Run()

type OnErrorCallback

type OnErrorCallback func(msg string, err error)

type OnQueryCallback

type OnQueryCallback func(url string, selection *goquery.Selection)

type OnRequestCallback

type OnRequestCallback func(url string, header *req.Header, param *req.Param, err error)

type OnResponseCallback

type OnResponseCallback func(url string, rsp *req.Resp, err error)

type TaskHandler

type TaskHandler struct {
	Name        string
	Entry       string
	GapLimit    int
	GapLimitMin int
	GapLimitMax int
	IdleLimit   int
	WorkerLimit int
	SrcCharset  string

	Http            *req.Req
	Headers         *req.Header
	Params          *req.Param
	Domains         []string
	Proxies         []string
	Queue           chan string
	QueueProcessNum int
	QueueTotalNum   int
	CachePath       string
	// contains filtered or unexported fields
}

TaskHandler describes a single crawl task: its name, entry URL, request settings, and URL queue.

func NewTaskHandler

func NewTaskHandler(name string, entry string, opts ...TaskOpt) *TaskHandler

func (*TaskHandler) Clone

func (t *TaskHandler) Clone() *TaskHandler

func (*TaskHandler) Handle

func (t *TaskHandler) Handle()

func (*TaskHandler) OnQuery

func (t *TaskHandler) OnQuery(selector string, cb OnQueryCallback)

func (*TaskHandler) OnRequest

func (t *TaskHandler) OnRequest(cb OnRequestCallback)

func (*TaskHandler) OnResponse

func (t *TaskHandler) OnResponse(cb OnResponseCallback)

func (*TaskHandler) Visit

func (t *TaskHandler) Visit(url string)

type TaskOpt

type TaskOpt func(*TaskHandler)

func TaskOptCache

func TaskOptCache(path string) TaskOpt

func TaskOptDomains

func TaskOptDomains(domains []string) TaskOpt

func TaskOptEnableCookie

func TaskOptEnableCookie(b bool) TaskOpt

func TaskOptGapLimit

func TaskOptGapLimit(num int) TaskOpt

func TaskOptGapLimitRandom added in v1.0.2

func TaskOptGapLimitRandom(min int, max int) TaskOpt

func TaskOptProxy

func TaskOptProxy(proxy []string) TaskOpt

func TaskOptSrcCharset

func TaskOptSrcCharset(charset string) TaskOpt

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL