service

package module
v0.4.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 15, 2023 License: MIT Imports: 26 Imported by: 0

README

Rabida 中文

Rabida is a simple crawler framework based on chromedp.

Supported features

  • Pagination: specify css selector for next page.
  • PrePaginate: do something before pagination, such as clicking a button.
  • HttpCookies: enable browser cookie for current job.
  • Delay And Timeout: can customize delay and timeout.
  • AntiDetection: an anti-detection script is loaded by default for the current job. The script is sourced from puppeteer-extra-stealth.
  • Strict Mode: the user agent, browser, and platform must match; if true, they are tied to chrome-mac.
  • Xpath: specify xpath expression to lookup elements.
  • Iframe: be able to specify the iframe selector.
  • Scroll: scroll the current page. ScrollType is either scrollBy or scrollTo (the default is scrollBy); they behave like window.scrollBy and window.scrollTo respectively.

Install

go get -u github.com/Ted-Young/rabida

Configuration

Add a .env file to your project:

RABI_DELAY=1s,2s
RABI_CONCURRENCY=1
RABI_THROTTLE_NUM=2
RABI_THROTTLE_DURATION=1s
RABI_TIMEOUT=3s
RABI_MODE=headless
RABI_DEBUG=false
RABI_OUT=out
RABI_STRICT=false
RABI_PROXY=

Usage

See examples for more details

Css Selector:

// TestRabidaImplCrawl crawls a paginated listing using CSS selectors.
// CssSelector.Scope supplies the scope for the "title" and "date" selectors,
// Paginator is the CSS selector for the next page, and Limit is the number of
// pages the callback allows before it returns true.
func TestRabidaImplCrawl(t *testing.T) {
    conf := config.LoadFromEnv()
    fmt.Printf("%+v\n", conf)
    rabi := NewRabida(conf)
    job := Job{
        Link: "https://tieba.baidu.com/f?kw=nba",
        CssSelector: CssSelector{
            Scope: `#thread_list > li.j_thread_list`,
            Attrs: map[string]CssSelector{
                "title": {
                    Css: "div.threadlist_title > a",
                },
                "date": {
                    Css: "span.threadlist_reply_date",
                },
            },
        },
        Paginator: CssSelector{
            Css: "#frs_list_pager > a.next.pagination-item",
        },
        Limit: 3,
    }
    err := rabi.Crawl(context.Background(), job, func(ret []interface{}, nextPageUrl string, currentPageNo int) bool {
        for _, item := range ret {
            fmt.Println(gabs.Wrap(item).StringIndent("", "  "))
        }
        // Report true once the page limit is reached.
        // NOTE(review): presumably a true result tells Crawl to stop — confirm.
        return currentPageNo >= job.Limit
    }, nil, []chromedp.Action{
        chromedp.EmulateViewport(1777, 903, chromedp.EmulateLandscape),
    })
    if err != nil {
        // Fail through the testing framework rather than panicking, so the
        // failure is attributed to this test and other tests still run.
        t.Fatalf("%+v", err)
    }
}

Xpath Expression:

// TestRabidaXpathImpl_Crawl crawls a paginated comment list using XPath
// expressions: XpathScope and Xpath are set where the CSS example sets Scope
// and Css. Limit is the number of pages the callback allows before it returns
// true.
func TestRabidaXpathImpl_Crawl(t *testing.T) {
    conf := config.LoadFromEnv()
    fmt.Printf("%+v\n", conf)

    rabi := NewRabida(conf)
    job := Job{
        Link: "https://you.ctrip.com/sight/shenzhen26/2778.html",
        CssSelector: CssSelector{
            XpathScope: `//*[@id="commentModule"]/div[@class='commentList']/div`,
            Attrs: map[string]CssSelector{
                "content": {
                    Xpath: "//div[@class='commentDetail']",
                },
                "date": {
                    Xpath: `//div[@class='commentTime']`,
                },
            },
        },
        Paginator: CssSelector{
            Xpath: "//*[@id='commentModule']//li[@class=' ant-pagination-next' and not(@aria-disabled='true')]",
        },
        Limit: 3,
    }
    err := rabi.Crawl(context.Background(), job, func(ret []interface{}, nextPageUrl string, currentPageNo int) bool {
        for _, item := range ret {
            fmt.Println(gabs.Wrap(item).StringIndent("", "  "))
        }
        logrus.Printf("currentPageNo: %d\n", currentPageNo)
        // Report true once the page limit is reached.
        // NOTE(review): presumably a true result tells Crawl to stop — confirm.
        return currentPageNo >= job.Limit
    }, nil, []chromedp.Action{
        chromedp.EmulateViewport(1777, 903, chromedp.EmulateLandscape),
    })
    if err != nil {
        // Errorf formats directly; wrapping fmt.Sprintf in t.Error is redundant.
        t.Errorf("%+v", err)
    }
}

Scroll API:

// TestRabidaImplCrawlScrollSmooth exercises CrawlScrollSmooth in a subtest.
// Each entry selected by Scope yields six named fields; entries that set Attr
// name an attribute explicitly, the others leave Attr at its default.
func TestRabidaImplCrawlScrollSmooth(t *testing.T) {
    t.Run("CrawlScrollSmooth", func(t *testing.T) {
        conf := config.LoadFromEnv()
        fmt.Printf("%+v\n", conf)
        rabi := NewRabida(conf)

        // Field name -> selector, evaluated within the job's Scope.
        fields := map[string]CssSelector{
            "title":   {Css: `div[data-testid="tweetText"]`},
            "date":    {Css: `a > time`, Attr: `datetime`},
            "link":    {Css: `a[role="link"][href*=status]`, Attr: `href`},
            "reply":   {Css: `div[data-testid="reply"]`, Attr: `aria-label`},
            "retweet": {Css: `div[data-testid="retweet"]`, Attr: `aria-label`},
            "like":    {Css: `div[data-testid="like"]`, Attr: `aria-label`},
        }
        job := Job{
            Link: "https://twitter.com/NASA",
            CssSelector: CssSelector{
                Scope: `div[data-testid='cellInnerDiv'] article[data-testid='tweet']`,
                Attrs: fields,
            },
            Limit: 5,
        }

        // onPage prints every item and reports whether the page limit was hit.
        onPage := func(ret []interface{}, currentPageNo int) bool {
            for i := range ret {
                fmt.Println(gabs.Wrap(ret[i]).StringIndent("", "  "))
            }
            return currentPageNo >= job.Limit
        }

        if err := rabi.CrawlScrollSmooth(context.Background(), job, onPage, nil, nil); err != nil {
            t.Errorf("%+v", err)
        }
    })
}

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ErrNotFound is a package-level error value backed by the unexported
// errNotFound type.
// NOTE(review): the conditions under which it is returned are not visible
// here — confirm against the implementation.
var ErrNotFound error = errNotFound{}

Functions

func CssOrXpath

func CssOrXpath(cssSelector CssSelector) string

func DelaySleep

func DelaySleep(conf config.RabiConfig, tag string)

func ExecEventCondition

func ExecEventCondition(ctx context.Context, conf config.RabiConfig, condition *Condition, queryActions []chromedp.QueryOption) (bool, error)

Types

type Condition

// Condition bundles an expected value, a comparison function and the selector
// to execute.
// NOTE(review): presumably ExecSelector produces the text that CheckFunc
// compares with Value — confirm against ExecEventCondition.
type Condition struct {
	// Value is the second argument handed to CheckFunc (by parameter name).
	Value string `json:"value"`
	// CheckFunc reports whether text satisfies the condition for value.
	// encoding/json cannot encode function values and returns an
	// UnsupportedTypeError for any struct with an untagged func field, so the
	// field is explicitly excluded from JSON.
	CheckFunc func(text, value string) bool `json:"-"`
	// ExecSelector is the event type and selector associated with the condition.
	ExecSelector ExecSelector `json:"execSelector"`
}

type CssSelector

// CssSelector describes how to locate elements, using either a CSS selector
// or an XPath expression, and which value to read from them. It can nest via
// Attrs.
type CssSelector struct {
	// Css is the CSS selector.
	Css string `json:"css"`
	// Attr is the attribute to read; the default is innerText.
	Attr string `json:"attr"`
	// Scope supplies a scope for each selector.
	// In jQuery this would look something like: $(scope).find(selector)
	Scope string `json:"scope"`
	// Attrs maps each attribute name to a selector. Recursive population stops
	// when Attrs is nil.
	Attrs map[string]CssSelector `json:"attrs"`
	// Iframe, if true, looks for the element(s) within the first iframe in the
	// page, or within the iframe given by IframeSelector when that is set.
	Iframe bool `json:"iframe"`
	// IframeSelector specifies the iframe selector when the page has multiple
	// iframe elements.
	IframeSelector *CssSelector `json:"iframeSelector"`
	// XpathScope is the scope given as an XPath expression.
	// Note: choose only one of XPath and CSS selector.
	XpathScope string `json:"xpathScope"`
	// Xpath is an XPath expression, e.g.
	//   //*[@id="zz"]/div[2]/ul/li[1]/text()
	//   //div[@id="indexCarousel"]//div[@class="item"]//img/@src
	Xpath    string         `json:"xpath"`
	// SetAttrs is a list of attribute name/value pairs.
	// NOTE(review): presumably applied to the matched element — confirm.
	SetAttrs []SetAttribute `json:"setAttrs"`
	// Before lists events to perform before the value is retrieved.
	Before    []EventSelector `json:"before"`
	// Condition is an optional condition attached to this selector.
	// NOTE(review): when and how it is evaluated is not visible here.
	Condition *Condition      `json:"condition"`
}

type Event

// Event is the string-typed kind carried in the Type field of EventSelector
// and ExecSelector.
type Event string
// Declared Event values. The string on the right is the serialized form.
const (
	ClickEvent              Event = "click"
	SetAttributesValueEvent Event = "setAttributesValue"
	TextEvent               Event = "getTextValue"
	GetAttributeValueEvent  Event = "getAttributeValue"
)

type EventSelector

// EventSelector combines an event type, a condition and a target selector.
// It is the element type of Job.PrePaginate and CssSelector.Before.
// NOTE(review): presumably the event runs on Selector only when Condition
// holds — confirm against the implementation.
type EventSelector struct {
	Type      Event       `json:"type"`
	Condition Condition   `json:"condition"`
	Selector  CssSelector `json:"selector"`
}

type ExecSelector

// ExecSelector pairs an event type with the selector it targets. It is the
// type of Condition.ExecSelector.
type ExecSelector struct {
	Type     Event       `json:"type"`
	Selector CssSelector `json:"selector"`
}

type HttpCookies

// HttpCookies holds the cookie settings of a Job (see Job.EnableCookies).
type HttpCookies struct {
	// RawCookies is the raw cookie string.
	// NOTE(review): the expected format is not visible here — confirm.
	RawCookies string `json:"rawCookies"`
	// Domain is the cookie domain.
	Domain     string `json:"domain"`
	// Expires is the cookie lifetime in hours; the default is 1 year.
	Expires int `json:"expires"`
}

type Job

// Job describes a single crawl: the page to visit, the selectors to extract,
// and how to paginate.
type Job struct {
	// Link is the URL you want to crawl.
	Link string `json:"link"`
	// CssSelector is the root selector.
	CssSelector CssSelector `json:"cssSelector"`
	// PrePaginate lists events to perform before paginating.
	PrePaginate []EventSelector `json:"prePaginate"`
	// Paginator is the selector for the next page.
	Paginator CssSelector `json:"paginator"`
	// PaginatorFunc returns a selector for the given page number.
	// NOTE(review): how it interacts with Paginator is not visible here.
	// encoding/json cannot encode function values and returns an
	// UnsupportedTypeError for an untagged func field, so the field is
	// explicitly excluded from JSON.
	PaginatorFunc func(currentPageNo int) CssSelector `json:"-"`
	// Limit limits how many pages should be crawled.
	Limit int `json:"limit"`
	// StartPageBtn is a selector for a start-page button.
	// NOTE(review): exact use is not visible here — confirm.
	StartPageBtn CssSelector `json:"startPageBtn"`
	// StartPageUrl is a start-page URL.
	// NOTE(review): its relation to Link is not visible here — confirm.
	StartPageUrl string `json:"startPageUrl"`
	// EnableCookies holds the cookie settings for this job.
	EnableCookies HttpCookies `json:"enableCookies"`
}

type Rabida

// Rabida is the crawler interface. Its crawl methods form three families
// (Crawl, CrawlScroll, CrawlScrollSmooth), each in three variants:
//   - the plain form;
//   - a WithConfig form that also takes a config.RabiConfig and chromedp
//     allocator options;
//   - a WithListeners form whose callback also receives a context, which takes
//     the config by pointer and accepts event listener functions.
//
// All callbacks return a bool.
// NOTE(review): the README examples return true once currentPageNo reaches
// job.Limit, so true presumably stops the crawl — confirm. The points at which
// the before and after actions run are not visible here.
type Rabida interface {
	// Crawl processes job; callback receives the results, the next page URL
	// and the current page number.
	Crawl(ctx context.Context, job Job,

		callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
	) error

	// CrawlWithConfig is Crawl with an explicit config and allocator options.
	CrawlWithConfig(ctx context.Context, job Job,

		callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
		conf config.RabiConfig,
		options ...chromedp.ExecAllocatorOption,
	) error

	// CrawlWithListeners is the Crawl variant with a context-aware callback,
	// a config pointer, allocator options and event listeners.
	CrawlWithListeners(ctx context.Context, job Job,

		callback func(ctx context.Context, ret []interface{}, nextPageUrl string, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
		confPtr *config.RabiConfig,
		options []chromedp.ExecAllocatorOption,
		listeners ...func(ev interface{}),
	) error

	// DownloadFile processes job and passes a file string to callback.
	// NOTE(review): presumably the path of a downloaded file — confirm.
	DownloadFile(ctx context.Context, job Job,

		callback func(file string),
		confPtr *config.RabiConfig,
		options ...chromedp.ExecAllocatorOption,
	) error

	// CrawlScroll processes job; callback receives the results, a cursor and
	// the current page number.
	CrawlScroll(ctx context.Context, job Job,

		callback func(ret []interface{}, cursor int, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
	) error

	// CrawlScrollWithConfig is CrawlScroll with an explicit config and
	// allocator options.
	CrawlScrollWithConfig(ctx context.Context, job Job,

		callback func(ret []interface{}, cursor int, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
		conf config.RabiConfig,
		options ...chromedp.ExecAllocatorOption,
	) error

	// CrawlScrollWithListeners is the CrawlScroll variant with a
	// context-aware callback, a config pointer, allocator options and event
	// listeners.
	CrawlScrollWithListeners(ctx context.Context, job Job,

		callback func(ctx context.Context, ret []interface{}, cursor int, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
		confPtr *config.RabiConfig,
		options []chromedp.ExecAllocatorOption,
		listeners ...func(ev interface{}),
	) error

	// CrawlScrollSmooth processes job; callback receives the results and the
	// current page number.
	CrawlScrollSmooth(ctx context.Context, job Job,

		callback func(ret []interface{}, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
	) error

	// CrawlScrollSmoothWithConfig is CrawlScrollSmooth with an explicit config
	// and allocator options.
	CrawlScrollSmoothWithConfig(ctx context.Context, job Job,

		callback func(ret []interface{}, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
		conf config.RabiConfig,
		options ...chromedp.ExecAllocatorOption,
	) error

	// CrawlScrollSmoothWithListeners is the CrawlScrollSmooth variant with a
	// context-aware callback, a config pointer, allocator options and event
	// listeners.
	CrawlScrollSmoothWithListeners(ctx context.Context, job Job,

		callback func(ctx context.Context, ret []interface{}, currentPageNo int) bool,

		before []chromedp.Action,

		after []chromedp.Action,
		confPtr *config.RabiConfig,
		options []chromedp.ExecAllocatorOption,
		listeners ...func(ev interface{}),
	) error
}

func NewRabida

func NewRabida(conf *config.RabiConfig) Rabida

type RabidaImpl

// RabidaImpl is the concrete crawler type. Its listed methods cover every
// method of the Rabida interface, plus Html.
type RabidaImpl struct {
	// contains filtered or unexported fields
}

func (RabidaImpl) Crawl

func (r RabidaImpl) Crawl(ctx context.Context, job Job, callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool,
	before []chromedp.Action, after []chromedp.Action) error

func (RabidaImpl) CrawlScroll

func (r RabidaImpl) CrawlScroll(ctx context.Context, job Job, callback func(ret []interface{}, cursor int, currentPageNo int) bool,
	before []chromedp.Action, after []chromedp.Action) error

func (RabidaImpl) CrawlScrollSmooth

func (r RabidaImpl) CrawlScrollSmooth(ctx context.Context, job Job, callback func(ret []interface{}, currentPageNo int) bool,
	before []chromedp.Action, after []chromedp.Action) error

func (RabidaImpl) CrawlScrollSmoothWithConfig

func (r RabidaImpl) CrawlScrollSmoothWithConfig(ctx context.Context, job Job, callback func(ret []interface{}, currentPageNo int) bool, before []chromedp.Action, after []chromedp.Action, conf config.RabiConfig, options ...chromedp.ExecAllocatorOption) error

func (RabidaImpl) CrawlScrollSmoothWithListeners

func (r RabidaImpl) CrawlScrollSmoothWithListeners(ctx context.Context, job Job, callback func(ctx context.Context, ret []interface{}, currentPageNo int) bool, before []chromedp.Action, after []chromedp.Action, confPtr *config.RabiConfig, options []chromedp.ExecAllocatorOption, listeners ...func(ev interface{})) error

func (RabidaImpl) CrawlScrollWithConfig

func (r RabidaImpl) CrawlScrollWithConfig(ctx context.Context, job Job, callback func(ret []interface{}, cursor int, currentPageNo int) bool, before []chromedp.Action, after []chromedp.Action, conf config.RabiConfig, options ...chromedp.ExecAllocatorOption) error

func (RabidaImpl) CrawlScrollWithListeners

func (r RabidaImpl) CrawlScrollWithListeners(ctx context.Context, job Job, callback func(ctx context.Context, ret []interface{}, cursor int, currentPageNo int) bool, before []chromedp.Action, after []chromedp.Action, confPtr *config.RabiConfig, options []chromedp.ExecAllocatorOption, listeners ...func(ev interface{})) error

func (RabidaImpl) CrawlWithConfig

func (r RabidaImpl) CrawlWithConfig(ctx context.Context, job Job, callback func(ret []interface{}, nextPageUrl string, currentPageNo int) bool, before []chromedp.Action, after []chromedp.Action, conf config.RabiConfig, options ...chromedp.ExecAllocatorOption) error

func (RabidaImpl) CrawlWithListeners

func (r RabidaImpl) CrawlWithListeners(ctx context.Context, job Job, callback func(ctx context.Context, ret []interface{}, nextPageUrl string, currentPageNo int) bool, before []chromedp.Action, after []chromedp.Action, confPtr *config.RabiConfig, options []chromedp.ExecAllocatorOption, listeners ...func(ev interface{})) error

func (RabidaImpl) DownloadFile

func (r RabidaImpl) DownloadFile(ctx context.Context, job Job, callback func(file string), confPtr *config.RabiConfig, options ...chromedp.ExecAllocatorOption) error

func (RabidaImpl) Html

func (r RabidaImpl) Html(ctx context.Context, father *cdp.Node, conf config.RabiConfig) *html.Node

type SetAttribute

// SetAttribute is an attribute name/value pair. It is the element type of
// CssSelector.SetAttrs.
type SetAttribute struct {
	AttributeName  string `json:"attributeName"`
	AttributeValue string `json:"attributeValue"`
}

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL