crawler

package
v1.7.5 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 6, 2024 License: MIT Imports: 22 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// TaskDefault marks result for printing only.
	TaskDefault taskFlag = iota
	// TaskCrawl marks result as to-be-crawled.
	TaskCrawl
	// TaskDone marks result as final - crawling ends here.
	TaskDone
)
View Source
const (
	// DefaultRobotsPolicy is a default policy name for robots handling.
	DefaultRobotsPolicy = "ignore"
	// DefaultDirsPolicy is a default policy name for non-resource URLs.
	DefaultDirsPolicy = "show"
)

Variables

View Source
var ErrUnknownPolicy = errors.New("unknown policy")

ErrUnknownPolicy is returned when the requested policy is unknown.

Functions

This section is empty.

Types

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

Crawler holds crawling process config and state.

func New

func New(opts ...Option) (c *Crawler)

New creates Crawler instance.

func (*Crawler) DumpConfig

func (c *Crawler) DumpConfig() string

DumpConfig returns internal config representation.

func (*Crawler) Run

func (c *Crawler) Run(uri string, urlcb func(string)) (err error)

Run starts crawling process for given base uri.

type DirsPolicy

type DirsPolicy byte

DirsPolicy is a policy for non-resource URLs.

const (
	// DirsShow show directories.
	DirsShow DirsPolicy = 0
	// DirsHide hide directories from output.
	DirsHide DirsPolicy = 1
	// DirsOnly show only directories in output.
	DirsOnly DirsPolicy = 2
)

func ParseDirsPolicy

func ParseDirsPolicy(s string) (p DirsPolicy, err error)

ParseDirsPolicy parses dirs policy from string.

type Option

type Option func(*config)

Option is a configuration func.

func WithBruteMode

func WithBruteMode(v bool) Option

WithBruteMode enables "brute-mode" - html comments scan.

func WithDelay

func WithDelay(v time.Duration) Option

WithDelay sets crawl delay.

func WithDirsPolicy

func WithDirsPolicy(v DirsPolicy) Option

WithDirsPolicy sets DirsPolicy for crawler.

func WithExtraCookies

func WithExtraCookies(v []string) Option

WithExtraCookies adds cookies to requests.

func WithExtraHeaders

func WithExtraHeaders(v []string) Option

WithExtraHeaders adds extra HTTP headers to requests.

func WithIgnored

func WithIgnored(v []string) Option

WithIgnored applies a URL skip filter for the crawler.

func WithMaxCrawlDepth

func WithMaxCrawlDepth(v int) Option

WithMaxCrawlDepth sets maximum depth to crawl.

func WithProxyAuth

func WithProxyAuth(v string) Option

WithProxyAuth enables proxy credentials.

func WithRobotsPolicy

func WithRobotsPolicy(v RobotsPolicy) Option

WithRobotsPolicy sets RobotsPolicy for crawler.

func WithScanCSS

func WithScanCSS(v bool) Option

WithScanCSS enables css scanning.

func WithScanJS

func WithScanJS(v bool) Option

WithScanJS enables js scanning.

func WithSkipSSL

func WithSkipSSL(v bool) Option

WithSkipSSL tells crawley to skip any ssl handshake errors.

func WithTagsFilter

func WithTagsFilter(v []string) Option

WithTagsFilter applies a tag filter for the crawler.

func WithTimeout

func WithTimeout(v time.Duration) Option

WithTimeout sets request timeout.

func WithUserAgent

func WithUserAgent(v string) Option

WithUserAgent sets User-Agent string.

func WithWorkersCount

func WithWorkersCount(v int) Option

WithWorkersCount sets maximum workers.

func WithoutHeads

func WithoutHeads(v bool) Option

WithoutHeads disables pre-flight HEAD requests.

type RobotsPolicy

type RobotsPolicy byte

RobotsPolicy is a policy for robots.txt.

const (
	// RobotsIgnore ignores robots.txt completely.
	RobotsIgnore RobotsPolicy = 0
	// RobotsCrawl crawls urls from robots.txt, ignoring its rules.
	RobotsCrawl RobotsPolicy = 1
	// RobotsRespect crawls URLs from robots.txt, respecting its rules.
	RobotsRespect RobotsPolicy = 2
)

func ParseRobotsPolicy

func ParseRobotsPolicy(s string) (p RobotsPolicy, err error)

ParseRobotsPolicy parses robots policy from string.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL