api

package
v0.2.2 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 18, 2024 | License: MIT | Imports: 15 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func FindLinks(body []byte) (hrefs []string)

func Hostname

func Hostname(link string) (string, error)

func TestHostname

func TestHostname(t *testing.T)

Types

type Fetcher

// Fetcher declares the two operations a fetch backend must provide: Fetch
// and Close. No implementation is visible in this listing.
type Fetcher interface {
	// Fetch takes a context and a *Request and returns a *Response or an error.
	// NOTE(review): presumably retrieves req.Target — confirm against an implementation.
	Fetch(ctx context.Context, req *Request) (*Response, error)
	// Close returns an error.
	// NOTE(review): presumably releases resources held by the fetcher — confirm.
	Close() error
}

type FilterRule

// FilterRule groups a hostname with two lists of compiled regular expressions.
// NOTE(review): what the patterns are matched against (path vs. full URL) and
// which list takes precedence is not shown here — confirm with the consumer.
type FilterRule struct {
	// Hostname is the host name this rule is associated with.
	Hostname string
	// Allow holds compiled patterns; presumably links matching one are kept — confirm.
	Allow    []*regexp.Regexp
	// Disallow holds compiled patterns; presumably links matching one are dropped — confirm.
	Disallow []*regexp.Regexp
}

type MetricsMonitor

// MetricsMonitor declares counter-increment methods for requests and links,
// plus Metrics, which returns the collected values as a map.
// NOTE(review): thread-safety of implementations is not shown here — confirm.
type MetricsMonitor interface {
	// Request counters: total, successful, and failed.
	IncTotalRequests()
	IncSuccessfulRequests()
	IncFailedRequests()

	// Link counters: total, crawled, skipped, and duplicated.
	IncTotalLink()
	IncCrawledLink()
	IncSkippedLink()
	IncDuplicatedLink()

	// Metrics returns a map from string keys to int64 values.
	// NOTE(review): presumably one entry per counter above; key names are not shown — confirm.
	Metrics() map[string]int64
}

type Param

// Param carries per-request settings; a Request refers to one through its
// Param field.
type Param struct {
	// Proxy is a string; presumably a proxy URL or address — format not shown, confirm.
	Proxy       string
	// UserAgent is a string; presumably sent as the User-Agent header — confirm.
	UserAgent   string
	// Referer is a string; presumably sent as the Referer header — confirm.
	Referer     string
	// MaxBodySize is an int64; presumably a byte limit on the response body — confirm units.
	MaxBodySize int64
	// Timeout is a time.Duration; presumably bounds a single fetch — confirm.
	Timeout     time.Duration
}

type ParsedURL

// ParsedURL pairs a *url.URL with two derived string fields. NewURL returns a
// *ParsedURL built from a raw string.
type ParsedURL struct {
	// Hash is a string; presumably a digest identifying the URL — algorithm not shown, confirm.
	Hash string
	// Root is a string; presumably the site root or host of the URL — confirm.
	Root string
	// URL is the standard-library parsed URL.
	URL  *url.URL
}

func NewURL

func NewURL(raw string) (*ParsedURL, error)

func (*ParsedURL) String

func (u *ParsedURL) String() string

type Queue

// Queue declares a container of *Request values with Push, Pop, Len, and
// Close operations.
// NOTE(review): ordering (FIFO/priority) and blocking behavior of Pop are not
// shown here — confirm against an implementation.
type Queue interface {
	// Push takes a context and a *Request and returns an error.
	Push(ctx context.Context, req *Request) error
	// Pop takes a context and returns a *Request or an error.
	Pop(ctx context.Context) (*Request, error)
	// Len returns an int32; presumably the number of queued requests — confirm.
	Len() int32
	// Close returns an error.
	Close() error
}

type RateLimit

// RateLimit associates a hostname with a rate given as a string.
type RateLimit struct {
	// Hostname is the host name the rate is associated with.
	Hostname string
	// Rate is a string; its format is not shown here — confirm against the code that parses it.
	Rate     string
}

type Request

// Request describes one unit of work: a target URL, its parameters, and a
// depth value. It is the argument type of Fetcher.Fetch and Queue.Push.
type Request struct {
	// Target is the parsed URL of this request.
	Target *ParsedURL
	// Param points to the settings for this request; nil handling is not shown here.
	Param  *Param
	// Depth is an int32; presumably the link distance from the starting URL — confirm.
	Depth  int32
}

func (*Request) ResolveURL

func (r *Request) ResolveURL(u string) (*url.URL, error)

type Response

// Response is the result type returned by Fetcher.Fetch.
type Response struct {
	// URL is the parsed URL associated with this response.
	URL         *ParsedURL
	// Status is an int; presumably the HTTP status code — confirm.
	Status      int
	// Body holds the response payload as raw bytes.
	Body        []byte
	// NextURLs is a list of parsed URLs; presumably links discovered in Body — confirm.
	NextURLs    []*ParsedURL
	// Depth is an int32; presumably mirrors Request.Depth — confirm.
	Depth       int32
	// ElapsedTime is a time.Duration; presumably how long the fetch took — confirm.
	ElapsedTime time.Duration
	// Err is an error value carried in the response.
	// NOTE(review): its relationship to the error returned by Fetch is not shown — confirm.
	Err         error
}

type Store

// Store declares a visited-URL lookup and a Close operation.
type Store interface {
	// HasVisited takes a context and a *ParsedURL and returns a bool and an error.
	// NOTE(review): whether the call also records u as visited is not shown here — confirm.
	HasVisited(ctx context.Context, u *ParsedURL) (bool, error)
	// Close returns an error.
	Close() error
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL