scraper

package module
v0.0.0-...-3adb936 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 29, 2023 | License: MIT, Unlicense | Imports: 27 | Imported by: 0

README

scraper - fetch, store, and scrape web pages

Go package docs Build status

Dual-licensed under MIT or the UNLICENSE.

Documentation

Index

Constants

View Source
// LatestPageVersion is an untyped constant equal to 0.
// NOTE(review): presumably the value written to PageMeta.Version for newly
// stored pages — confirm against the package source.
const LatestPageVersion = 0

Variables

This section is empty.

Functions

This section is empty.

Types

type DoOption

// DoOption is the interface accepted by the variadic options parameter of
// Scraper.Do. Its methods are filtered or unexported, so none are shown here.
// NOTE(review): the OptDo* structs presumably implement it — confirm.
type DoOption interface {
	// contains filtered or unexported methods
}

type FetchStatusNotOKError

// FetchStatusNotOKError is returned when the fetch status is not 200 OK.
type FetchStatusNotOKError struct {
	// Page contains the response and status of the fetch that was not OK.
	Page *Page
}

FetchStatusNotOKError is returned when the fetch status is not 200 OK. The Page contains the response and status.

func (*FetchStatusNotOKError) Error

// Error returns the error message as a string; with this method,
// *FetchStatusNotOKError satisfies the built-in error interface.
func (e *FetchStatusNotOKError) Error() string

type FetchThrottledError

// FetchThrottledError is returned when the fetch is throttled. It has no
// fields, so it carries no details beyond its type.
type FetchThrottledError struct{}

FetchThrottledError is returned when the fetch is throttled.

func (*FetchThrottledError) Error

// Error returns the error message as a string; with this method,
// *FetchThrottledError satisfies the built-in error interface.
func (e *FetchThrottledError) Error() string

type Limiter

// Limiter is a single-method interface; OptDoLimiter carries a value of this
// type.
type Limiter interface {
	// Take returns a time.Time.
	// NOTE(review): presumably blocks until the next fetch is permitted and
	// returns that instant (same shape as go.uber.org/ratelimit's Limiter) —
	// confirm.
	Take() time.Time
}

type OptDoBrowser

// OptDoBrowser is an empty option struct.
// NOTE(review): presumably a DoOption that makes Scraper.Do fetch through a
// browser for this call (compare OptScraperAlwaysDoBrowser) — confirm.
type OptDoBrowser struct{}

type OptDoLimiter

// OptDoLimiter is an option struct that wraps a Limiter.
// NOTE(review): presumably a DoOption that rate-limits the fetch via
// Limiter.Take — confirm.
type OptDoLimiter struct {
	// Limiter is the Limiter carried by this option.
	Limiter Limiter
}

type OptDoReplace

// OptDoReplace is an empty option struct.
// NOTE(review): presumably a DoOption that forces a fresh fetch and replaces
// any previously stored page — confirm.
type OptDoReplace struct{}

type OptDoSilentThrottle

// OptDoSilentThrottle is an option struct holding a compiled regular
// expression.
// NOTE(review): presumably a DoOption used to detect throttling that is not
// signaled by the status code (see FetchThrottledError) — confirm.
type OptDoSilentThrottle struct {
	// PageBytesRegexp is the compiled pattern carried by this option.
	// NOTE(review): presumably matched against the fetched page bytes —
	// confirm.
	PageBytesRegexp *regexp.Regexp
}

type Option

// Option is the interface accepted by the variadic opts parameter of
// NewScraper and returned by OptScraperAlwaysDoBrowser. Its methods are
// filtered or unexported, so none are shown here.
type Option interface {
	// contains filtered or unexported methods
}

func OptScraperAlwaysDoBrowser

// OptScraperAlwaysDoBrowser returns an Option that can be passed to
// NewScraper.
// NOTE(review): presumably makes every Scraper.Do call behave as if
// OptDoBrowser had been passed — confirm.
func OptScraperAlwaysDoBrowser() Option

type Page

// Page groups the metadata, request, and response of one page. It is the
// type returned by Scraper.Do and referenced by FetchStatusNotOKError.
type Page struct {
	// Meta is the page metadata, tagged for JSON as "meta".
	Meta     PageMeta     `json:"meta"`
	// Request is the request description, tagged for JSON as "request".
	Request  PageRequest  `json:"request"`
	// Response is the response description, tagged for JSON as "response".
	Response PageResponse `json:"response"`
}

type PageMeta

// PageMeta holds metadata about a Page. Fields tagged `json:"-"` (Source,
// RetrieveDur) are excluded from the JSON form; the rest are included.
type PageMeta struct {
	// Version is tagged for JSON as "version".
	// NOTE(review): presumably compared against LatestPageVersion — confirm.
	Version     uint16        `json:"version"`
	// Source is excluded from JSON.
	// NOTE(review): presumably records where the page was obtained from
	// (e.g. storage vs. a live fetch) — confirm.
	Source      string        `json:"-"`
	// RetrieveDur is excluded from JSON.
	// NOTE(review): presumably the time taken to retrieve the page from its
	// source — confirm.
	RetrieveDur time.Duration `json:"-"`
	// ScrapedAt is tagged for JSON as "scraped_at".
	ScrapedAt   time.Time     `json:"scraped_at"`
	// FetchDur is tagged for JSON as "fetch_dur".
	FetchDur    time.Duration `json:"fetch_dur"`
}

type PageRequest

// PageRequest describes the request side of a Page. RedirectedURL, Header,
// and Body are tagged omitempty, so they are left out of the JSON form when
// empty.
type PageRequest struct {
	// URL is tagged for JSON as "url".
	URL           string      `json:"url"`
	// RedirectedURL is tagged for JSON as "redirected_url" and omitted when
	// empty.
	// NOTE(review): presumably the final URL after redirects — confirm.
	RedirectedURL string      `json:"redirected_url,omitempty"`
	// Method is tagged for JSON as "method".
	Method        string      `json:"method"`
	// Header holds the request headers; omitted from JSON when empty.
	Header        http.Header `json:"header,omitempty"`
	// Body holds the request body bytes; omitted from JSON when empty.
	Body          []byte      `json:"body,omitempty"`
}

type PageResponse

// PageResponse describes the response side of a Page. Header and Body are
// tagged omitempty, so they are left out of the JSON form when empty.
type PageResponse struct {
	// StatusCode is tagged for JSON as "status_code".
	StatusCode int         `json:"status_code"`
	// Header holds the response headers; omitted from JSON when empty.
	Header     http.Header `json:"header,omitempty"`
	// Body holds the response body bytes; omitted from JSON when empty.
	Body       []byte      `json:"body,omitempty"`
}

type Scraper

// Scraper is an opaque type: all of its fields are filtered or unexported.
// It is created with NewScraper and exposes the Do and Close methods.
type Scraper struct {
	// contains filtered or unexported fields
}

func NewScraper

// NewScraper returns a *Scraper or an error. It takes a context, a
// *blob.Bucket, and zero or more Options.
// NOTE(review): the bucket is presumably where fetched pages are stored
// (the README describes the package as "fetch, store, and scrape") — confirm.
func NewScraper(
	ctx context.Context,
	blob *blob.Bucket,
	opts ...Option,
) (*Scraper, error)

func (*Scraper) Close

// Close takes no arguments and returns nothing, so it cannot report failure
// to the caller.
// NOTE(review): presumably releases resources held by the Scraper — confirm.
func (s *Scraper) Close()

func (*Scraper) Do

// Do takes a context, an *http.Request, and zero or more DoOptions, and
// returns a *Page and an error (as the named results page and err).
// NOTE(review): presumably this is where *FetchStatusNotOKError and
// *FetchThrottledError are returned, and where a stored page is reused
// unless OptDoReplace is given — confirm against the package source.
func (s *Scraper) Do(
	ctx context.Context,
	req *http.Request,
	options ...DoOption,
) (page *Page, err error)

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL