crawler

package
v0.0.0-...-eb77424
Published: Apr 25, 2024 License: MIT Imports: 23 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func NewFromHTTPResponse

func NewFromHTTPResponse(r *http.Response) (*models.PageReport, *html.Node, error)

Creates a new PageReport and parsed HTML document from an http.Response.

func NewHTMLParser

func NewHTMLParser(u *url.URL, status int, headers *http.Header, body []byte) (*models.PageReport, *html.Node, error)

Returns a new PageReport and the parsed HTML document built from the given URL, status code, headers, and body.
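
A minimal usage sketch for the parsing functions (the import path example.com/crawler is a placeholder for this module's real path):

package main

import (
	"fmt"
	"log"
	"net/http"

	"example.com/crawler" // placeholder import path
)

func main() {
	resp, err := http.Get("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// NewFromHTTPResponse builds the PageReport and the parsed HTML
	// document directly from the response. NewHTMLParser is the
	// lower-level variant that takes the URL, status, headers and
	// body separately.
	report, doc, err := crawler.NewFromHTTPResponse(resp)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("report: %+v, parsed HTML: %t\n", report, doc != nil)
}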

Types

type BasicAuthClient

type BasicAuthClient struct {
	Options *ClientOptions
	// contains filtered or unexported fields
}

func NewClient

func NewClient(options *ClientOptions) *BasicAuthClient

func (*BasicAuthClient) Do

func (c *BasicAuthClient) Do(req *http.Request) (*http.Response, error)

Performs the request and returns the response or an error. It sets the client's User-Agent header, as well as the BasicAuth credentials if they are available.

func (*BasicAuthClient) Get

func (c *BasicAuthClient) Get(u string) (*http.Response, error)

Makes a GET request to a URL and returns the HTTP response or an error.

func (*BasicAuthClient) Head

func (c *BasicAuthClient) Head(u string) (*http.Response, error)

Makes a HEAD request to a URL and returns the HTTP response or an error.

type Client

type Client interface {
	Get(u string) (*http.Response, error)
	Head(u string) (*http.Response, error)
	Do(req *http.Request) (*http.Response, error)
}

type ClientOptions

type ClientOptions struct {
	UserAgent        string
	BasicAuth        bool
	BasicAuthDomains []string
	AuthUser         string
	AuthPass         string
}
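
A sketch of constructing a client and issuing requests (the import path is a placeholder and the credentials are illustrative):

package main

import (
	"fmt"
	"log"

	"example.com/crawler" // placeholder import path
)

func main() {
	client := crawler.NewClient(&crawler.ClientOptions{
		UserAgent:        "MyCrawler/1.0",
		BasicAuth:        true,
		BasicAuthDomains: []string{"example.com"},
		AuthUser:         "user",
		AuthPass:         "secret",
	})

	// Get applies the configured User-Agent and, presumably for the
	// domains listed in BasicAuthDomains, the BasicAuth credentials.
	resp, err := client.Get("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.StatusCode)
}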

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(url *url.URL, options *Options) *Crawler

func (*Crawler) RobotstxtExists

func (c *Crawler) RobotstxtExists() bool

Returns true if the robots.txt file exists.

func (*Crawler) SitemapExists

func (c *Crawler) SitemapExists() bool

Returns true if the sitemap.xml file exists.

func (*Crawler) SitemapIsBlocked

func (c *Crawler) SitemapIsBlocked() bool

Returns true if any of the website's sitemaps is blocked by the robots.txt file.

func (*Crawler) Stop

func (c *Crawler) Stop()

Stops the crawler by canceling the crawler's context.

func (*Crawler) Stream

func (c *Crawler) Stream() <-chan *models.PageReportMessage

Returns a channel that streams each generated PageReport wrapped in a PageReportMessage struct.
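
A sketch of a full crawl using the Options type defined below (the import path and option values are illustrative, and it is assumed the stream channel is closed when the crawl ends):

package main

import (
	"fmt"
	"log"
	"net/url"

	"example.com/crawler" // placeholder import path
)

func main() {
	u, err := url.Parse("https://example.com")
	if err != nil {
		log.Fatal(err)
	}

	c := crawler.NewCrawler(u, &crawler.Options{
		MaxPageReports: 100,
		UserAgent:      "MyCrawler/1.0",
		CrawlSitemap:   true,
	})

	// Consume the report stream; this assumes the channel is closed
	// once the crawl finishes or Stop is called.
	count := 0
	for range c.Stream() {
		count++
	}

	fmt.Println("pages crawled:", count)
	fmt.Println("robots.txt found:", c.RobotstxtExists())
	fmt.Println("sitemap found:", c.SitemapExists())
}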

type CrawlerClient

type CrawlerClient interface {
	Get(u string) (*http.Response, error)
	Head(u string) (*http.Response, error)
	Do(req *http.Request) (*http.Response, error)
}

type HttpCrawler

type HttpCrawler struct {
	// contains filtered or unexported fields
}

func New

func New(client Client, urlStream <-chan *RequestMessage) *HttpCrawler

func (*HttpCrawler) Crawl

func (c *HttpCrawler) Crawl(ctx context.Context) <-chan *ResponseMessage

Crawl starts crawling the URLs received on the urlStream channel and sends a ResponseMessage for each crawled URL through the returned channel. It ends when the context is cancelled.
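
A sketch of driving HttpCrawler directly with a request stream (placeholder import path; it is an assumption that the response channel closes once the URL stream is drained or the context is cancelled):

package main

import (
	"context"
	"fmt"
	"log"

	"example.com/crawler" // placeholder import path
)

func main() {
	client := crawler.NewClient(&crawler.ClientOptions{UserAgent: "MyCrawler/1.0"})

	urlStream := make(chan *crawler.RequestMessage)
	hc := crawler.New(client, urlStream)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Feed one request, then close the stream.
	go func() {
		urlStream <- &crawler.RequestMessage{URL: "https://example.com/", Depth: 0}
		close(urlStream)
	}()

	for msg := range hc.Crawl(ctx) {
		if msg.Error != nil {
			log.Println(msg.URL, msg.Error)
			continue
		}
		fmt.Println(msg.URL, msg.Response.StatusCode)
		msg.Response.Body.Close()
	}
}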

type Options

type Options struct {
	MaxPageReports     int
	IgnoreRobotsTxt    bool
	FollowNofollow     bool
	IncludeNoindex     bool
	UserAgent          string
	CrawlSitemap       bool
	AllowSubdomains    bool
	BasicAuth          bool
	AuthUser           string
	AuthPass           string
	CheckExternalLinks bool
}

type Parser

type Parser struct {
	ParsedURL *url.URL
	Headers   *http.Header
	// contains filtered or unexported fields
}

type Queue

type Queue struct {
	// contains filtered or unexported fields
}

func NewQueue

func NewQueue(ctx context.Context) *Queue

func (*Queue) Ack

func (q *Queue) Ack(s string)

Acknowledges that a message has been processed.

func (*Queue) Active

func (q *Queue) Active() bool

Active returns true if the queue is not empty or has active elements.

func (*Queue) Count

func (q *Queue) Count() int

Returns the number of items currently in the queue.

func (*Queue) Poll

func (q *Queue) Poll() *RequestMessage

Returns the first element in the queue.

func (*Queue) Push

func (q *Queue) Push(value *RequestMessage)

Adds a new value to the end of the queue.
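
A sketch of the Push, Poll, Ack workflow (placeholder import path; it is an assumption that Ack takes the message's URL string):

package main

import (
	"context"
	"fmt"

	"example.com/crawler" // placeholder import path
)

func main() {
	q := crawler.NewQueue(context.Background())

	q.Push(&crawler.RequestMessage{URL: "https://example.com/", Depth: 0})
	q.Push(&crawler.RequestMessage{URL: "https://example.com/about", Depth: 1})

	fmt.Println("queued:", q.Count()) // 2

	// The queue stays Active until every polled message is acked.
	for q.Active() {
		msg := q.Poll()
		fmt.Println("processing:", msg.URL)
		// Assumption: the Ack argument is the message's URL.
		q.Ack(msg.URL)
	}
}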

type RequestMessage

type RequestMessage struct {
	URL   string
	Depth int
}

type ResponseMessage

type ResponseMessage struct {
	URL      string
	Response *http.Response
	Error    error
	Depth    int
}

type RobotsChecker

type RobotsChecker struct {
	// contains filtered or unexported fields
}

func NewRobotsChecker

func NewRobotsChecker(client Client, ua string) *RobotsChecker

func (*RobotsChecker) Exists

func (r *RobotsChecker) Exists(u *url.URL) bool

Returns true if the robots.txt file exists and is valid.

func (*RobotsChecker) GetSitemaps

func (r *RobotsChecker) GetSitemaps(u *url.URL) []string

Returns a list of sitemaps found in the robots.txt file.

func (*RobotsChecker) IsBlocked

func (r *RobotsChecker) IsBlocked(u *url.URL) bool

Returns true if the URL is blocked by robots.txt.
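
A sketch of checking robots.txt rules for a URL (placeholder import path; the user agent string is illustrative):

package main

import (
	"fmt"
	"log"
	"net/url"

	"example.com/crawler" // placeholder import path
)

func main() {
	client := crawler.NewClient(&crawler.ClientOptions{UserAgent: "MyCrawler/1.0"})
	rc := crawler.NewRobotsChecker(client, "MyCrawler/1.0")

	u, err := url.Parse("https://example.com/private/page")
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("robots.txt exists:", rc.Exists(u))
	fmt.Println("blocked:", rc.IsBlocked(u))
	fmt.Println("sitemaps:", rc.GetSitemaps(u))
}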

type SitemapChecker

type SitemapChecker struct {
	// contains filtered or unexported fields
}

func NewSitemapChecker

func NewSitemapChecker(client Client, limit int) *SitemapChecker

func (*SitemapChecker) ParseSitemaps

func (sc *SitemapChecker) ParseSitemaps(URLs []string, callback func(u string))

Parses the sitemaps, calling the callback function on each entry. For each URL provided, it checks whether it is an index sitemap.

func (*SitemapChecker) SitemapExists

func (sc *SitemapChecker) SitemapExists(URLs []string) bool

Returns true if any of the provided sitemap URLs exist.
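
A sketch of validating and parsing sitemaps (placeholder import path; it is an assumption that the limit argument caps the number of parsed URLs):

package main

import (
	"fmt"

	"example.com/crawler" // placeholder import path
)

func main() {
	client := crawler.NewClient(&crawler.ClientOptions{UserAgent: "MyCrawler/1.0"})
	// Assumption: the second argument limits how many URLs are parsed.
	sc := crawler.NewSitemapChecker(client, 500)

	sitemaps := []string{"https://example.com/sitemap.xml"}

	if sc.SitemapExists(sitemaps) {
		sc.ParseSitemaps(sitemaps, func(u string) {
			fmt.Println("sitemap entry:", u)
		})
	}
}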

type URLStorage

type URLStorage struct {
	// contains filtered or unexported fields
}

func NewURLStorage

func NewURLStorage() *URLStorage

func (*URLStorage) Add

func (s *URLStorage) Add(u string)

Adds a URL string to the set of seen URLs.

func (*URLStorage) Iterate

func (s *URLStorage) Iterate(f func(string))

Iterates over the seen URLs, applying the provided function f to each element.

func (*URLStorage) Seen

func (s *URLStorage) Seen(u string) bool

Returns true if a URL string has already been added.
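
A sketch of deduplicating URLs with URLStorage (placeholder import path):

package main

import (
	"fmt"

	"example.com/crawler" // placeholder import path
)

func main() {
	s := crawler.NewURLStorage()

	for _, u := range []string{
		"https://example.com/",
		"https://example.com/about",
		"https://example.com/", // duplicate, skipped below
	} {
		if s.Seen(u) {
			continue // skip URLs that were already added
		}
		s.Add(u)
	}

	// Prints each stored URL once.
	s.Iterate(func(u string) { fmt.Println(u) })
}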
