Documentation ¶
Index ¶
- func NewFromHTTPResponse(r *http.Response) (*models.PageReport, *html.Node, error)
- func NewHTMLParser(u *url.URL, status int, headers *http.Header, body []byte) (*models.PageReport, *html.Node, error)
- type BasicAuthClient
- type Client
- type ClientOptions
- type Crawler
- type CrawlerClient
- type HttpCrawler
- type Options
- type Parser
- type Queue
- type RequestMessage
- type ResponseMessage
- type RobotsChecker
- type SitemapChecker
- type URLStorage
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func NewFromHTTPResponse ¶
Create a new PageReport from an http.Response.
Types ¶
type BasicAuthClient ¶
type BasicAuthClient struct { Options *ClientOptions // contains filtered or unexported fields }
func NewClient ¶
func NewClient(options *ClientOptions) *BasicAuthClient
func (*BasicAuthClient) Do ¶
Performs a request and returns its response and error. It sets the client's User-Agent as well as the BasicAuth details if they are available.
type ClientOptions ¶
type Crawler ¶
type Crawler struct {
// contains filtered or unexported fields
}
func (*Crawler) RobotstxtExists ¶
Returns true if the robots.txt file exists.
func (*Crawler) SitemapExists ¶
Returns true if the sitemap.xml file exists.
func (*Crawler) SitemapIsBlocked ¶
Returns true if any of the website's sitemaps is blocked in the robots.txt file.
func (*Crawler) Stream ¶
func (c *Crawler) Stream() <-chan *models.PageReportMessage
Returns the PageReportMessage channel that streams all generated PageReports into a PageReportMessage struct.
type CrawlerClient ¶
type HttpCrawler ¶
type HttpCrawler struct {
// contains filtered or unexported fields
}
func New ¶
func New(client Client, urlStream <-chan *RequestMessage) *HttpCrawler
func (*HttpCrawler) Crawl ¶
func (c *HttpCrawler) Crawl(ctx context.Context) <-chan *ResponseMessage
Crawl starts crawling the URLs received in the urlStream channel and sends ResponseMessage of the crawled URLs through the rStream channel. It will end when the context is cancelled.
type Queue ¶
type Queue struct {
// contains filtered or unexported fields
}
func (*Queue) Push ¶
func (q *Queue) Push(value *RequestMessage)
Adds a new value to the queue's end.
type RequestMessage ¶
type ResponseMessage ¶
type RobotsChecker ¶
type RobotsChecker struct {
// contains filtered or unexported fields
}
func NewRobotsChecker ¶
func NewRobotsChecker(client Client, ua string) *RobotsChecker
func (*RobotsChecker) Exists ¶
func (r *RobotsChecker) Exists(u *url.URL) bool
Returns true if the robots.txt file exists and is valid.
func (*RobotsChecker) GetSitemaps ¶
func (r *RobotsChecker) GetSitemaps(u *url.URL) []string
Returns a list of sitemaps found in the robots.txt file.
type SitemapChecker ¶
type SitemapChecker struct {
// contains filtered or unexported fields
}
func NewSitemapChecker ¶
func NewSitemapChecker(client Client, limit int) *SitemapChecker
func (*SitemapChecker) ParseSitemaps ¶
func (sc *SitemapChecker) ParseSitemaps(URLs []string, callback func(u string))
Parse the sitemaps using a callback function on each entry. For each URL provided, check if it's an index sitemap.
func (*SitemapChecker) SitemapExists ¶
func (sc *SitemapChecker) SitemapExists(URLs []string) bool
Check if any of the sitemap URLs provided exist.
type URLStorage ¶
type URLStorage struct {
// contains filtered or unexported fields
}
func NewURLStorage ¶
func NewURLStorage() *URLStorage
func (*URLStorage) Iterate ¶
func (s *URLStorage) Iterate(f func(string))
Iterate over the seen map, applying the provided function f to the iteration's current element.
func (*URLStorage) Seen ¶
func (s *URLStorage) Seen(u string) bool
Returns true if a URL string has already been added.