crawler

package
v0.0.0-...-eb77424
Published: Apr 25, 2024 License: MIT Imports: 23 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func NewFromHTTPResponse

func NewFromHTTPResponse(r *http.Response) (*models.PageReport, *html.Node, error)

Creates a new PageReport and parsed HTML document from an http.Response.

func NewHTMLParser

func NewHTMLParser(u *url.URL, status int, headers *http.Header, body []byte) (*models.PageReport, *html.Node, error)

Returns a new PageReport and the parsed HTML document built from the given URL, status code, headers, and body.
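
A minimal usage sketch for the parsing functions (the import path example.com/crawler is a placeholder for this module's real path):

package main

import (
	"fmt"
	"log"
	"net/http"

	"example.com/crawler" // placeholder import path
)

func main() {
	resp, err := http.Get("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// NewFromHTTPResponse builds the PageReport and the parsed HTML
	// document directly from the response. NewHTMLParser is the
	// lower-level variant that takes the URL, status, headers and
	// body separately.
	report, doc, err := crawler.NewFromHTTPResponse(resp)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("report: %+v, parsed HTML: %t\n", report, doc != nil)
}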

Types

type BasicAuthClient

type BasicAuthClient struct {
	Options *ClientOptions
	// contains filtered or unexported fields
}

func NewClient

func NewClient(options *ClientOptions) *BasicAuthClient

func (*BasicAuthClient) Do

func (c *BasicAuthClient) Do(req *http.Request) (*http.Response, error)

Performs the request and returns the response or an error. It sets the client's User-Agent header, as well as the BasicAuth credentials if they are available.

func (*BasicAuthClient) Get

func (c *BasicAuthClient) Get(u string) (*http.Response, error)

Makes a GET request to a URL and returns the HTTP response or an error.

func (*BasicAuthClient) Head

func (c *BasicAuthClient) Head(u string) (*http.Response, error)

Makes a HEAD request to a URL and returns the HTTP response or an error.

type Client

type Client interface {
	Get(u string) (*http.Response, error)
	Head(u string) (*http.Response, error)
	Do(req *http.Request) (*http.Response, error)
}

type ClientOptions

type ClientOptions struct {
	UserAgent        string
	BasicAuth        bool
	BasicAuthDomains []string
	AuthUser         string
	AuthPass         string
}
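
A sketch of constructing a client and issuing requests (the import path is a placeholder and the credentials are illustrative):

package main

import (
	"fmt"
	"log"

	"example.com/crawler" // placeholder import path
)

func main() {
	client := crawler.NewClient(&crawler.ClientOptions{
		UserAgent:        "MyCrawler/1.0",
		BasicAuth:        true,
		BasicAuthDomains: []string{"example.com"},
		AuthUser:         "user",
		AuthPass:         "secret",
	})

	// Get applies the configured User-Agent and, presumably for the
	// domains listed in BasicAuthDomains, the BasicAuth credentials.
	resp, err := client.Get("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.StatusCode)
}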

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(url *url.URL, options *Options) *Crawler

func (*Crawler) RobotstxtExists

func (c *Crawler) RobotstxtExists() bool

Returns true if the robots.txt file exists.

func (*Crawler) SitemapExists

func (c *Crawler) SitemapExists() bool

Returns true if the sitemap.xml file exists.

func (*Crawler) SitemapIsBlocked

func (c *Crawler) SitemapIsBlocked() bool

Returns true if any of the website's sitemaps is blocked by the robots.txt file.

func (*Crawler) Stop

func (c *Crawler) Stop()

Stops the crawler by canceling the crawler's context.

func (*Crawler) Stream

func (c *Crawler) Stream() <-chan *models.PageReportMessage

Returns a channel that streams each generated PageReport wrapped in a PageReportMessage struct.
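
A sketch of a full crawl using the Options type defined below (the import path and option values are illustrative, and it is assumed the stream channel is closed when the crawl ends):

package main

import (
	"fmt"
	"log"
	"net/url"

	"example.com/crawler" // placeholder import path
)

func main() {
	u, err := url.Parse("https://example.com")
	if err != nil {
		log.Fatal(err)
	}

	c := crawler.NewCrawler(u, &crawler.Options{
		MaxPageReports: 100,
		UserAgent:      "MyCrawler/1.0",
		CrawlSitemap:   true,
	})

	// Consume the report stream; this assumes the channel is closed
	// once the crawl finishes or Stop is called.
	count := 0
	for range c.Stream() {
		count++
	}

	fmt.Println("pages crawled:", count)
	fmt.Println("robots.txt found:", c.RobotstxtExists())
	fmt.Println("sitemap found:", c.SitemapExists())
}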

type CrawlerClient

type CrawlerClient interface {
	Get(u string) (*http.Response, error)
	Head(u string) (*http.Response, error)
	Do(req *http.Request) (*http.Response, error)
}

type HttpCrawler

type HttpCrawler struct {
	// contains filtered or unexported fields
}

func New

func New(client Client, urlStream <-chan *RequestMessage) *HttpCrawler

func (*HttpCrawler) Crawl

func (c *HttpCrawler) Crawl(ctx context.Context) <-chan *ResponseMessage

Crawl starts crawling the URLs received on the urlStream channel and sends a ResponseMessage for each crawled URL through the returned channel. It ends when the context is cancelled.
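
A sketch of driving HttpCrawler directly with a request stream (placeholder import path; it is an assumption that the response channel closes once the URL stream is drained or the context is cancelled):

package main

import (
	"context"
	"fmt"
	"log"

	"example.com/crawler" // placeholder import path
)

func main() {
	client := crawler.NewClient(&crawler.ClientOptions{UserAgent: "MyCrawler/1.0"})

	urlStream := make(chan *crawler.RequestMessage)
	hc := crawler.New(client, urlStream)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Feed one request, then close the stream.
	go func() {
		urlStream <- &crawler.RequestMessage{URL: "https://example.com/", Depth: 0}
		close(urlStream)
	}()

	for msg := range hc.Crawl(ctx) {
		if msg.Error != nil {
			log.Println(msg.URL, msg.Error)
			continue
		}
		fmt.Println(msg.URL, msg.Response.StatusCode)
		msg.Response.Body.Close()
	}
}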

type Options

type Options struct {
	MaxPageReports     int
	IgnoreRobotsTxt    bool
	FollowNofollow     bool
	IncludeNoindex     bool
	UserAgent          string
	CrawlSitemap       bool
	AllowSubdomains    bool
	BasicAuth          bool
	AuthUser           string
	AuthPass           string
	CheckExternalLinks bool
}

type Parser

type Parser struct {
	ParsedURL *url.URL
	Headers   *http.Header
	// contains filtered or unexported fields
}

type Queue

type Queue struct {
	// contains filtered or unexported fields
}

func NewQueue

func NewQueue(ctx context.Context) *Queue

func (*Queue) Ack

func (q *Queue) Ack(s string)

Acknowledges that a message has been processed.

func (*Queue) Active

func (q *Queue) Active() bool

Active returns true if the queue is not empty or has active elements.

func (*Queue) Count

func (q *Queue) Count() int

Returns the number of items currently in the queue.

func (*Queue) Poll

func (q *Queue) Poll() *RequestMessage

Returns the first element in the queue.

func (*Queue) Push

func (q *Queue) Push(value *RequestMessage)

Adds a new value to the end of the queue.
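
A sketch of the Push, Poll, Ack workflow (placeholder import path; it is an assumption that Ack takes the message's URL string):

package main

import (
	"context"
	"fmt"

	"example.com/crawler" // placeholder import path
)

func main() {
	q := crawler.NewQueue(context.Background())

	q.Push(&crawler.RequestMessage{URL: "https://example.com/", Depth: 0})
	q.Push(&crawler.RequestMessage{URL: "https://example.com/about", Depth: 1})

	fmt.Println("queued:", q.Count()) // 2

	// The queue stays Active until every polled message is acked.
	for q.Active() {
		msg := q.Poll()
		fmt.Println("processing:", msg.URL)
		// Assumption: the Ack argument is the message's URL.
		q.Ack(msg.URL)
	}
}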

type RequestMessage

type RequestMessage struct {
	URL   string
	Depth int
}

type ResponseMessage

type ResponseMessage struct {
	URL      string
	Response *http.Response
	Error    error
	Depth    int
}

type RobotsChecker

type RobotsChecker struct {
	// contains filtered or unexported fields
}

func NewRobotsChecker

func NewRobotsChecker(client Client, ua string) *RobotsChecker

func (*RobotsChecker) Exists

func (r *RobotsChecker) Exists(u *url.URL) bool

Returns true if the robots.txt file exists and is valid.

func (*RobotsChecker) GetSitemaps

func (r *RobotsChecker) GetSitemaps(u *url.URL) []string

Returns a list of sitemaps found in the robots.txt file.

func (*RobotsChecker) IsBlocked

func (r *RobotsChecker) IsBlocked(u *url.URL) bool

Returns true if the URL is blocked by robots.txt.
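
A sketch of checking robots.txt rules for a URL (placeholder import path; the user agent string is illustrative):

package main

import (
	"fmt"
	"log"
	"net/url"

	"example.com/crawler" // placeholder import path
)

func main() {
	client := crawler.NewClient(&crawler.ClientOptions{UserAgent: "MyCrawler/1.0"})
	rc := crawler.NewRobotsChecker(client, "MyCrawler/1.0")

	u, err := url.Parse("https://example.com/private/page")
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("robots.txt exists:", rc.Exists(u))
	fmt.Println("blocked:", rc.IsBlocked(u))
	fmt.Println("sitemaps:", rc.GetSitemaps(u))
}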

type SitemapChecker

type SitemapChecker struct {
	// contains filtered or unexported fields
}

func NewSitemapChecker

func NewSitemapChecker(client Client, limit int) *SitemapChecker

func (*SitemapChecker) ParseSitemaps

func (sc *SitemapChecker) ParseSitemaps(URLs []string, callback func(u string))

Parses the sitemaps, calling the callback function on each entry. For each URL provided, it checks whether it is an index sitemap.

func (*SitemapChecker) SitemapExists

func (sc *SitemapChecker) SitemapExists(URLs []string) bool

Returns true if any of the provided sitemap URLs exist.
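
A sketch of validating and parsing sitemaps (placeholder import path; it is an assumption that the limit argument caps the number of parsed URLs):

package main

import (
	"fmt"

	"example.com/crawler" // placeholder import path
)

func main() {
	client := crawler.NewClient(&crawler.ClientOptions{UserAgent: "MyCrawler/1.0"})
	// Assumption: the second argument limits how many URLs are parsed.
	sc := crawler.NewSitemapChecker(client, 500)

	sitemaps := []string{"https://example.com/sitemap.xml"}

	if sc.SitemapExists(sitemaps) {
		sc.ParseSitemaps(sitemaps, func(u string) {
			fmt.Println("sitemap entry:", u)
		})
	}
}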

type URLStorage

type URLStorage struct {
	// contains filtered or unexported fields
}

func NewURLStorage

func NewURLStorage() *URLStorage

func (*URLStorage) Add

func (s *URLStorage) Add(u string)

Adds a URL string to the set of seen URLs.

func (*URLStorage) Iterate

func (s *URLStorage) Iterate(f func(string))

Iterates over the seen URLs, applying the provided function f to each element.

func (*URLStorage) Seen

func (s *URLStorage) Seen(u string) bool

Returns true if a URL string has already been added.
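
A sketch of deduplicating URLs with URLStorage (placeholder import path):

package main

import (
	"fmt"

	"example.com/crawler" // placeholder import path
)

func main() {
	s := crawler.NewURLStorage()

	for _, u := range []string{
		"https://example.com/",
		"https://example.com/about",
		"https://example.com/", // duplicate, skipped below
	} {
		if s.Seen(u) {
			continue // skip URLs that were already added
		}
		s.Add(u)
	}

	// Prints each stored URL once.
	s.Iterate(func(u string) { fmt.Println(u) })
}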
