Documentation ¶
Index ¶
- func MatchURLRule(rule, url string) bool
- type InvalidRobots
- type RobotDenied
- type RobotFile
- func (l *RobotFile) Allowed(userAgent, url string) bool
- func (l *RobotFile) GetDelay(userAgent string, defaultDelay time.Duration) time.Duration
- func (l *RobotFile) GetSitemap(userAgent string, client http.RoundTripper) (*Sitemap, error)
- func (l *RobotFile) GetUserAgentRules(userAgent string) *UserAgentRules
- type RobotRules
- type Sitemap
- type SitemapLocation
- type UserAgentRules
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func MatchURLRule ¶
func MatchURLRule(rule, url string) bool
MatchURLRule returns true if the given robot exclusion rule matches the given URL. Supports wildcards ('*') and the end-of-line anchor ('$').
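A brief sketch of checking rules against concrete paths (the rules and URLs are hypothetical; only the signature above is assumed, and the expected results follow standard robots.txt matching):

	// '*' matches any run of characters, '$' anchors the end of the URL.
	m1 := MatchURLRule("/private/*", "/private/data.html") // presumably true: prefix plus wildcard
	m2 := MatchURLRule("/*.php$", "/index.php")            // presumably true: ends in .php
	m3 := MatchURLRule("/*.php$", "/index.html")           // presumably false
	_, _, _ = m1, m2, m3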
Types ¶
type InvalidRobots ¶
InvalidRobots indicates an invalid robots.txt file.
func (InvalidRobots) Error ¶
func (e InvalidRobots) Error() string
type RobotDenied ¶
RobotDenied indicates a request was denied by a site's robots.txt file.
func (RobotDenied) Error ¶
func (e RobotDenied) Error() string
type RobotFile ¶
type RobotFile struct {
// contains filtered or unexported fields
}
RobotFile holds all the information in a robots exclusion file.
func NewRobotFileFromReader ¶
NewRobotFileFromReader parses a robot exclusion file from an io.Reader. Returns an error if it encounters an invalid directive.
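A minimal sketch of parsing an in-memory robots.txt. The exact signature is not shown on this page; an io.Reader argument and a (*RobotFile, error) return are assumed here (strings and log imports omitted):

	r := strings.NewReader("User-agent: *\nDisallow: /private/\n")
	robotFile, err := NewRobotFileFromReader(r) // assumed signature
	if err != nil {
		// the file contained an invalid directive
		log.Fatal(err)
	}
	_ = robotFile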
func NewRobotFileFromURL ¶
func (*RobotFile) Allowed ¶
func (l *RobotFile) Allowed(userAgent, url string) bool
Allowed returns true if the user agent is allowed to access the given url.
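Assuming a parsed *RobotFile named robotFile, a check might look like this (the user agent and URL are hypothetical):

	if robotFile.Allowed("mybot", "https://example.com/private/data.html") {
		// fetch the page
	} else {
		// skip it; disallowed by robots.txt
	}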
func (*RobotFile) GetDelay ¶
func (l *RobotFile) GetDelay(userAgent string, defaultDelay time.Duration) time.Duration
GetDelay returns the User-agent specific crawl-delay if it exists, otherwise the catch-all delay. Returns defaultDelay if neither a specific nor a global crawl-delay exists.
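A sketch of resolving a crawl delay with a fallback; the 5-second default is an arbitrary choice for illustration (time import omitted):

	delay := robotFile.GetDelay("mybot", 5*time.Second)
	time.Sleep(delay) // wait between requests to the host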
func (*RobotFile) GetSitemap ¶
func (l *RobotFile) GetSitemap(userAgent string, client http.RoundTripper) (*Sitemap, error)
GetSitemap returns the sitemap for the given User-agent. Returns the default sitemap if no User-agent specific sitemap was specified, or nil if neither exists.
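A sketch of fetching the sitemap advertised for a user agent, using the default HTTP transport (net/http and log imports omitted; error handling kept minimal):

	sm, err := robotFile.GetSitemap("mybot", http.DefaultTransport)
	if err != nil {
		log.Fatal(err)
	}
	_ = sm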
func (*RobotFile) GetUserAgentRules ¶
func (l *RobotFile) GetUserAgentRules(userAgent string) *UserAgentRules
GetUserAgentRules gets the rules for the given userAgent. It returns the default (*) group if one was present and no other group applies, or nil if no group applies and no default group was supplied.
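A sketch of looking up the rule group for a user agent and checking a path against it (names are illustrative):

	rules := robotFile.GetUserAgentRules("mybot")
	if rules == nil {
		// no group applies and no default (*) group was supplied
		return
	}
	ok := rules.Allowed("/private/data.html") // see UserAgentRules.Allowed
	_ = ok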
type RobotRules ¶
type RobotRules struct {
// contains filtered or unexported fields
}
RobotRules holds the robot exclusions for multiple domains.
func NewRobotRules ¶
func NewRobotRules() *RobotRules
NewRobotRules instantiates a new robot limit cache.
func (*RobotRules) AddLimits ¶
func (c *RobotRules) AddLimits(robotFile *RobotFile, host string)
AddLimits adds or replaces the limits for a host.
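A sketch of caching per-host rules, assuming robotFile was parsed earlier (the host name is hypothetical):

	cache := NewRobotRules()
	cache.AddLimits(robotFile, "example.com")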
func (*RobotRules) Allowed ¶
Allowed returns true if the userAgent is allowed to access the given path on the given domain. Returns an error if no robot file is cached for the given domain.
func (*RobotRules) GetRulesForHost ¶
func (c *RobotRules) GetRulesForHost(host string) (*RobotFile, error)
GetRulesForHost gets the rules for a host. Returns an error when no limits are cached for the given host.
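A sketch of reading cached rules back out (the host must have been added with AddLimits first; log import omitted):

	cached, err := cache.GetRulesForHost("example.com")
	if err != nil {
		// no robots.txt cached for this host yet
		log.Println(err)
		return
	}
	_ = cached.Allowed("mybot", "https://example.com/private/") // see RobotFile.Allowed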
type Sitemap ¶
type Sitemap struct {
	Index  []SitemapLocation `xml:"sitemap"`
	URLSet []SitemapLocation `xml:"url"`
}
func NewSitemap ¶
func NewSitemap() *Sitemap
func NewSitemapFromURL ¶
func NewSitemapFromURL(url string, client http.RoundTripper) (*Sitemap, error)
func (*Sitemap) GetLocations ¶
func (s *Sitemap) GetLocations(client http.RoundTripper, limit int) ([]SitemapLocation, error)
GetLocations gets up to limit sitemap locations. Sitemaps usually come in pages of 50,000 entries, so the limit may be exceeded by up to 49,999 entries.
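A sketch of downloading a sitemap and collecting its locations; the sitemap URL and the 100_000 limit are illustrative, and because sitemaps are paged, slightly more than the requested limit may come back (net/http and log imports omitted):

	sm, err := NewSitemapFromURL("https://example.com/sitemap.xml", http.DefaultTransport)
	if err != nil {
		log.Fatal(err)
	}
	locs, err := sm.GetLocations(http.DefaultTransport, 100_000)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("got %d locations", len(locs))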
type SitemapLocation ¶
type UserAgentRules ¶
type UserAgentRules struct {
// contains filtered or unexported fields
}
UserAgentRules holds limits for a single user agent.
func (*UserAgentRules) Allowed ¶
func (g *UserAgentRules) Allowed(url string) bool
Allowed returns true if the url is allowed by the group rules. Check first whether the group applies to the user agent by using Applies.
func (*UserAgentRules) Applies ¶
func (g *UserAgentRules) Applies(userAgent string) bool
Applies returns true if the group applies to the given userAgent.
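A sketch of combining Applies and Allowed when walking a set of groups (groups is a hypothetical slice of *UserAgentRules; the user agent and path are illustrative):

	for _, g := range groups {
		if !g.Applies("mybot") {
			continue
		}
		if g.Allowed("/private/data.html") {
			// this group permits the path for "mybot"
		}
	}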