crawler

package
v0.2.3 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 5, 2022 License: Apache-2.0 Imports: 13 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func GetSitemapUrls

func GetSitemapUrls(sitemapURL string) (urls []*url.URL, err error)

GetSitemapUrls returns all URLs found in the sitemap passed as parameter. This function will only retrieve URLs in the sitemap pointed to, and in sitemaps directly listed (i.e. only 1 level deep or less)

func GetSitemapUrlsAsStrings

func GetSitemapUrlsAsStrings(sitemapURL string) (urls []string, err error)

GetSitemapUrlsAsStrings returns all URLs found, as strings, from the sitemap passed as parameter. This function will only retrieve URLs in the sitemap pointed to, and in sitemaps directly listed (i.e. only 1 level deep or less)

func PrintJSONSummary added in v0.1.1

func PrintJSONSummary(stats CrawlStats)

PrintJSONSummary prints a summary of HTTP response codes in JSON format

func PrintResult added in v0.1.1

func PrintResult(result *HTTPResponse)

PrintResult will print information relative to the HTTPResponse

func PrintSummary

func PrintSummary(stats CrawlStats)

PrintSummary prints a summary of HTTP response codes

func RunConcurrentGet added in v0.1.1

func RunConcurrentGet(httpGet HTTPGetter, urls []string, config HTTPConfig,
	maxConcurrent int, resultChan chan<- *HTTPResponse, quit <-chan struct{})

RunConcurrentGet runs multiple HTTP requests in parallel, and returns the result in resultChan

Types

type BaseConcurrentHTTPGetter added in v0.1.1

type BaseConcurrentHTTPGetter struct {
	Get HTTPGetter
}

BaseConcurrentHTTPGetter implements the ConcurrentHTTPGetter interface using the net/http package

func (*BaseConcurrentHTTPGetter) ConcurrentHTTPGet added in v0.1.1

func (getter *BaseConcurrentHTTPGetter) ConcurrentHTTPGet(urls []string, config HTTPConfig,
	maxConcurrent int, quit <-chan struct{}) <-chan *HTTPResponse

ConcurrentHTTPGet will GET the urls passed and return the results of the crawling

type ConcurrentHTTPGetter added in v0.1.1

type ConcurrentHTTPGetter interface {
	ConcurrentHTTPGet(urls []string, config HTTPConfig, maxConcurrent int,
		quit <-chan struct{}) <-chan *HTTPResponse
}

ConcurrentHTTPGetter allows concurrent execution of an HTTPGetter

type CrawlConfig

type CrawlConfig struct {
	Throttle   int
	Host       string
	HTTP       HTTPConfig
	Links      CrawlLinksConfig
	HTTPGetter ConcurrentHTTPGetter
}

CrawlConfig holds crawling configuration.

type CrawlLinksConfig added in v0.2.0

type CrawlLinksConfig struct {
	CrawlExternalLinks bool
	CrawlHyperlinks    bool
	CrawlImages        bool
}

CrawlLinksConfig holds the crawling policy for links

type CrawlResult added in v0.1.1

type CrawlResult struct {
	URL         string        `json:"url"`
	StatusCode  int           `json:"status-code"`
	Time        time.Duration `json:"server-time"`
	LinkingURLs []string      `json:"linking-urls"`
}

CrawlResult is the result from a single crawling

type CrawlStats

type CrawlStats struct {
	Total          int
	StatusCodes    map[int]int
	Average200Time time.Duration
	Max200Time     time.Duration
	Non200Urls     []CrawlResult
}

CrawlStats holds crawling related information: status codes, time and totals

func AsyncCrawl

func AsyncCrawl(urls []string, config CrawlConfig, quit <-chan struct{}) (stats CrawlStats, err error)

AsyncCrawl crawls asynchronously URLs from a sitemap and prints related information. Throttle is the maximum number of parallel HTTP requests. Host overrides the hostname used in the sitemap if provided, and user/pass are optional basic auth credentials

func MergeCrawlStats

func MergeCrawlStats(statsA, statsB CrawlStats) (stats CrawlStats)

MergeCrawlStats merges two sets of crawling statistics together.

type HTTPConfig

type HTTPConfig struct {
	User       string
	Pass       string
	Timeout    time.Duration
	ParseLinks bool
}

HTTPConfig holds settings used to get pages via HTTP/S

type HTTPGetter added in v0.1.1

type HTTPGetter func(url string, config HTTPConfig) (response *HTTPResponse)

HTTPGetter performs a single HTTP/S request to the url, and returns information related to the result as an HTTPResponse

type HTTPResponse

type HTTPResponse struct {
	URL        string
	Response   *http.Response
	Result     *httpstat.Result
	StatusCode int
	EndTime    time.Time
	Err        error
	Links      []Link
}

HTTPResponse holds information from a GET to a specific URL

func HTTPGet

func HTTPGet(urlStr string, config HTTPConfig) (response *HTTPResponse)

HTTPGet issues a GET request to a single URL and returns an HTTPResponse

type Link struct {
	Type       LinkType
	Name       string
	TargetURL  url.URL
	IsExternal bool
}

Link holds information about a URL link found while crawling

func ExtractLinks(htmlBody io.ReadCloser, currentURL url.URL) ([]Link, error)

ExtractLinks returns the links found in the html page provided. The currentURL parameter is used to differentiate between internal and external links

type LinkType added in v0.2.0

type LinkType int

LinkType represents the type of link to crawl

const (
	// Hyperlink is html 'a' tag
	Hyperlink LinkType = 0
	// Image is html 'img' tag
	Image LinkType = 1
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL