Documentation ¶
Index ¶
Constants ¶
View Source
// UserAgent is the default User-Agent header value sent with crawler
// requests; it identifies the crawler as Googlebot-compatible.
const UserAgent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
Default user agent.
Variables ¶
View Source
// Sentinel errors returned by the crawler; compare with errors.Is.
var (
	// ErrMimeType is returned when a response's MIME type is not allowed.
	ErrMimeType = errors.New("MIME type not supported")
	// ErrInvalidChar is returned when a crawl result contains invalid
	// characters. (Message lowercased per Go error-string convention.)
	ErrInvalidChar = errors.New("crawler result contains invalid characters")
)
Errors
View Source
// Precompiled regular expressions used by the crawler.
var (
	// ValidUrl matches an http:// or https:// URL.
	ValidUrl = regexp.MustCompile(`https?://[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*`)
	// ValidPageTitle matches any non-empty, single-line page title.
	ValidPageTitle = regexp.MustCompile(`^(.)+$`)
)
Some regular expressions
View Source
// AllowedMimeTypes is the set of Content-Type values the crawler accepts;
// a true value marks the type as processable.
var AllowedMimeTypes = map[string]bool{
	"text/html; charset=utf-8": true,
}
Map of allowed mime types
Functions ¶
func ExtractUrl ¶
Types ¶
type Client ¶
// Client holds the configuration used when performing crawl requests.
type Client struct {
	// Dial opens the network connection used for requests.
	Dial func(network, addr string) (net.Conn, error)
	// UserAgent is the User-Agent header value sent with requests.
	UserAgent string
	// UserName and PassWord are credentials for authenticating with
	// the web server.
	UserName string
	PassWord string
}
Client structure
type CrawlResult ¶
// CrawlResult stores information extracted from a crawled page.
type CrawlResult struct {
	// Title is the page title, decoded from <head><title>.
	Title string `xml:"head>title"`
	// Desc string `xml:"head>meta"` // page description — currently disabled
	// Size is the size of the web page.
	Size int
}
Store some information about the page
Click to show internal directories.
Click to hide internal directories.