Documentation ¶
Index ¶
- Constants
- func Crawling(surl string) (ResponseBodyString string, StatusCode int, ContentType string)
- func DomArrayToUrl(cU CUrl, a [][]string, cH chan<- CUrl, tM map[string]int)
- func ExtractBody(s string) ([][]string, [][]string)
- func GetDomainHost(u string) (string, string, error)
- func GetFromRedirectUrl(lu string, rn int) (string, int, string)
- func GetUrlFromLocation(resp http.Response) string
- func IterCrawl(cu CUrl, tM map[string]int, cH chan<- CUrl, fA *[]CUrl, eA *[]CUrl, ...)
- func LanuchCrawl(rla []string, lp string, rp string)
- func PutChannel(cu CUrl, ch chan<- CUrl)
- func ReArrayToUrl(cU CUrl, a [][]string, cH chan<- CUrl, tM map[string]int)
- func ReDomainMatch(s string) bool
- func ReHaveMoreSlash(s string) bool
- func ReHaveSinlgeSlash(s string) bool
- func ReHrefSubMatch(s string) [][]string
- func ReIsHttp(s string) bool
- func ReIsLink(s string) bool
- func ReLinkSubMatch(s string) [][]string
- func ReSrcSubMatch(s string) [][]string
- func ReadJsonConfig(tm map[string]int, rdl []string) []string
- func SpaceMap(str string) string
- func StatAndCreate(p string) error
- func StitchDomain(s string, h string) string
- func StitchUrl(DomainString string, PathString string) (UString string)
- func UrlToChMAP(cu CUrl, ch chan<- CUrl, tm map[string]int)
- type CUrl
- type ConfigJson
Constants ¶
View Source
// Regular-expression sources used by the crawler's link-extraction and
// URL-classification helpers.
const (
	// PATTERN_SRC matches a double-quoted src attribute (src="...") and
	// lazily captures the text between the quotes.
	PATTERN_SRC = `src=\"(.*?)\"`
	// PATTERN_HERF matches a double-quoted href attribute (href="...") and
	// lazily captures the text between the quotes. The identifier is spelled
	// "HERF" (sic); it is part of the exported API.
	PATTERN_HERF = `href=\"(.*?)\"`
	// PATTERN_HTTP matches any string that begins with "http". The trailing
	// lazy group may match the empty string, so it adds no constraint.
	PATTERN_HTTP = `^http(.*?)`
	// PATTERN_LINK matches a string that begins with an absolute URL:
	// "http://" or "https://", 1-256 host characters, a dot, 2-6 lowercase
	// letters, a word boundary, and then an optional captured run of
	// path/query characters. The pattern is not anchored at the end.
	PATTERN_LINK = `^https?:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)`
	// PATTERN_SINGLE_SLASH matches "/" on its own, or a string that starts
	// with one slash immediately followed by a non-slash character.
	PATTERN_SINGLE_SLASH = `^/([^/].*)?$`
	// PATTERN_MORE_SLASH matches any string that begins with "//".
	PATTERN_MORE_SLASH = `^//(.*?)`
	// ALLOW_DOMAIN lists the domains accepted by the crawler (presumably
	// applied by ReDomainMatch; confirm against the implementation).
	// NOTE(review): the dots are unescaped, so each one matches any
	// character, and the pattern itself carries no anchors.
	ALLOW_DOMAIN = `(qiniu.com)|(qiniu.com.cn)`
)
Variables ¶
This section is empty.
Functions ¶
func GetFromRedirectUrl ¶
检查重定向是否正确 (Checks whether the redirect is correct.)
func GetUrlFromLocation ¶
func LanuchCrawl ¶
func ReArrayToUrl ¶
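读取数组内的路径,处理为完整 URL;如果该 URL 不在 map 里,则放入 ch 和 map。 (Reads the paths in the array and turns each into a full URL; if the URL is not already in the map, it is put into both ch and the map.)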
func ReLinkSubMatch ¶
func StatAndCreate ¶
Types ¶
type CUrl ¶
// CUrl is the record kept for a single crawled URL. Every exported field
// carries both a json and a bson tag.
type CUrl struct {
	// Id is the record's object ID, stored under the "_id" key in BSON.
	Id bson.ObjectId `json:"id" bson:"_id"`
	// CrawlUrl is presumably the URL that is (or will be) fetched; confirm
	// against the crawler implementation.
	CrawlUrl string `json:"CrawlUrl" bson:"crawl_url"`
	// StatusCode is presumably the HTTP status of the fetch. The package
	// docs also list negative sentinel values: -1 queued but not yet
	// crawled, -2 HTTP request error, -3 timed out reading the channel.
	// NOTE(review): confirm these sentinels are stored in this field.
	StatusCode int `json:"StatusCode" bson:"status_code"`
	// Origin: meaning not visible here; TODO confirm with the implementation.
	Origin string `json:"Origin" bson:"origin"`
	// Domain is presumably the domain the URL belongs to; TODO confirm.
	Domain string `json:"Domain" bson:"domain"`
	// RefUrl is presumably the page on which this URL was found; TODO confirm.
	RefUrl string `json:"RefUrl" bson:"ref_url"`
	// ContentType is presumably the response's Content-Type; TODO confirm.
	ContentType string `json:"ContentType" bson:"content_type"`
	// QueryError is presumably the error text of a failed request; TODO confirm.
	QueryError string `json:"QueryError" bson:"query_error"`
	// contains filtered or unexported fields
}
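-1 链接放入管道未爬取 (the link has been put into the channel but not yet crawled)
-2 http请求报错 (the HTTP request returned an error)
-3 读取管道超时,一般为没有新链接放入管道,自动结束 (reading from the channel timed out, usually because no new links were put into it; the crawl ends automatically)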
type ConfigJson ¶
Click to show internal directories.
Click to hide internal directories.