crawler

package module
v1.1.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 10, 2021 License: BSD-2-Clause Imports: 23 Imported by: 2

Documentation

Index

Constants

This section is empty.

Variables

View Source
// Crawler_CacheDirectory specifies the cache directory; it starts out as the
// empty string.
var Crawler_CacheDirectory string

指定缓存目录

View Source
// Crawler_Capacity is the upper limit on the number of tab pages.
var Crawler_Capacity = 10

标签页面上限

View Source
// Crawler_Headless is a global configuration parameter selecting headless
// mode; it is initialized to true.
var Crawler_Headless = true

全局配置参数 无头模式

View Source
// Crawler_LoadTimeOut is the global timeout, in seconds; it is initialized
// to 30.
var Crawler_LoadTimeOut = 30

全局超时时间(秒)

View Source
// Crawler_WaitTime is the global time to wait after a page has finished
// loading, in milliseconds; it starts out as zero.
var Crawler_WaitTime int

全局页面加载完成后等待时间(毫秒)

View Source
// Default_ResourceType_Allow is a set of network.ResourceType values holding
// the image, script, stylesheet and font resource types.
// NOTE(review): presumably the resource types allowed by default when
// crawling page resources — confirm against the code that reads this map.
var Default_ResourceType_Allow = map[network.ResourceType]struct{}{
	network.ResourceTypeImage:      {},
	network.ResourceTypeScript:     {},
	network.ResourceTypeStylesheet: {},
	network.ResourceTypeFont:       {},
}
View Source
// ERR_INVALID_RESPONSE is the sentinel error for an invalid response (its
// message is the Chinese phrase for "invalid response").
var ERR_INVALID_RESPONSE = errors.New("无效的响应")
View Source
// ERR_INVALID_URL is the sentinel error for an invalid site/URL (its message
// is the Chinese phrase for "invalid website").
var ERR_INVALID_URL = errors.New("无效的网站")
View Source
// ERR_URL_LOAD_FAIL is the sentinel error for a site that failed to load (its
// message is the Chinese phrase for "website failed to load").
var ERR_URL_LOAD_FAIL = errors.New("网站加载失败")
View Source
// ERR_URL_TIMEOUT is the sentinel error for a site that timed out (its
// message is the Chinese phrase for "website has timed out").
var ERR_URL_TIMEOUT = errors.New("网站已超时")

Functions

func ClearCache

func ClearCache() error

清理缓存,预防第一次爬取就304

func ConvertResourceType

func ConvertResourceType(contentType string) network.ResourceType

func Instance

func Instance() *browser

单例对象

func IsValidStatus

func IsValidStatus(statuscode int) bool

func New

func New() *browser

实例对象

func NewPagenation

func NewPagenation(tab *Tab, pagerule string, spdierrule string, data interface{}, f func() *Tab) (*pagenation, error)

func SimpleGet

func SimpleGet(url string) (res []byte, contentType string, statuscode int, err error)

Types

type DocumentInfo

// DocumentInfo collects information about a loaded document: server address,
// timing figures, status code, resource type and the associated resources.
// It is the value type returned by (*Tab).Navigate.
type DocumentInfo struct {
	// Server IP.
	Ip string
	// Server port.
	Port int
	// Response URL.
	RespUrl string
	// DNS load time, in milliseconds.
	DnsTime int
	// Page load time, in milliseconds.
	LoadTime int
	// Site response time, in milliseconds.
	ResponseTime int
	// Status code.
	StatusCode int
	// Resource type.
	ResourceType network.ResourceType
	// Resources associated with the document; can be used for filtering.
	// NOTE(review): presumably keyed by resource URL — confirm.
	Resources map[string]Resource
}

type Resource

// Resource describes a single resource entry; it is the value type of the
// DocumentInfo.Resources map.
type Resource struct {
	// NOTE(review): presumably the Referer of the request that fetched this
	// resource — confirm.
	Referer    string
	// NOTE(review): presumably the URL of the resource itself — confirm.
	Url        string
	// Status code recorded for the resource (presumably the HTTP response
	// status — confirm).
	StatusCode int
	// Resource type, expressed as a network.ResourceType.
	Type       network.ResourceType
	// NOTE(review): presumably the raw resource body — confirm.
	Value      []byte
}

type Tab

// Tab exposes per-tab settings (load timeout, post-load wait time and
// JavaScript dialog handling); its remaining fields are unexported.
type Tab struct {
	LoadTimeOut  int  // seconds
	WaitTime     int  // milliseconds
	AcceptDialog bool // true confirms JavaScript dialogs; false cancels them (default)
	// contains filtered or unexported fields
}

func (*Tab) Close

func (self *Tab) Close()

func (*Tab) DisableCrawlResource

func (self *Tab) DisableCrawlResource() *resourceParams

func (*Tab) Evaluate

func (self *Tab) Evaluate(rule string, v interface{}) error

在当前页面执行脚本

func (*Tab) GetAllLinks

func (self *Tab) GetAllLinks() ([]string, error)

获取当前页面的所有链接

func (*Tab) GetDocument

func (self *Tab) GetDocument() (res []byte, err error)

获取页面元素文本

func (*Tab) GetPdfBytes

func (self *Tab) GetPdfBytes(url string) ([]byte, error)

获取pdf字节数组

func (*Tab) GetSnapShot

func (self *Tab) GetSnapShot(url string) (string, error)

页面截图(TODO:暂时未测试)

func (*Tab) Navigate

func (self *Tab) Navigate(rawUrl string) (doc DocumentInfo, err error)

跳转页面并获取各种页面信息

func (*Tab) NavigateEvaluate

func (self *Tab) NavigateEvaluate(rawUrl string, rule string, v interface{}) (err error)

跳转一个页面,并执行脚本,返回数据给v

func (*Tab) NewPagenation

func (self *Tab) NewPagenation(pagerule string, spdierrule string, data interface{}) (page *pagenation, err error)

建立一个分页对象

func (*Tab) SetLoadTimeOut

func (self *Tab) SetLoadTimeOut(loadtime int) *Tab

设置页面加载超时时间(秒)。默认为0,为0时取browser的超时时间

func (*Tab) SetWaitTime

func (self *Tab) SetWaitTime(waittime int) *Tab

设置页面加载完成后的等待时间(毫秒)。默认为0,为0时取browser的等待时间

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL