spider

package
v0.0.0-...-43f4138 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 10, 2017 License: Apache-2.0 Imports: 12 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// WaitTime is the default wait (pause) time.
	WaitTime = 5

	// Request method selectors without a payload-specific variant.
	GET    = "GET"
	DELETE = "DELETE"
	OTHER  = "OTHER"

	// POST and its JSON / XML / file variants.
	POST     = "POST"
	POSTJSON = "POSTJSON"
	POSTXML  = "POSTXML"
	POSTFILE = "POSTFILE"

	// PUT and its JSON / XML / file variants (noted upstream as implemented).
	PUT     = "PUT"
	PUTJSON = "PUTJSON"
	PUTXML  = "PUTXML"
	PUTFILE = "PUTFILE"

	// Content-Type values for form, JSON, XML and file-upload bodies.
	HTTPFORMContentType = "application/x-www-form-urlencoded"
	HTTPJSONContentType = "application/json"
	HTTPXMLContentType  = "text/xml"
	HTTPFILEContentType = "multipart/form-data"

	// Log level names.
	CRITICAL = "CRITICAL"
	ERROR    = "ERROR"
	WARNING  = "WARNING"
	NOTICE   = "NOTICE"
	INFO     = "INFO"
	DEBUG    = "DEBUG"
)

Variables

View Source
var (
	// Client is the default client used for requests. It keeps cookies in a
	// jar for convenience and sets no timeout.
	Client = &http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			Logger.Debugf("-----------Redirect:%v------------", req.URL)
			// Supplying CheckRedirect replaces net/http's built-in policy of
			// stopping after 10 requests, so enforce the same bound here;
			// otherwise a redirect loop would be followed forever. On hitting
			// the bound the last redirect response is returned to the caller.
			if len(via) >= 10 {
				return http.ErrUseLastResponse
			}
			return nil
		},
		Jar: NewJar(),
	}

	// NoCookieClient is the same client without a cookie jar.
	NoCookieClient = &http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			Logger.Debugf("-----------Redirect:%v------------", req.URL)
			// Same redirect bound as Client; see the comment there.
			if len(via) >= 10 {
				return http.ErrUseLastResponse
			}
			return nil
		},
	}
)
View Source
var (
	// FoxfireLinux is a Firefox-on-Ubuntu User-Agent string. The upstream
	// note marks this default as no longer in use.
	FoxfireLinux = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"

	// SpiderHeader is a header holding only the User-Agent above.
	SpiderHeader = map[string][]string{"User-Agent": {FoxfireLinux}}

	// DefaultTimeOut is the timeout for HTTP GET and POST. Left at its zero
	// value there is no timeout.
	DefaultTimeOut int
)
View Source
var (
	// Pool is the spider pool.
	Pool = &_Spider{brower: map[string]*Spider{}}

	// Ua maps an integer key to a User-Agent string; it starts out empty.
	// NOTE(review): presumably filled by UaInit and read by RandomUa — confirm.
	Ua = make(map[int]string)
)
View Source
// LevelNames lists the log level names, CRITICAL through DEBUG, for reference.
var LevelNames = []string{"CRITICAL", "ERROR", "WARNING", "NOTICE", "INFO", "DEBUG"}

LevelNames lists the log level names you can refer to.

View Source
// Logger is the package-wide (global) logger, created with the name "GoSpider".
var Logger = logging.MustGetLogger("GoSpider")

全局日志

Functions

func CloneHeader

func CloneHeader(h map[string][]string) map[string][]string

CloneHeader clones a header; a copy is needed because the header is a reference. 克隆头部,因为是引用

func CopyM

func CopyM(h http.Header) http.Header

CopyM copies an http.Header (a map[string][]string) so that the original and the copy do not affect each other.

func Log

func Log() *logging.Logger

返回全局对象 return global log

func MergeCookie

func MergeCookie(before []*http.Cookie, after []*http.Cookie) []*http.Cookie

merge Cookie,后来的覆盖前来的 暂时没有用的

func NewClient

func NewClient() (*http.Client, error)

a client 不带代理客户端

func NewHeader

func NewHeader(ua interface{}, host string, refer interface{}) map[string][]string

NewHeader builds a header from the fields a header usually has: ua (browser identifier), host (host name) and refer (referring source). 浏览器标志,主机名,来源

func NewJar

func NewJar() *cookiejar.Jar

cookie record 记录Cookie

func NewProxyClient

func NewProxyClient(proxystring string) (*http.Client, error)

a proxy client 带代理客户端,全部有带cookie

func OutputMaps

func OutputMaps(info string, args map[string][]string)

just debug a map

func RandomUa

func RandomUa() string

返回随机Ua

func SetGlobalTimeout

func SetGlobalTimeout(num int)

超时目前只能这样设置全局

func SetLogLevel

func SetLogLevel(level string)

设置日志级别 set log level

func TooSortSizes

func TooSortSizes(data []byte, sizes float64) error

TooSortSizes returns an error if the file size is smaller than sizes (in KB).

func UaInit

func UaInit()

Ua初始化

func Wait

func Wait(waittime int)

Wait pauses for some seconds.

Types

type Spider

// Spider is the crawler struct. It embeds *SpiderConfig and records the
// state of the most recent fetch.
type Spider struct {
	*SpiderConfig
	Preurl        string        // previous URL: the URL visited by the last request
	Raw           []byte        // raw bytes that were fetched
	UrlStatuscode int           // response status code of the last URL, such as 404
	Client        *http.Client  // the client actually used
	Fetchtimes    int           // number of times a URL has been fetched
	Errortimes    int           // number of failures
	Ipstring      string        // spider IP, kept only so users can record their proxy IP; defaults to localhost when there is no proxy
	Request       *http.Request // exposed to make external debugging easier
	Response      *http.Response
	// contains filtered or unexported fields
}

爬虫结构体

var DefaultSpider *Spider

全局爬虫

func GetSpider

func GetSpider() *Spider

获取默认Spider

func New

func New(ipstring interface{}) (*Spider, error)

新建爬虫别名函数

func NewAPI

func NewAPI() *Spider

API爬虫,不用保存Cookie,可用于对接各种API,但仍然有默认UA

func NewSpider

func NewSpider(ipstring interface{}) (*Spider, error)

新建一个爬虫,如果ipstring是一个代理IP地址,那使用代理客户端

func NewSpiderByClient

func NewSpiderByClient(client *http.Client) *Spider

通过官方Client来新建爬虫,方便您更灵活

func (*Spider) Cookies

func (sp *Spider) Cookies() []*http.Cookie

返回cookie

func (*Spider) Delete

func (sp *Spider) Delete() (body []byte, e error)

func (*Spider) Get

func (sp *Spider) Get() (body []byte, e error)

Get method,can take a client 手动调用方法

func (*Spider) Go

func (sp *Spider) Go() (body []byte, e error)

Go automatically decides which function to call based on the configured method, defaulting to GET. 自动根据方法调用相应函数,默认GET方法

func (*Spider) JsonToString

func (sp *Spider) JsonToString() (string, error)

将抓到的数据变成字符串,但数据是编码的JSON

func (*Spider) NewHeader

func (sp *Spider) NewHeader(ua interface{}, host string, refer interface{})

class method 创建新头部快捷方法

func (*Spider) OtherGo

func (sp *Spider) OtherGo(method, contenttype string) (body []byte, e error)

其他Method

  Method         = "OPTIONS"                ; Section 9.2
                 | "GET"                    ; Section 9.3
                 | "HEAD"                   ; Section 9.4
                 | "POST"                   ; Section 9.5
                 | "PUT"                    ; Section 9.6
                 | "DELETE"                 ; Section 9.7
                 | "TRACE"                  ; Section 9.8
                 | "CONNECT"                ; Section 9.9
                 | extension-method
extension-method = token
  token          = 1*<any CHAR except CTLs or separators>

// content type

"application/x-www-form-urlencoded"
"application/json"
"text/xml"
"multipart/form-data"

func (*Spider) Post

func (sp *Spider) Post() (body []byte, e error)

Post附带信息 can take a client

"application/x-www-form-urlencoded"
"application/json"
"text/xml"
"multipart/form-data"

func (*Spider) PostFILE

func (sp *Spider) PostFILE() (body []byte, e error)

func (*Spider) PostJSON

func (sp *Spider) PostJSON() (body []byte, e error)

func (*Spider) PostXML

func (sp *Spider) PostXML() (body []byte, e error)

func (*Spider) Put

func (sp *Spider) Put() (body []byte, e error)

Put

func (*Spider) PutFILE

func (sp *Spider) PutFILE() (body []byte, e error)

func (*Spider) PutJSON

func (sp *Spider) PutJSON() (body []byte, e error)

func (*Spider) PutXML

func (sp *Spider) PutXML() (body []byte, e error)

func (*Spider) ToString

func (sp *Spider) ToString() string

将抓到的数据变成字符串

type SpiderConfig

// SpiderConfig holds the settings for a single request.
type SpiderConfig struct {
	Url    string      // the URL to fetch this time
	Method string      // request method, e.g. Get or Post
	Header http.Header // request headers
	Data   url.Values  // POST form fields
	BData  []byte      // binary data: the byte stream for a file upload
	Wait   int         // sleep (wait) time
}

func (*SpiderConfig) Clear

func (config *SpiderConfig) Clear() *SpiderConfig

func (*SpiderConfig) ClearAll

func (config *SpiderConfig) ClearAll() *SpiderConfig

func (*SpiderConfig) ClearCookie

func (config *SpiderConfig) ClearCookie() *SpiderConfig

可以删除设置的Cookie

func (*SpiderConfig) SetBData

func (config *SpiderConfig) SetBData(data []byte) *SpiderConfig

func (*SpiderConfig) SetCookie

func (config *SpiderConfig) SetCookie(v string) *SpiderConfig

Cookie 这样设置如果有jar != nil 那么同名cookie会和这个一起发送过去

func (*SpiderConfig) SetForm

func (config *SpiderConfig) SetForm(form url.Values) *SpiderConfig

func (*SpiderConfig) SetFormParm

func (config *SpiderConfig) SetFormParm(k, v string) *SpiderConfig

func (*SpiderConfig) SetHeader

func (config *SpiderConfig) SetHeader(header http.Header) *SpiderConfig

Java Bean链式结构

func (*SpiderConfig) SetHeaderParm

func (config *SpiderConfig) SetHeaderParm(k, v string) *SpiderConfig

func (*SpiderConfig) SetHost

func (config *SpiderConfig) SetHost(host string) *SpiderConfig

func (*SpiderConfig) SetMethod

func (config *SpiderConfig) SetMethod(method string) *SpiderConfig

func (*SpiderConfig) SetRefer

func (config *SpiderConfig) SetRefer(refer string) *SpiderConfig

func (*SpiderConfig) SetUa

func (config *SpiderConfig) SetUa(ua string) *SpiderConfig

func (*SpiderConfig) SetUrl

func (config *SpiderConfig) SetUrl(url string) *SpiderConfig

SetUrl的同时Set一下Host

func (*SpiderConfig) SetWaitTime

func (config *SpiderConfig) SetWaitTime(num int) *SpiderConfig

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL