spider

package
v0.0.0-...-da5a1f3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 4, 2017 License: Apache-2.0 Imports: 13 Imported by: 0

Documentation

Index

Constants

View Source
// WaitTime is the default wait (pause) time.
const WaitTime = 5

// Request method identifiers understood by the package.
const (
	GET = "GET"
	PUT = "PUT"

	// POST and its variants for JSON, XML and file payloads.
	POST     = "POST"
	POSTJSON = "POSTJSON"
	POSTXML  = "POSTXML"
	POSTFILE = "POSTFILE"
)

Variables

View Source
// FoxfireLinux is the default browser User-Agent string.
var FoxfireLinux = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"

// SpiderHeader is the default request header; it carries only the
// default User-Agent.
var SpiderHeader = map[string][]string{"User-Agent": {FoxfireLinux}}

// DefaultTimeOut is the timeout used for HTTP GET and POST requests.
// While it is left at 0 (not set), requests have no timeout.
var DefaultTimeOut = 0
View Source
// Pool is the spider pool. It starts with an empty map from string keys
// to *Spider values.
var Pool = &_Spider{brower: map[string]*Spider{}}

// Ua maps integer keys to strings and starts out empty.
// NOTE(review): presumably holds User-Agent strings filled in by UaInit
// and read by RandomUa — confirm against their implementations.
var Ua = make(map[int]string)
View Source
// Client is the default standard-library client used for GET and POST
// requests. It carries a cookie jar for convenience and sets no timeout;
// a client without a cookie jar is not provided.
var Client = &http.Client{
	Jar: NewJar(),
	// Log each redirect target and always allow it by returning nil.
	// NOTE(review): because this never returns an error, no limit on the
	// number of consecutive redirects is enforced here.
	CheckRedirect: func(next *http.Request, _ []*http.Request) error {
		Logger.Debugf("-----------Redirect:%v------------", next.URL)
		return nil
	},
}
View Source
// LevelNames lists the log level names you can refer to.
var LevelNames = []string{"CRITICAL", "ERROR", "WARNING", "NOTICE", "INFO", "DEBUG"}

LevelNames lists the log level names you can refer to.

View Source
// Logger is the global logger, obtained from logging.MustGetLogger with
// the name "GoSpider".
var Logger = logging.MustGetLogger("GoSpider")

Logger is the global logger.

Functions

func CloneHeader

func CloneHeader(h map[string][]string) map[string][]string

CloneHeader clones a header. A copy is needed because a header map is a reference.

func CopyM

func CopyM(h http.Header) http.Header

CopyM copies an http.Header (a map[string][]string) so that the copy and the original do not affect each other.

func Log

func Log() *logging.Logger

Log returns the global logger.

func MergeCookie

func MergeCookie(before []*http.Cookie, after []*http.Cookie) []*http.Cookie

MergeCookie merges two cookie lists; later cookies override earlier ones. It is currently unused.

func NewClient

func NewClient() (*http.Client, error)

NewClient returns a client that does not use a proxy.

func NewHeader

func NewHeader(ua interface{}, host string, refer interface{}) map[string][]string

NewHeader builds a header from the values a header usually has: the browser User-Agent (ua), the host name (host), and the referer (refer).

func NewJar

func NewJar() *cookiejar.Jar

NewJar returns a cookie jar used to record cookies.

func NewProxyClient

func NewProxyClient(proxystring string) (*http.Client, error)

NewProxyClient returns a client that uses a proxy. All clients carry a cookie jar.

func OutputMaps

func OutputMaps(info string, args map[string][]string)

OutputMaps is a debugging helper that outputs a map.

func RandomUa

func RandomUa() string

RandomUa returns a random User-Agent (Ua) string.

func SetGlobalTimeout

func SetGlobalTimeout(num int)

SetGlobalTimeout sets the timeout. For now, the timeout can only be set globally in this way.

func SetLogLevel

func SetLogLevel(level string)

SetLogLevel sets the log level.

func TooSortSizes

func TooSortSizes(data []byte, sizes float64) error

TooSortSizes returns an error if the file size is smaller than sizes (in KB).

func UaInit

func UaInit()

UaInit initializes Ua.

func Wait

func Wait(waittime int)

Wait waits for some seconds.

Types

type Spider

// Spider is the crawler struct. It embeds *SpiderConfig and records the
// state of the most recent fetch alongside the client used to perform it.
type Spider struct {
	*SpiderConfig
	Preurl        string       // URL visited on the previous request
	Raw           []byte       // raw bytes that were fetched
	UrlStatuscode int          // response status code of the last URL, such as 404
	Client        *http.Client // the client that actually performs requests
	Fetchtimes    int          // number of times a URL has been fetched
	Errortimes    int          // number of failures
	Ipstring      string       // spider IP, only for users to record their proxy IP; defaults to localhost when there is no proxy
	// contains filtered or unexported fields
}

Spider is the crawler struct.

func GetSpider

func GetSpider() *Spider

GetSpider returns the default Spider. TODO: the spider object should be given some JavaBean-style chained methods.

func New

func New(ipstring interface{}) (*Spider, error)

New is an alias function for creating a new spider.

func NewSpider

func NewSpider(ipstring interface{}) (*Spider, error)

NewSpider creates a new spider. If ipstring is a proxy IP address, the proxy client is used.

func NewSpiderByClient

func NewSpiderByClient(client *http.Client) *Spider

NewSpiderByClient creates a new spider from a standard-library http.Client, which gives you more flexibility.

func (*Spider) Get

func (this *Spider) Get() (body []byte, e error)

Get uses the GET method and can take a client. Call it manually when you want to choose the method yourself.

func (*Spider) Go

func (this *Spider) Go() (body []byte, e error)

Go automatically decides which function to call based on the method; the default is the GET method.

func (*Spider) JsonToString

func (this *Spider) JsonToString() (string, error)

JsonToString converts the fetched data to a string, where the data is encoded JSON.

func (*Spider) NewHeader

func (this *Spider) NewHeader(ua interface{}, host string, refer interface{})

NewHeader is a shortcut method for creating a new header.

func (*Spider) Post

func (this *Spider) Post() (body []byte, e error)

Post sends a POST with attached data and can take a client.

func (*Spider) PostFILE

func (this *Spider) PostFILE() (body []byte, e error)

func (*Spider) PostJSON

func (this *Spider) PostJSON() (body []byte, e error)

func (*Spider) PostXML

func (this *Spider) PostXML() (body []byte, e error)

func (*Spider) ToString

func (this *Spider) ToString() string

ToString converts the fetched data to a string.

type SpiderConfig

// SpiderConfig holds the settings for a single fetch: the target URL,
// the request method, the headers, the form or binary payload, and the
// wait time.
type SpiderConfig struct {
	Url    string      // URL to fetch this time
	Method string      // request method, e.g. Get or Post
	Header http.Header // request headers
	Data   url.Values  // POST form data (form fields)
	BData  []byte      // binary data, the byte stream for file upload
	Wait   int         // sleep (wait) time
}

func (*SpiderConfig) Clear

func (config *SpiderConfig) Clear() *SpiderConfig

func (*SpiderConfig) SetBData

func (config *SpiderConfig) SetBData(data []byte) *SpiderConfig

func (*SpiderConfig) SetForm

func (config *SpiderConfig) SetForm(form url.Values) *SpiderConfig

func (*SpiderConfig) SetFormParm

func (config *SpiderConfig) SetFormParm(k, v string) *SpiderConfig

func (*SpiderConfig) SetHeader

func (config *SpiderConfig) SetHeader(header http.Header) *SpiderConfig

SetHeader is one of the JavaBean-style chained setters.

func (*SpiderConfig) SetHeaderParm

func (config *SpiderConfig) SetHeaderParm(k, v string) *SpiderConfig

func (*SpiderConfig) SetHost

func (config *SpiderConfig) SetHost(host string) *SpiderConfig

func (*SpiderConfig) SetMethod

func (config *SpiderConfig) SetMethod(method string) *SpiderConfig

func (*SpiderConfig) SetRefer

func (config *SpiderConfig) SetRefer(refer string) *SpiderConfig

func (*SpiderConfig) SetUa

func (config *SpiderConfig) SetUa(ua string) *SpiderConfig

func (*SpiderConfig) SetUrl

func (config *SpiderConfig) SetUrl(url string) *SpiderConfig

func (*SpiderConfig) SetWaitTime

func (config *SpiderConfig) SetWaitTime(num int) *SpiderConfig

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL