surfer

package module
v1.2.4 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 26, 2022 License: Apache-2.0 Imports: 29 Imported by: 0

README

Surfer GitHub release report card github issues github closed issues GoDoc view Go大数据

Package surfer is a high-level, highly concurrent HTTP client. It provides surf and phantom download engines, closely simulates browser behavior, supports simulated login, and so on.

简体中文

Features

  • Both surf and phantomjs engines are supported
  • Supports random User-Agent
  • Supports cookie caching
  • Supports HTTP/HTTPS

Usage

package main

import (
    "github.com/henrylee2cn/surfer"
    "io/ioutil"
    "log"
)

// main downloads one page with each of surfer's two engines and logs the
// response bodies.
func main() {
    // Use surf engine
    resp, err := surfer.Download(&surfer.Request{
        Url: "http://github.com/henrylee2cn/surfer",
    })
    if err != nil {
        log.Fatal(err)
    }
    b, err := ioutil.ReadAll(resp.Body)
    // Close this body now: resp is reassigned below, so it could not be
    // closed later and would leak.
    resp.Body.Close()
    log.Println(string(b), err)

    // Use phantomjs engine
    surfer.SetPhantomJsFilePath("Path to phantomjs.exe")
    resp, err = surfer.Download(&surfer.Request{
        Url:          "http://github.com/henrylee2cn",
        DownloaderID: 1, // 1 selects the PhantomJS downloader
    })
    if err != nil {
        log.Fatal(err)
    }
    b, err = ioutil.ReadAll(resp.Body)
    resp.Body.Close()
    log.Println(string(b), err)

    // Remove the temporary js files created for phantomjs.
    surfer.DestroyJsFiles()
}

Full example

License

Surfer is released under the Apache License, Version 2.0. See the LICENSE file for the full license text.

Documentation

Overview

Copyright 2015 henrylee2cn Author. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Package surfer is a high-level, highly concurrent HTTP client.

It has `surf` and `phantom` download engines, closely simulates browser behavior, supports simulated login, and so on.

Features:
  - Both surf and phantomjs engines are supported
  - Supports random User-Agent
  - Supports cookie caching
  - Supports HTTP/HTTPS

Usage:

package main

import (

"github.com/henrylee2cn/surfer"
"io/ioutil"
"log"

)

// main downloads one page with each of surfer's two engines and logs the
// response bodies.
func main() {
    // Use surf engine
    resp, err := surfer.Download(&surfer.Request{
        Url: "http://github.com/henrylee2cn/surfer",
    })
    if err != nil {
        log.Fatal(err)
    }
    b, err := ioutil.ReadAll(resp.Body)
    // Close this body now: resp is reassigned below, so it could not be
    // closed later and would leak.
    resp.Body.Close()
    log.Println(string(b), err)

    // Use phantomjs engine
    resp, err = surfer.Download(&surfer.Request{
        Url:          "http://github.com/henrylee2cn",
        DownloaderID: 1, // 1 selects the PhantomJS downloader
    })
    if err != nil {
        log.Fatal(err)
    }
    b, err = ioutil.ReadAll(resp.Body)
    resp.Body.Close()
    log.Println(string(b), err)

    // Remove the temporary js files created for phantomjs.
    surfer.DestroyJsFiles()
}

Index

Constants

View Source
// Operating system identifiers. They are used as the default OS of each
// Database entry and as the keys of DefaultOSAttributes.
const (
	// Windows identifies the Windows operating system.
	Windows int = iota
	// Linux identifies Linux-based operating systems.
	Linux
	// Macintosh identifies the Macintosh / OS X operating system.
	Macintosh
)
View Source
// Downloader identifiers and default request settings.
const (
	SurfID             = 0               // Surf downloader identifier
	PhomtomJsID        = 1               // PhantomJS downloader identifier
	DefaultMethod      = "GET"           // default request method
	DefaultDialTimeout = 2 * time.Minute // default timeout for reaching the server
	DefaultConnTimeout = 2 * time.Minute // default download timeout
	DefaultTryTimes    = 3               // default maximum number of download attempts
	DefaultRetryPause  = 2 * time.Second // default pause before retrying a download
)

constant

Variables

View Source
// Database is the "database" of user agents, keyed by browser name.
//
// Each entry is a positional UAData literal: the browser's top version, its
// default operating system, and its format strings keyed by browser version.
// The format strings contain template placeholders ({{.Name}}, {{.Ver}},
// {{.OSN}}, {{.OSV}}, {{.Coms}}) named after the fields of TemplateData.
var Database = UATable{
	"chrome": {
		"37.0.2049.0",
		Windows,
		Formats{
			"37": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
			"36": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
			"35": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
			"34": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
			"33": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
			"32": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
			"31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
			"30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
		},
	},
	"firefox": {
		"31.0",
		Windows,
		Formats{
			"31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:31.0) Gecko/20100101 Firefox/{{.Ver}}",
			"30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:30.0) Gecko/20120101 Firefox/{{.Ver}}",
			"29": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:29.0) Gecko/20120101 Firefox/{{.Ver}}",
			"28": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:28.0) Gecko/20100101 Firefox/{{.Ver}}",
			"27": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:27.0) Gecko/20130101 Firefox/{{.Ver}}",
			"26": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:26.0) Gecko/20121011 Firefox/{{.Ver}}",
			"25": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:25.0) Gecko/20100101 Firefox/{{.Ver}}",
		},
	},
	"msie": {
		"10.0",
		Windows,
		Formats{
			"10": "Mozilla/5.0 (compatible; MSIE 10.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.5.30729)",
			"9":  "Mozilla/5.0 (compatible; MSIE 9.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.0.30729)",
			"8":  "Mozilla/5.0 (compatible; MSIE 8.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/4.0; .NET CLR 3.0.04320)",
			"7":  "Mozilla/4.0 (compatible; MSIE 7.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}.NET CLR 2.0.50727)",
		},
	},
	"opera": {
		"12.14",
		Windows,
		Formats{
			"12": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.9.181 Version/{{.Ver}}",
			"11": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.7.62 Version/{{.Ver}}",
			"10": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.2.15 Version/{{.Ver}}",
			"9":  "Opera/9.00 ({{.OSN}} {{.OSV}}; U{{.Coms}})",
		},
	},
	"safari": {
		"6.0",
		Macintosh,
		Formats{
			"6": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/536.26 (KHTML, like Gecko) Version/{{.Ver}} Safari/8536.25",
			"5": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/531.2+ (KHTML, like Gecko) Version/{{.Ver}} Safari/531.2+",
			"4": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/528.16 (KHTML, like Gecko) Version/{{.Ver}} Safari/528.16",
		},
	},
	"itunes": {
		"9.1.1",
		Macintosh,
		Formats{
			"9": "iTunes/{{.Ver}}",
			"8": "iTunes/{{.Ver}}",
			"7": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.7{{.Coms}})",
			"6": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.5{{.Coms}})",
		},
	},
	"aol": {
		"9.7",
		Windows,
		Formats{
			"9": "Mozilla/5.0 (compatible; MSIE 9.0; AOL {{.Ver}}; AOLBuild 4343.19; {{.OSN}} {{.OSV}}; WOW64; Trident/5.0; FunWebProducts{{.Coms}})",
			"8": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0.50727{{.Coms}})",
			"7": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; FunWebProducts{{.Coms}})",
			"6": "Mozilla/4.0 (compatible; MSIE 6.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}{{.Coms}})",
		},
	},
	"konqueror": {
		"4.9",
		Linux,
		Formats{
			"4": "Mozilla/5.0 (compatible; Konqueror/4.0; {{.OSN}}{{.Coms}}) KHTML/4.0.3 (like Gecko)",
			"3": "Mozilla/5.0 (compatible; Konqueror/3.0-rc6; i686 {{.OSN}}; 20021127{{.Coms}})",
			"2": "Mozilla/5.0 (compatible; Konqueror/2.1.1; {{.OSN}}{{.Coms}})",
		},
	},
	"netscape": {
		"9.1.0285",
		Windows,
		Formats{
			"9": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.9.2.4{{.Coms}}) Gecko/20070321 Netscape/{{.Ver}}",
			"8": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.7.5{{.Coms}}) Gecko/20050519 Netscape/{{.Ver}}",
			"7": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.0.1{{.Coms}}) Gecko/20020921 Netscape/{{.Ver}}",
		},
	},
	"lynx": {
		"2.8.8dev.3",
		Linux,
		Formats{
			"2": "Lynx/{{.Ver}} libwww-FM/2.14 SSL-MM/1.4.1",
			"1": "Lynx (textmode)",
		},
	},
	"googlebot": {
		"2.1",
		Linux,
		Formats{
			"2": "Mozilla/5.0 (compatible; Googlebot/{{.Ver}}; +http://www.google.com/bot.html{{.Coms}})",
			"1": "Googlebot/{{.Ver}} (+http://www.google.com/bot.html{{.Coms}})",
		},
	},
	"bingbot": {
		"2.0",
		Windows,
		Formats{
			"2": "Mozilla/5.0 (compatible; bingbot/{{.Ver}}; +http://www.bing.com/bingbot.htm{{.Coms}})",
		},
	},
	"yahoobot": {
		"2.0",
		Linux,
		Formats{
			"2": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp{{.Coms}})",
		},
	},
	"default": {
		"1.0",
		Linux,
		Formats{
			"1": "{{.Name}}/{{.Ver}} ({{.OSN}} {{.OSV}}{{.Coms}})",
		},
	},
}

Database is the "database" of user agents.

View Source
// DefaultOSAttributes maps each operating system identifier to the OS name,
// OS version and comments used for it by default.
var DefaultOSAttributes = map[int]OSAttributes{
	Windows: {
		OSName:    "Windows NT",
		OSVersion: "6.3",
		Comments:  []string{"x64"},
	},
	Linux: {
		OSName:    "Linux",
		OSVersion: "3.16.1",
		Comments:  []string{"x64"},
	},
	Macintosh: {
		OSName:    "Intel Mac OS X",
		OSVersion: "10_6_8",
		Comments:  []string{},
	},
}

DefaultOSAttributes stores default OS attributes.

View Source
// UserAgents holds User-Agent strings grouped under string keys. It starts
// out as an empty, non-nil map.
var UserAgents = make(map[string][]string)

UserAgents holds all User-Agent strings.

Functions

func AutoToUTF8

func AutoToUTF8(resp *http.Response) error

AutoToUTF8 采用surf内核下载时,可以尝试自动转码为utf8 采用phantomjs内核时,无需转码(已是utf8)

func BodyBytes

func BodyBytes(resp *http.Response) ([]byte, error)

BodyBytes 读取完整响应流正文

func CreateDefault

func CreateDefault(browser string) string

CreateDefault returns a user agent string using default values.

func CreateReal

func CreateReal() string

CreateReal generates and returns a complete user agent string.

func CreateVersion

func CreateVersion(browser, version string) string

CreateVersion generates and returns a complete user agent string for a specific browser version.

func DestroyJsFiles

func DestroyJsFiles()

DestroyJsFiles 销毁Phantomjs的js临时文件

func Download

func Download(req *Request) (resp *http.Response, err error)

Download 实现surfer下载器接口

func Format

func Format(bname, bver string) string

Format returns the format string for the given browser name and version.

When a format can't be found for a version, the first format string for the browser is returned. When a format can't be found for the browser the default format is returned.

func GetWDPath

func GetWDPath() string

GetWDPath gets the work directory path.

func IsDirExists

func IsDirExists(path string) bool

IsDirExists reports whether path is a directory.

func IsFileExists

func IsFileExists(path string) bool

IsFileExists reports whether path is a file.

func SetPhantomJsFilePath

func SetPhantomJsFilePath(filePath string)

指定phantomjs可执行文件的位置

func TopVersion

func TopVersion(bname string) string

TopVersion returns the most recent version for the given browser name.

func UrlEncode

func UrlEncode(urlStr string) (*url.URL, error)

UrlEncode 返回编码后的url.URL指针、及解析错误

func WalkDir

func WalkDir(targpath string, suffixes ...string) (dirlist []string)

WalkDir 遍历目录,可指定后缀

Types

type Bytes

// Bytes is a request body given as raw bytes, without a content type.
type Bytes []byte

Bytes is the raw-bytes type of body content, without a content type.

func (Bytes) SetBody

func (b Bytes) SetBody(r *Request) error

SetBody sets request body

type Content

// Content is a request body given as raw bytes together with a content type.
type Content struct {
	ContentType string // content type that accompanies Bytes
	Bytes       []byte // raw body content
}

Content is the raw-bytes type of body content, with a content type.

func (*Content) SetBody

func (c *Content) SetBody(r *Request) error

SetBody sets request body

// Cookie is used to pass a cookie to phantomjs. Its fields are serialized
// under lowercase JSON keys.
type Cookie struct {
	Name   string `json:"name"`
	Value  string `json:"value"`
	Domain string `json:"domain"`
	Path   string `json:"path"`
}

给phantomjs传输cookie用

type DnsCache

// DnsCache is a DNS cache. Entries are managed through its Reg, Query and
// Del methods.
type DnsCache struct {
	// contains filtered or unexported fields
}

DnsCache DNS cache

func (*DnsCache) Del

func (d *DnsCache) Del(addr string)

Del deletes ipPort from DNS cache.

func (*DnsCache) Query

func (d *DnsCache) Query(addr string) (string, bool)

Query queries ipPort from DNS cache.

func (*DnsCache) Reg

func (d *DnsCache) Reg(addr, ipPort string)

Reg registers ipPort to DNS cache.

type File

// File is a file posted as part of a form.
type File struct {
	Filename string // file name
	Bytes    []byte // file bytes
}

File post form file

type Form

// Form is a form request body made of plain values and files. It implements
// the body interface.
type Form struct {
	// Values maps a field name to its values.
	Values map[string][]string
	// Files maps a field name to its files.
	Files map[string][]File
}

Form implements the body interface.

func (Form) SetBody

func (f Form) SetBody(r *Request) error

SetBody sets request body

type Formats

// Formats is a collection of UA format strings. The key is the browser
// version and the value is the format string for that version.
type Formats map[string]string

Formats is a collection of UA format strings. key is the browser version. value is the browser info.

type JSONObj

// JSONObj is a JSON-typed request body wrapping Data.
type JSONObj struct{ Data interface{} }

JSONObj JSON type of body content

func (*JSONObj) SetBody

func (obj *JSONObj) SetBody(r *Request) error

SetBody sets request body

type OSAttributes

// OSAttributes stores the attributes of an operating system that go into a
// user agent string.
type OSAttributes struct {
	// OSName is the operating system name.
	OSName string
	// OSVersion is the operating system version.
	OSVersion string
	// Comments are additional comments to add to a user agent string.
	Comments []string
}

OSAttributes stores OS attributes.

type Phantom

// Phantom is a Phantomjs-based downloader that complements surfer. It is much
// slower than surfer but, because it simulates a browser, it is better at
// getting past anti-crawling defenses. It supports UserAgent, TryTimes,
// RetryPause and custom js.
type Phantom struct {
	PhantomjsFile string // full Phantomjs file name
	TempJsDir     string // directory where temporary js files are stored

	CookieJar *cookiejar.Jar
	// contains filtered or unexported fields
}

Phantom 基于Phantomjs的下载器实现,作为surfer的补充 效率较surfer会慢很多,但是因为模拟浏览器,破防性更好 支持UserAgent/TryTimes/RetryPause/自定义js

func (*Phantom) DestroyJsFiles

func (phantom *Phantom) DestroyJsFiles()

DestroyJsFiles 销毁js临时文件

func (*Phantom) Download

func (phantom *Phantom) Download(req *Request) (resp *http.Response, err error)

Download 实现surfer下载器接口

type Request

// Request contains the necessary prerequisite information for a download.
type Request struct {
	// url (required)
	Url string

	// GET POST HEAD (defaults to GET)
	Method string
	// http header
	Header http.Header
	// whether to use cookies; set through the Spider's EnableCookie
	EnableCookie bool
	// request body interface
	Body body

	// dial tcp: i/o timeout
	DialTimeout time.Duration
	// WSARecv tcp: i/o timeout
	ConnTimeout time.Duration
	// the max times of download
	TryTimes int
	// how long pause when retry
	RetryPause time.Duration
	// max redirect times
	// when RedirectTimes equal 0, redirect times is ∞
	// when RedirectTimes less than 0, redirect times is 0
	RedirectTimes int
	// the download ProxyHost
	Proxy string

	// ID of the downloader to use
	// 0 is the Surf high-concurrency downloader, with a full set of control features
	// 1 is the PhantomJS downloader: strong at getting past anti-crawling defenses, but slow and low-concurrency
	DownloaderID int
	// contains filtered or unexported fields
}

Request contains the necessary prerequisite information.

func (*Request) ReadBody

func (r *Request) ReadBody() ([]byte, error)

ReadBody returns body bytes

type RespBody

// RespBody wraps Response.Body. Both embedded interfaces declare Read;
// RespBody defines its own Read method.
type RespBody struct {
	io.ReadCloser
	io.Reader
}

RespBody 封装Response.Body

func (*RespBody) Read

func (b *RespBody) Read(p []byte) (int, error)

Read 实现Reader接口

type Response

// Response is used to parse the response content produced by Phantomjs.
type Response struct {
	Cookies []string
	Body    string
	Error   string
	Header  []struct {
		Name  string
		Value string
	}
}

Response 用于解析Phantomjs的响应内容

type Surf

// Surf is the default Download implementation.
type Surf struct {
	CookieJar *cookiejar.Jar
}

Surf is the default Download implementation.

func (*Surf) Download

func (surf *Surf) Download(param *Request) (*http.Response, error)

Download 实现surfer下载器接口

type Surfer

// Surfer represents the core of an HTTP web browser for crawlers.
type Surfer interface {
	// GET @param url string, header http.Header, cookies []*http.Cookie
	// HEAD @param url string, header http.Header, cookies []*http.Cookie
	// POST PostForm @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
	// POST-M PostMultipart @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
	Download(*Request) (resp *http.Response, err error)
}

Surfer represents the core of an HTTP web browser for crawlers.

func New

func New(jar ...*cookiejar.Jar) Surfer

New 创建一个Surf下载器

func NewPhantom

func NewPhantom(phantomjsFile, tempJsDir string, jar ...*cookiejar.Jar) Surfer

NewPhantom 创建一个Phantomjs下载器

type TemplateData

// TemplateData is the structure for template data. Its field names match the
// placeholders used in the Database format strings.
type TemplateData struct {
	Name string
	Ver  string
	OSN  string
	OSV  string
	Coms string
}

TemplateData structure for template data.

type UAData

// UAData stores information on a browser user agent.
type UAData struct {
	TopVersion string  // top version of the browser
	DefaultOS  int     // default operating system (Windows, Linux or Macintosh)
	Formats    Formats // format strings keyed by browser version
}

UAData stores information on a browser user agent.

type UATable

// UATable is a collection of UAData values. The key is the name of the
// browser.
type UATable map[string]UAData

UATable is a collection of UAData values. key is the name of the browser.

type XMLObj

// XMLObj is an XML-typed request body wrapping Data.
type XMLObj struct{ Data interface{} }

XMLObj XML type of body content

func (*XMLObj) SetBody

func (obj *XMLObj) SetBody(r *Request) error

SetBody sets request body

Directories

Path Synopsis
example

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL