surfer: github.com/henrylee2cn/surfer Index | Files | Directories

package surfer

import "github.com/henrylee2cn/surfer"

Copyright 2015 henrylee2cn Author. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Package surfer is a high-level, concurrent HTTP client.

It provides the `surf` and `phantom` download engines, closely simulates browser behavior, supports simulated login, and so on.

Features:
- Both surf and phantomjs engines are supported
- Supports random User-Agent
- Supports cookie caching
- Supports http/https

Usage:

package main

import (

"github.com/henrylee2cn/surfer"
"io/ioutil"
"log"

)

func main() {

// Use surf engine
resp, err := surfer.Download(&surfer.Request{
    Url: "http://github.com/henrylee2cn/surfer",
})
if err != nil {
    log.Fatal(err)
}
b, err := ioutil.ReadAll(resp.Body)
log.Println(string(b), err)

// Use phantomjs engine
resp, err = surfer.Download(&surfer.Request{
    Url:          "http://github.com/henrylee2cn",
    DownloaderID: 1,
})
if err != nil {
    log.Fatal(err)
}
b, err = ioutil.ReadAll(resp.Body)
log.Println(string(b), err)
resp.Body.Close()
surfer.DestroyJsFiles()

}

Index

Package Files

agent.go agent_linux.go body.go phantom.go request.go surf.go surfer.go util.go

Constants

const (
    // Windows operating system.
    Windows int = iota
    // Linux based operating system.
    Linux
    // Macintosh /OS X operating system.
    Macintosh
)
const (
    SurfID             = 0               // Surf下载器标识符
    PhomtomJsID        = 1               // PhomtomJs下载器标识符
    DefaultMethod      = "GET"           // 默认请求方法
    DefaultDialTimeout = 2 * time.Minute // 默认请求服务器超时
    DefaultConnTimeout = 2 * time.Minute // 默认下载超时
    DefaultTryTimes    = 3               // 默认最大下载次数
    DefaultRetryPause  = 2 * time.Second // 默认重新下载前停顿时长
)

constant

Variables

var Database = UATable{
    "chrome": {
        "37.0.2049.0",
        Windows,
        Formats{
            "37": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
            "36": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
            "35": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
            "34": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
            "33": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
            "32": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
            "31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
            "30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
        },
    },
    "firefox": {
        "31.0",
        Windows,
        Formats{
            "31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:31.0) Gecko/20100101 Firefox/{{.Ver}}",
            "30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:30.0) Gecko/20120101 Firefox/{{.Ver}}",
            "29": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:29.0) Gecko/20120101 Firefox/{{.Ver}}",
            "28": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:28.0) Gecko/20100101 Firefox/{{.Ver}}",
            "27": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:27.0) Gecko/20130101 Firefox/{{.Ver}}",
            "26": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:26.0) Gecko/20121011 Firefox/{{.Ver}}",
            "25": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:25.0) Gecko/20100101 Firefox/{{.Ver}}",
        },
    },
    "msie": {
        "10.0",
        Windows,
        Formats{
            "10": "Mozilla/5.0 (compatible; MSIE 10.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.5.30729)",
            "9":  "Mozilla/5.0 (compatible; MSIE 9.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.0.30729)",
            "8":  "Mozilla/5.0 (compatible; MSIE 8.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/4.0; .NET CLR 3.0.04320)",
            "7":  "Mozilla/4.0 (compatible; MSIE 7.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}.NET CLR 2.0.50727)",
        },
    },
    "opera": {
        "12.14",
        Windows,
        Formats{
            "12": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.9.181 Version/{{.Ver}}",
            "11": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.7.62 Version/{{.Ver}}",
            "10": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.2.15 Version/{{.Ver}}",
            "9":  "Opera/9.00 ({{.OSN}} {{.OSV}}; U{{.Coms}})",
        },
    },
    "safari": {
        "6.0",
        Macintosh,
        Formats{
            "6": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/536.26 (KHTML, like Gecko) Version/{{.Ver}} Safari/8536.25",
            "5": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/531.2+ (KHTML, like Gecko) Version/{{.Ver}} Safari/531.2+",
            "4": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/528.16 (KHTML, like Gecko) Version/{{.Ver}} Safari/528.16",
        },
    },
    "itunes": {
        "9.1.1",
        Macintosh,
        Formats{
            "9": "iTunes/{{.Ver}}",
            "8": "iTunes/{{.Ver}}",
            "7": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.7{{.Coms}})",
            "6": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.5{{.Coms}})",
        },
    },
    "aol": {
        "9.7",
        Windows,
        Formats{
            "9": "" /* 132 byte string literal not displayed */,
            "8": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0.50727{{.Coms}})",
            "7": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; FunWebProducts{{.Coms}})",
            "6": "Mozilla/4.0 (compatible; MSIE 6.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}{{.Coms}})",
        },
    },
    "konqueror": {
        "4.9",
        Linux,
        Formats{
            "4": "Mozilla/5.0 (compatible; Konqueror/4.0; {{.OSN}}{{.Coms}}) KHTML/4.0.3 (like Gecko)",
            "3": "Mozilla/5.0 (compatible; Konqueror/3.0-rc6; i686 {{.OSN}}; 20021127{{.Coms}})",
            "2": "Mozilla/5.0 (compatible; Konqueror/2.1.1; {{.OSN}}{{.Coms}})",
        },
    },
    "netscape": {
        "9.1.0285",
        Windows,
        Formats{
            "9": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.9.2.4{{.Coms}}) Gecko/20070321 Netscape/{{.Ver}}",
            "8": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.7.5{{.Coms}}) Gecko/20050519 Netscape/{{.Ver}}",
            "7": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.0.1{{.Coms}}) Gecko/20020921 Netscape/{{.Ver}}",
        },
    },
    "lynx": {
        "2.8.8dev.3",
        Linux,
        Formats{
            "2": "Lynx/{{.Ver}} libwww-FM/2.14 SSL-MM/1.4.1",
            "1": "Lynx (textmode)",
        },
    },
    "googlebot": {
        "2.1",
        Linux,
        Formats{
            "2": "Mozilla/5.0 (compatible; Googlebot/{{.Ver}}; +http://www.google.com/bot.html{{.Coms}})",
            "1": "Googlebot/{{.Ver}} (+http://www.google.com/bot.html{{.Coms}})",
        },
    },
    "bingbot": {
        "2.0",
        Windows,
        Formats{
            "2": "Mozilla/5.0 (compatible; bingbot/{{.Ver}}; +http://www.bing.com/bingbot.htm{{.Coms}})",
        },
    },
    "yahoobot": {
        "2.0",
        Linux,
        Formats{
            "2": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp{{.Coms}})",
        },
    },
    "default": {
        "1.0",
        Linux,
        Formats{
            "1": "{{.Name}}/{{.Ver}} ({{.OSN}} {{.OSV}}{{.Coms}})",
        },
    },
}

Database is the "database" of user agents.

var DefaultOSAttributes = map[int]OSAttributes{
    Windows:   {"Windows NT", "6.3", []string{"x64"}},
    Linux:     {"Linux", "3.16.1", []string{"x64"}},
    Macintosh: {"Intel Mac OS X", "10_6_8", []string{}},
}

DefaultOSAttributes stores default OS attributes.

var UserAgents = map[string][]string{}

UserAgents holds all User-Agent strings.

func AutoToUTF8 Uses

func AutoToUTF8(resp *http.Response) error

AutoToUTF8 采用surf内核下载时,可以尝试自动转码为utf8 采用phantomjs内核时,无需转码(已是utf8)

func BodyBytes Uses

func BodyBytes(resp *http.Response) ([]byte, error)

BodyBytes 读取完整响应流正文

func CreateDefault Uses

func CreateDefault(browser string) string

CreateDefault returns a user agent string using default values.

func CreateReal Uses

func CreateReal() string

CreateReal generates and returns a complete user agent string.

func CreateVersion Uses

func CreateVersion(browser, version string) string

CreateVersion generates and returns a complete user agent string for a specific browser version.

func DestroyJsFiles Uses

func DestroyJsFiles()

DestroyJsFiles 销毁Phantomjs的js临时文件

func Download Uses

func Download(req *Request) (resp *http.Response, err error)

Download 实现surfer下载器接口

func Format Uses

func Format(bname, bver string) string

Format returns the format string for the given browser name and version.

When a format can't be found for a version, the first format string for the browser is returned. When a format can't be found for the browser the default format is returned.

func GetWDPath Uses

func GetWDPath() string

GetWDPath gets the work directory path.

func IsDirExists Uses

func IsDirExists(path string) bool

IsDirExists reports whether path is a directory.

func IsFileExists Uses

func IsFileExists(path string) bool

IsFileExists reports whether path is a file.

func SetPhantomJsFilePath Uses

func SetPhantomJsFilePath(filePath string)

指定phantomjs可执行文件的位置

func TopVersion Uses

func TopVersion(bname string) string

TopVersion returns the most recent version for the given browser name.

func UrlEncode Uses

func UrlEncode(urlStr string) (*url.URL, error)

UrlEncode 返回编码后的url.URL指针、及解析错误

func WalkDir Uses

func WalkDir(targpath string, suffixes ...string) (dirlist []string)

WalkDir 遍历目录,可指定后缀

type Bytes Uses

type Bytes []byte

Bytes is the bytes type of body content, without a content type.

func (Bytes) SetBody Uses

func (b Bytes) SetBody(r *Request) error

SetBody sets request body

type Content Uses

type Content struct {
    ContentType string
    Bytes       []byte
}

Content is the bytes type of body content, with a content type.

func (*Content) SetBody Uses

func (c *Content) SetBody(r *Request) error

SetBody sets request body

type Cookie struct {
    Name   string `json:"name"`
    Value  string `json:"value"`
    Domain string `json:"domain"`
    Path   string `json:"path"`
}

给phantomjs传输cookie用

type DnsCache Uses

type DnsCache struct {
    // contains filtered or unexported fields
}

DnsCache is a DNS cache.

func (*DnsCache) Del Uses

func (d *DnsCache) Del(addr string)

Del deletes ipPort from DNS cache.

func (*DnsCache) Query Uses

func (d *DnsCache) Query(addr string) (string, bool)

Query queries ipPort from DNS cache.

func (*DnsCache) Reg Uses

func (d *DnsCache) Reg(addr, ipPort string)

Reg registers ipPort to DNS cache.

type File Uses

type File struct {
    Filename string
    Bytes    []byte
}

File is a file posted in a form.

type Form Uses

type Form struct {
    // Values [field name]-[]value
    Values map[string][]string
    // Files [field name]-[]File
    Files map[string][]File
}

Form implements the body interface.

func (Form) SetBody Uses

func (f Form) SetBody(r *Request) error

SetBody sets request body

type Formats Uses

type Formats map[string]string

Formats is a collection of UA format strings. key is the browser version. value is the browser info.

type JSONObj Uses

type JSONObj struct{ Data interface{} }

JSONObj is the JSON type of body content.

func (*JSONObj) SetBody Uses

func (obj *JSONObj) SetBody(r *Request) error

SetBody sets request body

type OSAttributes Uses

type OSAttributes struct {
    // OSName is the operating system name.
    OSName string
    // OSVersion is the operating system version.
    OSVersion string
    // Comments are additional comments to add to a user agent string.
    Comments []string
}

OSAttributes stores OS attributes.

type Phantom Uses

type Phantom struct {
    PhantomjsFile string //Phantomjs完整文件名
    TempJsDir     string //临时js存放目录

    CookieJar *cookiejar.Jar
    // contains filtered or unexported fields
}

Phantom 基于Phantomjs的下载器实现,作为surfer的补充 效率较surfer会慢很多,但是因为模拟浏览器,破防性更好 支持UserAgent/TryTimes/RetryPause/自定义js

func (*Phantom) DestroyJsFiles Uses

func (phantom *Phantom) DestroyJsFiles()

DestroyJsFiles 销毁js临时文件

func (*Phantom) Download Uses

func (phantom *Phantom) Download(req *Request) (resp *http.Response, err error)

Download 实现surfer下载器接口

type Request Uses

type Request struct {
    // url (必须填写)
    Url string

    // GET POST HEAD (默认为GET)
    Method string
    // http header
    Header http.Header
    // 是否使用cookies,在Spider的EnableCookie设置
    EnableCookie bool
    // request body interface
    Body body

    // dial tcp: i/o timeout
    DialTimeout time.Duration
    // WSARecv tcp: i/o timeout
    ConnTimeout time.Duration
    // the max times of download
    TryTimes int
    // how long pause when retry
    RetryPause time.Duration
    // max redirect times
    // when RedirectTimes equal 0, redirect times is ∞
    // when RedirectTimes less than 0, redirect times is 0
    RedirectTimes int
    // the download ProxyHost
    Proxy string

    // 指定下载器ID
    // 0为Surf高并发下载器,各种控制功能齐全
    // 1为PhantomJS下载器,特点破防力强,速度慢,低并发
    DownloaderID int
    // contains filtered or unexported fields
}

Request contains the necessary prerequisite information.

func (*Request) ReadBody Uses

func (r *Request) ReadBody() ([]byte, error)

ReadBody returns body bytes

type RespBody Uses

type RespBody struct {
    io.ReadCloser
    io.Reader
}

RespBody 封装Response.Body

func (*RespBody) Read Uses

func (b *RespBody) Read(p []byte) (int, error)

Read 实现Reader接口

type Response Uses

type Response struct {
    Cookies []string
    Body    string
    Error   string
    Header  []struct {
        Name  string
        Value string
    }
}

Response 用于解析Phantomjs的响应内容

type Surf Uses

type Surf struct {
    CookieJar *cookiejar.Jar
}

Surf is the default Download implementation.

func (*Surf) Download Uses

func (surf *Surf) Download(param *Request) (*http.Response, error)

Download 实现surfer下载器接口

type Surfer Uses

type Surfer interface {
    // GET @param url string, header http.Header, cookies []*http.Cookie
    // HEAD @param url string, header http.Header, cookies []*http.Cookie
    // POST PostForm @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
    // POST-M PostMultipart @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
    Download(*Request) (resp *http.Response, err error)
}

Surfer represents the core of an HTTP web browser for crawlers.

func New Uses

func New(jar ...*cookiejar.Jar) Surfer

New 创建一个Surf下载器

func NewPhantom Uses

func NewPhantom(phantomjsFile, tempJsDir string, jar ...*cookiejar.Jar) Surfer

NewPhantom 创建一个Phantomjs下载器

type TemplateData Uses

type TemplateData struct {
    Name string
    Ver  string
    OSN  string
    OSV  string
    Coms string
}

TemplateData structure for template data.

type UAData Uses

type UAData struct {
    TopVersion string
    DefaultOS  int
    Formats    Formats
}

UAData stores information on a browser user agent.

type UATable Uses

type UATable map[string]UAData

UATable is a collection of UAData values. key is the name of the browser.

type XMLObj Uses

type XMLObj struct{ Data interface{} }

XMLObj is the XML type of body content.

func (*XMLObj) SetBody Uses

func (obj *XMLObj) SetBody(r *Request) error

SetBody sets request body

Directories

Path Synopsis
example/example001
example/example002
example/example003
example/example004
example/example005

Package surfer imports 29 packages (graph) and is imported by 13 packages. Updated 2019-10-21. Refresh now. Tools for package owners.