http

package
v1.2.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 18, 2022 License: MulanPSL-2.0 Imports: 35 Imported by: 6

Documentation

Index

Constants

View Source
const (
	LANGEMPTY = "" /* 391-byte string literal not displayed */
)

Variables

View Source
var (
	B2S                 = regexp.MustCompile("\\<[\\S\\s]+?\\>")
	DelStyle            = regexp.MustCompile("\\<style[\\S\\s]+?\\</style\\>")
	DelScript           = regexp.MustCompile("\\<script[\\S\\s]+?\\</script\\>")
	DelHtmlTag          = regexp.MustCompile("\\<[\\S\\s]+?\\>")
	DelSpaceContinuesly = regexp.MustCompile("\\s{2,}")
	FileTp              = regexp.MustCompile(`\.[a-zA-Z0-9]+`)
)
View Source
var (
	UA = []string{
		"Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0",
		"Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0",
		"Mozilla/5.0 (Windows; U; Windows NT 5.1 ; x64; en-US; rv:1.9.1b2pre) Gecko/20081026 Firefox/3.1b2pre",
		"Opera/10.60 (Windows NT 5.1; U; zh-cn) Presto/2.6.30 Version/10.60", "Opera/8.01 (J2ME/MIDP; Opera Mini/2.0.4062; en; U; ssr)",
		"Mozilla/5.0 (Windows; U; Windows NT 5.1; ; rv:1.9.0.14) Gecko/2009082707 Firefox/3.0.14",
		"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
		"Mozilla/5.0 (Windows; U; Windows NT 6.0; fr; rv:1.9.2.4) Gecko/20100523 Firefox/3.6.4 ( .NET CLR 3.5.30729)",
		"Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/528.16 (KHTML, like Gecko) Version/4.0 Safari/528.16",
		"Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
	}
	// UA = random.choice(user_agent)
	DeafultHeaders = map[string]string{
		"Accept":                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
		"User-Agent":                UA[0],
		"Upgrade-Insecure-Requests": "1",
		"Connection":                "keep-alive",
		"Cache-Control":             "max-age=0",
		"Accept-Encoding":           "gzip, deflate, sdch",
		"Accept-Language":           "zh-CN,zh;q=0.8",
		"Referer":                   "http://www.baidu.com/link?url=www.so.com&url=www.soso.com&&url=www.sogou.com",
		"Cookie":                    "PHPSESSID=gljsd5c3ei5n813roo4878q203",
	}
)
View Source
var (
	MAX_POOL   = 100
	RandomMode = 1
	FlowMode   = 0
	HostParse  = regexp.MustCompile(`Host: .+`)
	LogLevl    = 0
)
View Source
var (
	Red       = color.New(color.FgRed).SprintFunc()
	Green     = color.New(color.FgGreen).SprintFunc()
	GreenBack = color.New(color.BgGreen, color.Bold).SprintFunc()
	BlueBack  = color.New(color.BgBlue, color.FgHiWhite, color.Bold).SprintFunc()
	Yello     = color.New(color.FgYellow).SprintFunc()
	Blue      = color.New(color.FgBlue).SprintFunc()
	Magenta   = color.New(color.FgMagenta).SprintFunc()
	Bold      = color.New(color.Bold).SprintFunc()
	Underline = color.New(color.Underline).SprintFunc()
	Hblue     = color.New(color.FgHiBlue).SprintFunc()
	Hgreen    = color.New(color.FgHiGreen).SprintFunc()
	Hyello    = color.New(color.FgHiYellow).SprintFunc()
)
View Source
var (
	NW = regexp.MustCompile(`\W`)

	DateMatcher = map[*regexp.Regexp]string{
		regexp.MustCompile(`[1-2]\d{3}年\d{1,2}月\d{1,2}日 \d{1,2}\:\d{1,2}\:\d{1,2}`): "2006年1月2日 15:04:05",
		regexp.MustCompile(`[1-2]\d{3}年[0-1]\d月[0-3]\d日 [0-2]\d\:[0-5]\d\:\d{2}`):   "2006年1月2日 15:04:05",
		regexp.MustCompile(`[1-2]\d{3}-\d{1,2}-\d{1,2} \d{1,2}\:\d{1,2}\:\d{1,2}`):  "2006-1-2 15:04:05",
		regexp.MustCompile(`[1-2]\d{3}/\d{1,2}/\d{1,2} \d{1,2}\:\d{1,2}\:\d{1,2}`):  "2006/1/2 15:04:05",
		regexp.MustCompile(`[1-2]\d{3}/\d{1,2}/\d{1,2} \d{1,2}\:\d{1,2}`):           "2006/1/2 15:04",
		regexp.MustCompile(`\d{2}\/\d{2}\/[1-2]\d{3} \- \d{2}\:\d{2}`):              "02/01/2006 - 15:04",
		regexp.MustCompile(`[1-2]\d{3}年\d{1,2}月\d{1,2}日 \d{1,2}\:\d{1,2}`):          "2006年1月2日 15:04",
		regexp.MustCompile(`[1-2]\d{3}-\d{1,2}-\d{1,2} \d{1,2}\:\d{1,2}`):           "2006-1-2 15:04",
		regexp.MustCompile(`[1-2]\d{3}年\d{1,2}月\d{1,2}日`):                           "2006年1月2日",
		regexp.MustCompile(`[1-2]\d{3}-\d{1,2}-\d{1,2}`):                            "2006-1-2",
		regexp.MustCompile(`[1-2]\d{3}/\d{1,2}/\d{1,2}`):                            "2006/1/2",
		regexp.MustCompile(`\w{1,15}, \d{1,2} \w{1,15} [1-2]\d{3}`):                 "Mon, 02 Jan 2006",
		regexp.MustCompile(`\d{1,2} \w{1,15} [1-2]\d{3}`):                           "02 Jan 2006",
		regexp.MustCompile(`[1-2]\d{3}-[0-1]\d-\d{2}T\d{2}\:\d{2}\:\d{2}Z`):         "2006-01-02T10:27:21Z",
		regexp.MustCompile(`\d{2}\.[0-1]\d\.[1-2]\d{3}`):                            "02.01.2006",
	}
)
View Source
var (
	MsgCacheChan = make(chan string)
	CacheStatus  = false
)
View Source
var (
	DefaultProxyDialer NewDialler
	DefaultProxyPool   ProxyDiallerPool
)
View Source
var (
	STOP         = "[STOP]"
	DefaultLoger = &defaultloger{}
)
View Source
var (
	ConfigDocument = `` /* 798-byte string literal not displayed */

)
View Source
var (
	DEFAULT_BREAKPOINT_FILE = "default-skip.txt"
)
View Source
var (
	KeyEx = regexp.MustCompile(`\{\w+\}`)
)

Functions

func AndSimHash

func AndSimHash(a, b string) float32

func DecodeHTMLBody

func DecodeHTMLBody(body io.Reader, charset string) (io.Reader, error)

DecodeHTMLBody returns a decoding reader for the HTML body using the specified `charset`. If `charset` is empty, DecodeHTMLBody tries to guess the encoding from the content.

func DefaultProxyMode

func DefaultProxyMode(mode int)

mode: 0 is flow, 1 is random (the values of FlowMode and RandomMode).

func Failed

func Failed(args ...interface{})

func FindMostMayDate

func FindMostMayDate(title, raw string) (t time.Time)

func GetMD5

func GetMD5(c []byte) string

func Info

func Info(args ...interface{})

func L

func L(args ...interface{})

func ProgressLog

func ProgressLog(now int, all int, msg string)

func SetProxyGenerater

func SetProxyGenerater(proxyCreator NewDialler)

func ShowDemo

func ShowDemo()

func Skip

func Skip(u string, us ...string) bool

func Socks5Dialer

func Socks5Dialer(addr string) proxy.Dialer

func Success

func Success(args ...interface{})

func UrlJoin

func UrlJoin(f ...string) string

Types

type ArrayFilter

type ArrayFilter []string

func (ArrayFilter) Add

func (array ArrayFilter) Add(o string) ArrayFilter

func (ArrayFilter) Every

func (array ArrayFilter) Every(handler func(no int, every string) string) ArrayFilter

func (ArrayFilter) Filter

func (array ArrayFilter) Filter(reStrOrFunc_str_bool interface{}) (newArray ArrayFilter)

func (ArrayFilter) FilterFunc

func (array ArrayFilter) FilterFunc(reStrOrFunc_str_bool func(no int, every string) bool) (newArray ArrayFilter)

func (ArrayFilter) In

func (array ArrayFilter) In(o string) int

func (ArrayFilter) Sort

func (array ArrayFilter) Sort() ArrayFilter

type Article

type Article struct {
	Text   string    `json:"text"`
	Title  string    `json:"title"`
	Date   time.Time `json:"date"`
	Author string    `json:"author"`
	Link   string    `json:"link"`
}

func NewArticle

func NewArticle(doc *goquery.Document) (article *Article)

func (*Article) WaitToFile

func (article *Article) WaitToFile()

type Async

type Async struct {
	Done []string

	NeedRestart bool
	// contains filtered or unexported fields
}

func (*Async) Async

func (async *Async) Async(url string, proxy ...string) *Async

func (*Async) Each

func (async *Async) Each(do func(out *AsyncOut)) *Async

func (*Async) EndAsync

func (async *Async) EndAsync() *Session

func (*Async) LoadCache

func (async *Async) LoadCache(name string) *Async

func (*Async) Restart

func (async *Async) Restart() *Async

func (*Async) State

func (async *Async) State() *Async

type AsyncOut

type AsyncOut struct {
	Url string
	// Err error
	End bool
	Res *WithOper
}

type ConsoleBar

type ConsoleBar struct {
	All       int64
	Now       int64
	Width     int
	NowWidth  int
	Interval  int
	Last      time.Time
	LastMsg   string
	LastWrite string
	LastBar   string
}

func NewConsoleBar

func NewConsoleBar(all int64) (bar *ConsoleBar, err error)

func (*ConsoleBar) Add

func (pro *ConsoleBar) Add(i int) int

func (*ConsoleBar) Error

func (pro *ConsoleBar) Error(err error)

func (*ConsoleBar) Finished

func (pro *ConsoleBar) Finished()

func (*ConsoleBar) GetPercent

func (pro *ConsoleBar) GetPercent() float32

func (*ConsoleBar) Increment

func (pro *ConsoleBar) Increment() int

func (*ConsoleBar) Println

func (pro *ConsoleBar) Println(args ...interface{})

func (*ConsoleBar) Reset

func (pro *ConsoleBar) Reset()

func (*ConsoleBar) SetAll

func (pro *ConsoleBar) SetAll(all int)

func (*ConsoleBar) SetMsg

func (pro *ConsoleBar) SetMsg(msg string)

func (*ConsoleBar) Update

func (pro *ConsoleBar) Update()

func (*ConsoleBar) Write

func (pro *ConsoleBar) Write(args ...interface{})

type Dict

type Dict map[string]string

type DictBool

type DictBool map[string]bool

func (DictBool) Keys

func (dict DictBool) Keys() (newArray ArrayFilter)

type EnumeConfig

type EnumeConfig struct {
	Domain   string
	Proxy    string
	Proxy2   string
	Proxy3   string
	Output   string
	Names    []string
	IdFile   string
	StartId  int
	EndId    int
	Template map[string]string
}

func ReadConf

func ReadConf(f string) (config *EnumeConfig)

func (*EnumeConfig) Marshal

func (conf *EnumeConfig) Marshal() string

type FilterOption

type FilterOption struct {
	Rank     int
	Distance float32
	Proxy    interface{}
}

type G

type G map[string]interface{}

type Gfunc

type Gfunc map[string]NextValue
type Links [][]*UrlSim

func (Links) AsString

func (link Links) AsString(rank int) (a [][]string)

type Loger

type Loger interface {
	Println(args ...interface{})
	Error(error)
	SetMsg(msg string)
}

type NewDialler

type NewDialler func(proxy interface{}) proxy.Dialer

type NextValue

type NextValue func(v Value) Value

type Payloader

type Payloader string

func (Payloader) AsFile

func (pay Payloader) AsFile(howHandle func(f *os.File, err error))

AsFile treats the payload as a file to open; the opened file (or the error) is passed to howHandle.

func (Payloader) Format

func (pay Payloader) Format(args ...interface{}) Payloader

Format formats the payload by filling {} placeholders with args.

func (Payloader) FormatMap

func (pay Payloader) FormatMap(args map[string]interface{}) Payloader

FormatMap formats the payload by map, filling {key} placeholders.

func (Payloader) Lines

func (pay Payloader) Lines() (a ArrayFilter)

func (Payloader) Render

func (pay Payloader) Render(name string, v Value) string

func (Payloader) String

func (pay Payloader) String() string

type ProxyDiallerPool

type ProxyDiallerPool interface {
	GetDialer() proxy.Dialer
	Add(url string)
	SetMode(int)
}

type Result

type Result struct {
	Url string
	Res interface{}
}

type Resulter

type Resulter struct {
	Oks  []string
	Errs []error
}

func (*Resulter) Json

func (r *Resulter) Json() string

func (*Resulter) PutErr

func (r *Resulter) PutErr(err error)

func (*Resulter) PutOk

func (r *Resulter) PutOk(m string)

type RunnerPool

type RunnerPool struct {
	Thread int

	Handle func(arg string, tryTime int) interface{}
	After  func(res Result, loger Loger)
	ErrDo  func(error, int, Result, Loger)
	// Loger  Loger
	RetryTime int
	LogLevl   int
	Bar       *ConsoleBar
	// contains filtered or unexported fields
}

func NewAwaitPool

func NewAwaitPool(thread int) (pool *RunnerPool)

func (*RunnerPool) Loop

func (pool *RunnerPool) Loop(args []string, showBar bool)

func (*RunnerPool) LoopByFunc

func (pool *RunnerPool) LoopByFunc(generate func() (string, bool))

func (*RunnerPool) Tick

func (pool *RunnerPool) Tick(sec int)

type Selection

type Selection struct {
	goquery.Selection
}

type Session

type Session struct {
	Header            map[string]string
	Transprot         httplib.Transport
	Timeout           int
	RandomeUA         bool
	MultiGetRetryTime int
	Proxy             string
	Document          *goquery.Document
}

func NewSession

func NewSession() (sess *Session)

func (*Session) Asyncs

func (session *Session) Asyncs(work int, loadCache bool, showState bool, do func(each *AsyncOut), urls ...string) *Session

func (*Session) CheckAlive

func (sess *Session) CheckAlive(urls []string, showBar bool, after func(res *SmartResponse) bool, proxy ...interface{}) (alived []string)

func (*Session) Copy

func (session *Session) Copy() *Session

func (*Session) Get

func (session *Session) Get(url string, proxy ...interface{}) (resp *SmartResponse, err error)

Get accepts an optional proxy, given as one of: socks5://x.x.x.x:port, ss://..., ssr://..., or General.Config{...}

func (*Session) GetsWith

func (sess *Session) GetsWith(urltemp string, mapFuncs Gfunc, handleRes func(loger Loger, res *SmartResponse, err error), thread int, proxy ...interface{})
HttpByCustom

Example urltemp: "https://www.baidu.com/?uid={id}"

GetsWith("https://www.baidu.com/?uid={id}", func(p Payloader) (string, bool){
	defaultvalue := 0
	p.SetValue("id", func(v Value)Value{
		return v.Add(2)
	}, defaultvalue)
})

func (*Session) Json

func (session *Session) Json(url string, data map[string]interface{}, proxy ...interface{}) (resp *SmartResponse, err error)

func (*Session) MultiGet

func (sess *Session) MultiGet(urls []string, handleRes func(loger Loger, res *SmartResponse, err error), showBar bool, proxy ...interface{})

func (*Session) Post

func (session *Session) Post(httpurl string, data map[string]string, proxy ...interface{}) (resp *SmartResponse, err error)

func (*Session) Send

func (session *Session) Send(raw string, proxy ...interface{}) (resp *SmartResponse, err error)

func (*Session) SetHeader

func (session *Session) SetHeader(key string, value string)

func (*Session) SetProxy

func (session *Session) SetProxy(proxy interface{})

func (*Session) SetProxyDialer

func (session *Session) SetProxyDialer(dialer proxy.Dialer)

func (*Session) SetSocks5Proxy

func (session *Session) SetSocks5Proxy(proxyAddr string) (err error)

func (*Session) SetTimeout

func (session *Session) SetTimeout(t int)

func (*Session) StartAsync

func (session *Session) StartAsync(i int) *Async

func (*Session) TestErrorPage

func (session *Session) TestErrorPage(url string, proxy ...interface{}) (string, string, string)

func (*Session) Upload

func (session *Session) Upload(url string, filePath string, fileKey string, data map[string]string, showBar bool, proxy ...interface{}) (resp *SmartResponse, err error)

func (*Session) UrlJoin

func (session *Session) UrlJoin(f ...string) string

func (*Session) With

func (session *Session) With(urlstr string, proxy ...interface{}) (with *WithOper)

With saves the response to Document temporarily; Each can then be used with a CSS selector to process it.

type SmartResponse

type SmartResponse struct {
	httplib.Response
	Code int
	// contains filtered or unexported fields
}

func ParseRawData

func ParseRawData(buf []byte, url string) (r *SmartResponse, err error)

func (*SmartResponse) Base64

func (res *SmartResponse) Base64() string

func (*SmartResponse) Base64Mime

func (res *SmartResponse) Base64Mime() []byte

func (*SmartResponse) CssExtract

func (res *SmartResponse) CssExtract(cssSelctors Dict) (out G)
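CssExtract extracts values by CSS selector. A selector may optionally be followed by `| <option>`, where option is one of:
	raw | href | id | class
Example: CssExtract(Dict{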
	"name": "div.names#one " ,   // will return *goquery.Selection
	"nameText": "div.names#one | raw " , // will return node's string
	"imgLink" : "img#head | href ",
})

func (*SmartResponse) CssSelect

func (res *SmartResponse) CssSelect(css string, each func(i int, s *Selection))

func (*SmartResponse) FastCheckLineByLine

func (res *SmartResponse) FastCheckLineByLine(found func(line string) bool) (string, bool)
func (res *SmartResponse) FileLinks(includeouter ...bool) (s []string)

func (*SmartResponse) HashMMH3

func (res *SmartResponse) HashMMH3() int32

func (*SmartResponse) HashMMH3Base64

func (res *SmartResponse) HashMMH3Base64() int32

func (*SmartResponse) HeaderJson

func (smartres *SmartResponse) HeaderJson() string

func (*SmartResponse) HeaderString

func (smartres *SmartResponse) HeaderString() (d string)

func (*SmartResponse) Html

func (smartres *SmartResponse) Html() []byte

func (*SmartResponse) Json

func (smartres *SmartResponse) Json(obj ...interface{}) (jdata map[string]interface{})
func (res *SmartResponse) Links(includeouter ...bool) (s []string)

func (*SmartResponse) Md5

func (smartres *SmartResponse) Md5() string

Md5 gets the MD5 of the content.

func (*SmartResponse) PageTextHash

func (res *SmartResponse) PageTextHash() string

func (*SmartResponse) ReExtractString

func (smartres *SmartResponse) ReExtractString(re string) []string

Get regex group

func (*SmartResponse) RequestURL

func (smartres *SmartResponse) RequestURL() *url.URL

func (*SmartResponse) Search

func (smartres *SmartResponse) Search(key string, toLower bool) bool

func (*SmartResponse) Soup

func (smartres *SmartResponse) Soup() (m *goquery.Document)

Get Soup

func (*SmartResponse) String

func (smartres *SmartResponse) String() string

func (*SmartResponse) Text

func (resp *SmartResponse) Text() string

func (*SmartResponse) Title

func (smartres *SmartResponse) Title() string

Get title

type UrlSim

type UrlSim struct {
	// contains filtered or unexported fields
}

func AsUrlSim

func AsUrlSim(urlstr string, title ...string) (u *UrlSim)

func (*UrlSim) GetTitle

func (u *UrlSim) GetTitle() string

func (*UrlSim) GetUrl

func (u *UrlSim) GetUrl() string

func (*UrlSim) SetTitle

func (u *UrlSim) SetTitle(title string)

func (*UrlSim) Sub

func (u *UrlSim) Sub(other interface{}) (score float32)

type Value

type Value struct {
	// contains filtered or unexported fields
}

func NewValue

func NewValue(i interface{}) Value

func (Value) Add

func (v Value) Add(one int) Value

func (Value) AsInt

func (v Value) AsInt() (int, error)

func (Value) Empty

func (v Value) Empty() bool

func (Value) Increase

func (v Value) Increase() Value

func (Value) String

func (v Value) String() string

type WithOper

type WithOper struct {
	URL            *url.URL
	Document       *goquery.Document
	LastSelections []*goquery.Selection
	Links          Links
	Article        *Article
	Err            error
	// contains filtered or unexported fields
}

func (*WithOper) AsArticle

func (with *WithOper) AsArticle() *WithOper

func (*WithOper) AsSiteMap

func (with *WithOper) AsSiteMap(do func(out *AsyncOut), breakpointContinue bool, showState bool, filter func(chanelUrl string) bool) *WithOper

* AsSiteMap 爬取site-map 提取xml sitemap的大部分标准

>@breakpointContinue 开启断点续传,会自动读取和存储 已爬页面到 /tmp/default-skip.txt 和 /tmp/skip-site.txt

>@showState 开启状态显示

>@filter 通过url 过滤每个channel: return true to enter the channel, false to skip it

example  func(u string) bool { return strings.Contains(u, "/zh/") }

func (*WithOper) Each

func (with *WithOper) Each(css string, do ...func(i int, s *Selection)) *WithOper

func (*WithOper) EndCache

func (with *WithOper) EndCache() *WithOper

func (*WithOper) Entry

func (with *WithOper) Entry(url string) *WithOper

func (*WithOper) For

func (with *WithOper) For(do func(i int, s *Selection)) *WithOper

func (*WithOper) News

func (with *WithOper) News(filters ...FilterOption) *WithOper

func (*WithOper) PreTestSkip

func (with *WithOper) PreTestSkip(name string, urls ...string) (o []string)

func (*WithOper) SimpleNews

func (with *WithOper) SimpleNews() *WithOper

func (*WithOper) StartCache

func (with *WithOper) StartCache(name string) *WithOper

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL