spider

package module
v0.0.0-...-f124479 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 8, 2021 License: Apache-2.0 Imports: 20 Imported by: 0

README

go-spider

A spider (web crawler) framework written in Go.

Documentation

Index

Constants

View Source
// File-name suffixes (leading dot included). They are the values stored in
// the ContentTypes table, which is keyed by MIME media type.
//
// NOTE(review): ContentType3GP, ContentType3G2 and ContentType7Z are the only
// upper-case suffixes. They are left untouched because callers may compare
// against them — confirm whether lower case was intended.
const (
	// ContentTypeACC is the suffix for AAC audio ("audio/aac"). The
	// identifier keeps its existing spelling for compatibility, but the value
	// is the real extension, ".aac".
	ContentTypeACC    = ".aac"
	ContentTypeABW    = ".abw"
	ContentTypeARC    = ".arc"
	ContentTypeAVI    = ".avi"
	ContentTypeAZW    = ".azw"
	ContentTypeBIN    = ".bin"
	ContentTypeBMP    = ".bmp"
	ContentTypeBZ     = ".bz"
	ContentTypeBZ2    = ".bz2"
	ContentTypeCSH    = ".csh"
	ContentTypeCSS    = ".css"
	ContentTypeCSV    = ".csv"
	ContentTypeDOC    = ".doc"
	ContentTypeDOCX   = ".docx"
	ContentTypeEOT    = ".eot"
	ContentTypeEPUB   = ".epub"
	ContentTypeGIF    = ".gif"
	ContentTypeHTM    = ".htm"
	ContentTypeHTML   = ".html"
	ContentTypeICO    = ".ico"
	ContentTypeICS    = ".ics"
	ContentTypeJAR    = ".jar"
	ContentTypeJPEG   = ".jpeg"
	ContentTypeJPG    = ".jpg"
	ContentTypeJS     = ".js"
	ContentTypeJSON   = ".json"
	ContentTypeJSONLD = ".jsonld"
	ContentTypeMID    = ".mid"
	ContentTypeMIDI   = ".midi"
	ContentTypeMJS    = ".mjs"
	ContentTypeMP3    = ".mp3"
	ContentTypeMPEG   = ".mpeg"
	ContentTypeMPKG   = ".mpkg"
	ContentTypeODP    = ".odp"
	ContentTypeODS    = ".ods"
	ContentTypeODT    = ".odt"
	ContentTypeOGA    = ".oga"
	ContentTypeOGV    = ".ogv"
	ContentTypeOGX    = ".ogx"
	ContentTypeOTF    = ".otf"
	ContentTypePNG    = ".png"
	ContentTypePDF    = ".pdf"
	ContentTypePPT    = ".ppt"
	ContentTypePPTX   = ".pptx"
	ContentTypeRAR    = ".rar"
	ContentTypeRTF    = ".rtf"
	ContentTypeSH     = ".sh"
	ContentTypeSVG    = ".svg"
	ContentTypeSWF    = ".swf"
	ContentTypeTAR    = ".tar"
	ContentTypeTIF    = ".tif"
	ContentTypeTIFF   = ".tiff"
	ContentTypeTTF    = ".ttf"
	ContentTypeTXT    = ".txt"
	ContentTypeVSD    = ".vsd"
	ContentTypeWAV    = ".wav"
	ContentTypeWEBA   = ".weba"
	ContentTypeWEBM   = ".webm"
	ContentTypeWEBP   = ".webp"
	ContentTypeWOFF   = ".woff"
	ContentTypeWOFF2  = ".woff2"
	ContentTypeXHTML  = ".xhtml"
	ContentTypeXLS    = ".xls"
	ContentTypeXLSX   = ".xlsx"
	ContentTypeXML    = ".xml"
	ContentTypeXUL    = ".xul"
	ContentTypeZIP    = ".zip"
	ContentType3GP    = ".3GP"
	ContentType3G2    = ".3G2"
	ContentType7Z     = ".7Z"
)
View Source
// Sleep type selectors. The numeric values are spelled out so they stay
// stable if the list is ever reordered.
//
// NOTE(review): the meaning of each mode is not visible here. Presumably
// SleepTypeNode means "none" (no sleeping), SleepTypeFixed a constant delay
// and SleepTypeRandom a random delay — confirm against the spider
// implementation.
const (
	SleepTypeNode   = 0
	SleepTypeFixed  = 1
	SleepTypeRandom = 2
)
View Source
// SpiderConcuDefault is the default concurrency value.
// NOTE(review): presumably the number of simultaneous requests — confirm.
const SpiderConcuDefault = 100

// Default sleep settings: the sleep type plus its lower and upper bounds.
// NOTE(review): the unit of the bounds is not visible here (presumably
// milliseconds) — confirm.
const (
	SleepMaxDefault  = 1000
	SleepMinDefault  = 0
	SleepTypeDefault = SleepTypeRandom
)

// DownloadPathDefault is the default download path.
const DownloadPathDefault = "/tmp/tamper"
View Source
// SelectorDefault is the default comma-separated list of element selectors.
// It is assembled from constant pieces only to keep the lines short; the
// resulting string is a single compile-time constant.
const SelectorDefault = "script, link, a, img, " +
	"frame, iframe, area, base, " +
	"blockquote, body, del, head, " +
	"ins, object, q"

Variables

View Source
// ContentTypes maps a MIME media type to the file suffix used for content of
// that type.
//
// Every key is a single, complete media type. MIDI has two registered
// spellings ("audio/midi" and "audio/x-midi"), each with its own entry. Each
// OpenDocument and Ogg media type maps to the suffix of its own format.
var ContentTypes = map[string]string{
	"audio/aac":                    ContentTypeACC,
	"application/x-abiword":        ContentTypeABW,
	"application/x-freearc":        ContentTypeARC,
	"video/x-msvideo":              ContentTypeAVI,
	"application/vnd.amazon.ebook": ContentTypeAZW,
	"application/octet-stream":     ContentTypeBIN,
	"image/bmp":                    ContentTypeBMP,
	"application/x-bzip":           ContentTypeBZ,
	"application/x-bzip2":          ContentTypeBZ2,
	"application/x-csh":            ContentTypeCSH,
	"text/css":                     ContentTypeCSS,
	"text/csv":                     ContentTypeCSV,
	"application/msword":           ContentTypeDOC,
	"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ContentTypeDOCX,

	"application/vnd.ms-fontobject":       ContentTypeEOT,
	"application/epub+zip":                ContentTypeEPUB,
	"image/gif":                           ContentTypeGIF,
	"text/html":                           ContentTypeHTML,
	"image/vnd.microsoft.icon":            ContentTypeICO,
	"text/calendar":                       ContentTypeICS,
	"application/java-archive":            ContentTypeJAR,
	"image/jpeg":                          ContentTypeJPEG,
	"text/javascript":                     ContentTypeJS,
	"application/javascript":              ContentTypeJS,
	"application/json":                    ContentTypeJSON,
	"application/ld+json":                 ContentTypeJSONLD,
	"audio/midi":                          ContentTypeMIDI,
	"audio/x-midi":                        ContentTypeMIDI,
	"audio/mpeg":                          ContentTypeMP3,
	"video/mpeg":                          ContentTypeMPEG,
	"application/vnd.apple.installer+xml": ContentTypeMPKG,
	"application/vnd.oasis.opendocument.presentation": ContentTypeODP,
	"application/vnd.oasis.opendocument.spreadsheet":  ContentTypeODS,
	"application/vnd.oasis.opendocument.text":         ContentTypeODT,
	"audio/ogg":                     ContentTypeOGA,
	"video/ogg":                     ContentTypeOGV,
	"application/ogg":               ContentTypeOGX,
	"font/otf":                      ContentTypeOTF,
	"image/png":                     ContentTypePNG,
	"application/pdf":               ContentTypePDF,
	"application/vnd.ms-powerpoint": ContentTypePPT,
	"application/vnd.openxmlformats-officedocument.presentationml.presentation": ContentTypePPTX,

	"application/x-rar-compressed":  ContentTypeRAR,
	"application/rtf":               ContentTypeRTF,
	"application/x-sh":              ContentTypeSH,
	"image/svg+xml":                 ContentTypeSVG,
	"application/x-shockwave-flash": ContentTypeSWF,
	"application/x-tar":             ContentTypeTAR,
	"image/tiff":                    ContentTypeTIFF,
	"font/ttf":                      ContentTypeTTF,
	"text/plain":                    ContentTypeTXT,
	"application/vnd.visio":         ContentTypeVSD,
	"audio/wav":                     ContentTypeWAV,
	"audio/webm":                    ContentTypeWEBA,
	"video/webm":                    ContentTypeWEBM,
	"image/webp":                    ContentTypeWEBP,
	"font/woff":                     ContentTypeWOFF,
	"font/woff2":                    ContentTypeWOFF2,
	"application/xhtml+xml":         ContentTypeXHTML,
	"application/vnd.ms-excel":      ContentTypeXLS,
	"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ContentTypeXLSX,

	"application/xml":                 ContentTypeXML,
	"application/vnd.mozilla.xul+xml": ContentTypeXUL,
	"application/zip":                 ContentTypeZIP,
	"video/3gpp":                      ContentType3GP,
	"audio/3gpp":                      ContentType3GP,
	"video/3gpp2":                     ContentType3G2,
	"audio/3gpp2":                     ContentType3G2,
	"application/x-7z-compressed":     ContentType7Z,
}

Functions

This section is empty.

Types

type DomProcesser

type DomProcesser struct {
	// contains filtered or unexported fields
}

func NewDomProcesser

func NewDomProcesser(options ...OptionDomProcesser) *DomProcesser

func (*DomProcesser) Finish

func (dp *DomProcesser) Finish()

func (*DomProcesser) Process

func (dp *DomProcesser) Process(charSet string, certain bool, rsp *http.Response) ([]*http.Request, error)

type Downloader

// Downloader is the interface for handling the URL, header and body of a
// response. FileDownloader in this package has a Download method with the
// same signature.
type Downloader interface {
	// Download takes the URL, the header, a reader and the suffix.
	// It returns, in order, the URL location, the header location and the
	// body location, followed by an error.
	Download(u *url.URL, header http.Header, reader io.Reader, suffix string) (urlPath, hdrPath, bodyPath *string, err error)
}

type FileDownloader

type FileDownloader struct {
	// contains filtered or unexported fields
}

func NewFileDownloader

func NewFileDownloader(path string) *FileDownloader

func (*FileDownloader) Download

func (fd *FileDownloader) Download(u *url.URL, header http.Header, reader io.Reader, suffix string) (*string, *string, *string, error)

type Filter

// Filter is the set of allow/deny checks a spider filter must provide.
// LimitFilter in this package has methods with these signatures.
type Filter interface {
	// SuffixAllow reports a decision for the given suffix
	// (presumably true means the suffix is accepted — confirm).
	SuffixAllow(suffix string) bool
	// SizeAllow reports a decision for the given size
	// (presumably a size in bytes — confirm).
	SizeAllow(size int64) bool
	// HttpsAllow reports a decision that takes no input
	// (presumably whether HTTPS URLs are accepted — confirm).
	HttpsAllow() bool
}

By default, responses with Transfer-Encoding: chunked are not cached. By default, responses with Content-Type: application/octet-stream are not cached.

type LimitFilter

type LimitFilter struct {
	// contains filtered or unexported fields
}

func NewLimitFilter

func NewLimitFilter(options ...OptionFilter) (*LimitFilter, error)

func (*LimitFilter) HttpsAllow

func (lm *LimitFilter) HttpsAllow() bool

func (*LimitFilter) SizeAllow

func (lm *LimitFilter) SizeAllow(size int64) bool

func (*LimitFilter) SuffixAllow

func (lm *LimitFilter) SuffixAllow(suffix string) bool

type OptionDomProcesser

// OptionDomProcesser is a functional option that receives a *DomProcesser.
// NewDomProcesser accepts any number of these.
type OptionDomProcesser func(*DomProcesser)

func OptionDomProcesserSelectors

func OptionDomProcesserSelectors(selectors []string) OptionDomProcesser

type OptionFilter

// OptionFilter is a functional option that receives a *LimitFilter and may
// return an error. NewLimitFilter accepts any number of these.
type OptionFilter func(*LimitFilter) error

func OptionFilterSize

func OptionFilterSize(size int64) OptionFilter

func OptionFilterSuffixs

func OptionFilterSuffixs(suffixs []string) OptionFilter

type OptionSpider

// OptionSpider is a functional option that receives a *Spider.
// NewSpider accepts any number of these.
type OptionSpider func(*Spider)

func OptionSpiderConcu

func OptionSpiderConcu(concu uint32) OptionSpider

func OptionSpiderDownloader

func OptionSpiderDownloader(downloader Downloader) OptionSpider

func OptionSpiderFilter

func OptionSpiderFilter(filter Filter) OptionSpider

func OptionSpiderProcesser

func OptionSpiderProcesser(processer Processer) OptionSpider

func OptionSpiderRequestCheckRedirect

func OptionSpiderRequestCheckRedirect(checkRedirect func(req *http.Request, via []*http.Request) error) OptionSpider

func OptionSpiderRequestHeader

func OptionSpiderRequestHeader(key, value string) OptionSpider

func OptionSpiderRequestTimeout

func OptionSpiderRequestTimeout(timeout time.Duration) OptionSpider

func OptionSpiderResponseChunkedAllowed

func OptionSpiderResponseChunkedAllowed(allowed bool) OptionSpider

func OptionSpiderSchduler

func OptionSpiderSchduler(resourceMgr ResourceManager) OptionSpider

func OptionSpiderScheduler

func OptionSpiderScheduler(scheduler Scheduler) OptionSpider

func OptionSpiderSleep

func OptionSpiderSleep(tp, min, max uint) OptionSpider

type Processer

// Processer is the interface for handling an HTTP response.
// DomProcesser in this package has methods with these signatures.
type Processer interface {
	// Process receives a character set name, a flag and the response, and
	// returns a list of requests plus an error.
	// NOTE(review): presumably certain says whether charSet is known for
	// sure, and the returned requests are follow-up links — confirm.
	Process(charSet string, certain bool, rsp *http.Response) ([]*http.Request, error)

	// Finish takes no arguments and returns nothing
	// (presumably called once processing is over — confirm).
	Finish()
}

type ResourceChan

type ResourceChan struct {
	// contains filtered or unexported fields
}

func NewResourceChan

func NewResourceChan(all uint32) *ResourceChan

func (*ResourceChan) Acquire

func (rc *ResourceChan) Acquire()

func (*ResourceChan) Free

func (rc *ResourceChan) Free() uint32

func (*ResourceChan) Release

func (rc *ResourceChan) Release()

func (*ResourceChan) Used

func (rc *ResourceChan) Used() uint32

type ResourceManager

// ResourceManager is the interface for acquiring and releasing a counted
// resource. ResourceChan in this package has methods with these signatures.
type ResourceManager interface {
	Acquire()
	Release()

	// Free and Used: the idle count and the in-use count.
	Free() uint32
	Used() uint32
}

type Result

// Result is the per-request record; Spider.Result returns a map of these.
// The struct tags define its JSON form: Req and Rsp are never serialized,
// Depth is always emitted, and every other field is omitted when empty.
type Result struct {
	// Error is an error message; omitted from JSON when empty.
	Error string `json:"error,omitempty"`

	// Request result: the request and response, excluded from JSON.
	Req *http.Request  `json:"-"`
	Rsp *http.Response `json:"-"`

	// Inner parser result.
	Size    int64  `json:"size,omitempty"`
	Suffix  string `json:"suffix,omitempty"`
	CharSet string `json:"charset,omitempty"`

	// Download result: pointers, so a nil value is omitted from JSON.
	UrlPath  *string `json:"url_path,omitempty"`
	HdrPath  *string `json:"hdr_path,omitempty"`
	BodyPath *string `json:"body_path,omitempty"`

	// Processer result. Depth has no omitempty, so a zero depth is emitted.
	Depth uint     `json:"depth"`
	Subs  []string `json:"subs,omitempty"`
}

type Scheduler

// Scheduler is the interface for storing requests and handing them back.
// SchedulerChan in this package has methods with these signatures.
type Scheduler interface {
	// Push hands a request to the scheduler.
	Push(req *http.Request)
	// Poll returns a request from the scheduler.
	// NOTE(review): whether it blocks or returns nil when empty is not
	// visible here — confirm.
	Poll() *http.Request

	// Rest returns an int (presumably the number of requests still
	// waiting — confirm).
	Rest() int
}

type SchedulerChan

type SchedulerChan struct {
	// contains filtered or unexported fields
}

func NewSchedulerChan

func NewSchedulerChan() *SchedulerChan

func (*SchedulerChan) Poll

func (sc *SchedulerChan) Poll() *http.Request

func (*SchedulerChan) Push

func (sc *SchedulerChan) Push(req *http.Request)

func (*SchedulerChan) Rest

func (sc *SchedulerChan) Rest() int

type Spider

type Spider struct {
	// contains filtered or unexported fields
}

Does not include Content-Type sniffing.

func NewSpider

func NewSpider(options ...OptionSpider) *Spider

func (*Spider) AddRequest

func (spider *Spider) AddRequest(req *http.Request) *Spider

func (*Spider) Result

func (spider *Spider) Result() map[string]*Result

func (*Spider) Run

func (spider *Spider) Run() *Spider

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL