youcrawl

package module
v0.0.0-...-0276a4f Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 24, 2020 License: MIT Imports: 21 Imported by: 0

README

YouCrawl

FOSSA Status codecov BCH compliance

简体中文 | English

Go crawl library

Install

go get -u github.com/allentom/youcrawl

Features

HTML parser : PuerkitoBio/goquery

Workflow

The yellow part will be executed in parallel

The crawler library contains the following components, which can be added as needed

  1. Middleware
  2. HTML Parser
  3. Pipeline
  4. GlobalStore
  5. PostProcess
  6. Plugin

The simplest example

Because most of the components are optional, no complicated code is required.

// main runs the smallest possible crawl: one URL, no optional components.
func main() {
	e := youcrawl.NewEngine(
		&youcrawl.EngineOption{
			// Up to 5 tasks at the same time
			MaxRequest: 5,
		},
	)
	e.AddURLs("http://www.example.com")
	e.RunAndWait()
}

The code above only requests the web page. The following example adds several components to show more features, so it is slightly more involved.

It collects data from a website and saves it to a JSON file.

// main crawls one page, records its <title> on the task item, and writes the
// collected items to ./output.json after the crawl.
func main() {
	e := youcrawl.NewEngine(
		&youcrawl.EngineOption{
			// Up to 5 tasks at the same time
			MaxRequest: 5,
		},
	)
	// add url
	e.AddURLs("http://www.example.com")
	// add UserAgent Middleware, add random UserAgent when requested
	e.UseMiddleware(&youcrawl.UserAgentMiddleware{})
	// Add parser and get page title.
	// An HTMLParser is func(ctx *youcrawl.Context) error; the goquery document
	// is reached through ctx.Doc.
	e.AddHTMLParser(func(ctx *youcrawl.Context) error {
		title := ctx.Doc.Find("title").Text()
		fmt.Println(title)
		// ctx.Item is an interface{}, so it has to be asserted to a concrete
		// type before SetValue can be called.
		// NOTE(review): assumes tasks created by AddURLs carry a
		// *youcrawl.DefaultItem — confirm.
		item, ok := ctx.Item.(*youcrawl.DefaultItem)
		if !ok {
			return youcrawl.TypeError
		}
		item.SetValue("title", title)
		return nil
	})
	// add Pipeline to store the item to the items in the GlobalStore
	e.AddPipelines(&youcrawl.GlobalStorePipeline{})
	// write the data under the `items` field in GlobalStore to the json file
	e.AddPostProcess(&youcrawl.OutputJsonPostProcess{
		StorePath: "./output.json",
	})
	e.RunAndWait()
}

License

FOSSA Status

Documentation

Index

Constants

View Source
// Status keys.
// NOTE(review): presumably the keys under which crawl status values are
// published — confirm against the code that reads and writes them.
const (
	// STATUS_KEY_TOTAL is the key for the total count.
	STATUS_KEY_TOTAL = "status.total"
	// STATUS_KEY_UNREQUESTED is the key for the unrequested count.
	STATUS_KEY_UNREQUESTED = "status.unrequested"
	// STATUS_KEY_COMPLETE is the key for the complete count.
	STATUS_KEY_COMPLETE = "status.complete"
	// STATUS_KEY_SPEED is the key for the speed.
	STATUS_KEY_SPEED = "status.speed"
)
View Source
const (
	// ItemKeyChannelToken is the item key "channelToken".
	// NOTE(review): presumably the key under which an item's channel token
	// is kept — confirm against ChannelPipeline.
	ItemKeyChannelToken = "channelToken"
)

Variables

View Source
// Errors describing failed item lookups.
var (
	// KeyNotContainError reports that a key is not present in the item.
	KeyNotContainError = errors.New("key not in item")
	// TypeError reports that an item value does not have the expected type.
	TypeError          = errors.New("error type of item value")
)
View Source
// EngineLogger is a logrus entry carrying the field scope=engine.
var EngineLogger *logrus.Entry = logrus.WithField("scope", "engine")

Functions

func CrawlProcess

func CrawlProcess(taskChannel chan struct{}, e *Engine, task *Task)

func ParseHTML

func ParseHTML(parser HTMLParser, ctx *Context) error

parse html with parser

func RandomIntRangeWithStringSeed

func RandomIntRangeWithStringSeed(min int, max int, seedString string) int

func ReadListFile

func ReadListFile(listFilePath string) ([]string, error)

func RequestWithURL

func RequestWithURL(task *Task, middlewares ...Middleware) (io.Reader, error)

make request with url

Types

type ChannelPipeline

// ChannelPipeline is a pipeline (it has a Process method) that holds a
// concurrent map in ChannelMapping.
// NOTE(review): presumably ChannelMapping maps item tokens to channels —
// confirm against Process.
type ChannelPipeline struct {
	ChannelMapping sync.Map
}

func (*ChannelPipeline) Process

func (p *ChannelPipeline) Process(item interface{}, _ GlobalStore) error

type ChannelPipelineToken

// ChannelPipelineToken is implemented by values that can report a token
// string. DefaultItem provides GetToken and therefore satisfies it.
type ChannelPipelineToken interface {
	GetToken() string
}

type Context

// Context shares data across the crawl process.
type Context struct {
	Request     *http.Request
	Response    *http.Response
	// Item is an untyped value; callers must assert it to a concrete type
	// before use.
	Item        interface{}
	GlobalStore GlobalStore
	Pool        TaskPool
	Cookie      *cookiejar.Jar
	// Doc is the goquery document handed to HTML parsers through the Context.
	Doc         *goquery.Document
}

Context shares data across the crawl process.

type CookieMiddleware

// CookieMiddleware is a middleware backed by a CookieStore.
type CookieMiddleware struct {
	Store  CookieStore
	// GetKey returns a string key for the given client, request and context.
	// NOTE(review): presumably the key selects which cookie jar in Store is
	// used — confirm against Process.
	GetKey func(c *http.Client, r *http.Request, ctx *Context) string
}

func NewCookieMiddleware

func NewCookieMiddleware(option CookieMiddlewareOption) *CookieMiddleware

func (*CookieMiddleware) Process

func (m *CookieMiddleware) Process(c *http.Client, r *http.Request, ctx *Context)

func (*CookieMiddleware) RequestCallback

func (m *CookieMiddleware) RequestCallback(c *http.Client, r *http.Request, ctx *Context)

type CookieMiddlewareOption

// CookieMiddlewareOption is the option struct accepted by NewCookieMiddleware.
type CookieMiddlewareOption struct {
	// GetKey has the same signature as CookieMiddleware.GetKey.
	GetKey func(c *http.Client, r *http.Request, ctx *Context) string
}

type CookieStore

// CookieStore keeps cookie jars addressed by a string key.
type CookieStore interface {
	// GetCookie returns the jar for key.
	GetCookie(key string) *cookiejar.Jar
	// SetCookie associates jar with key.
	SetCookie(key string, jar *cookiejar.Jar)
	// GetOrCreate returns a jar for key.
	// NOTE(review): presumably creates and stores a new jar when none
	// exists — confirm against the implementation.
	GetOrCreate(key string) *cookiejar.Jar
}

type DefaultCookieStore

// DefaultCookieStore embeds sync.Map and provides the GetCookie, SetCookie
// and GetOrCreate methods required by CookieStore.
type DefaultCookieStore struct {
	sync.Map
}

func (*DefaultCookieStore) GetCookie

func (s *DefaultCookieStore) GetCookie(key string) *cookiejar.Jar

func (*DefaultCookieStore) GetOrCreate

func (s *DefaultCookieStore) GetOrCreate(key string) *cookiejar.Jar

func (*DefaultCookieStore) SetCookie

func (s *DefaultCookieStore) SetCookie(key string, jar *cookiejar.Jar)

type DefaultItem

// DefaultItem is an item whose values are addressed by string key through
// GetValue, SetValue and the typed getters.
type DefaultItem struct {
	// Store holds values by string key.
	Store map[string]interface{}
}

func (*DefaultItem) GetFloat64

func (i *DefaultItem) GetFloat64(key string) (float64, error)

func (*DefaultItem) GetInt

func (i *DefaultItem) GetInt(key string) (int, error)

func (*DefaultItem) GetString

func (i *DefaultItem) GetString(key string) (string, error)

func (DefaultItem) GetToken

func (i DefaultItem) GetToken() string

func (*DefaultItem) GetValue

func (i *DefaultItem) GetValue(key string) (interface{}, error)

func (*DefaultItem) SetValue

func (i *DefaultItem) SetValue(key string, value interface{})

type DelayMiddleware

// DelayMiddleware is a middleware configured by Min, Max and Fixed.
// NOTE(review): presumably it delays requests by Fixed, or by a random value
// between Min and Max; the time unit is not visible here — confirm.
type DelayMiddleware struct {
	Min   int
	Max   int
	Fixed int
}

func (*DelayMiddleware) Process

func (d *DelayMiddleware) Process(_ *http.Client, r *http.Request, _ *Context)

func (*DelayMiddleware) RequestCallback

func (d *DelayMiddleware) RequestCallback(_ *http.Client, _ *http.Request, _ *Context)

type Engine

// Engine is the youcrawl engine. It embeds a sync.Mutex and its EngineOption,
// and holds the components registered through the Add*/Use* methods.
type Engine struct {
	sync.Mutex
	*EngineOption
	// Pool dispatches tasks.
	Pool        TaskPool
	Parsers     []HTMLParser
	Middlewares []Middleware
	Pipelines   []Pipeline
	GlobalStore GlobalStore
	PostProcess []PostProcess
	Plugins     []Plugin
	// InterruptChan receives the signal to force-stop the pool.
	InterruptChan chan struct{}
	// StopPoolChan receives the signal to stop the pool once all tasks are done.
	StopPoolChan chan struct{}
}

youcrawl engine

func NewEngine

func NewEngine(option *EngineOption) *Engine

init new engine

func (*Engine) AddHTMLParser

func (e *Engine) AddHTMLParser(parsers ...HTMLParser)

add HTML parsers

func (*Engine) AddPipelines

func (e *Engine) AddPipelines(pipelines ...Pipeline)

add pipelines

func (*Engine) AddPlugins

func (e *Engine) AddPlugins(plugins ...Plugin)

add plugins

func (*Engine) AddPostProcess

func (e *Engine) AddPostProcess(postprocessList ...PostProcess)

add postprocess

func (*Engine) AddTasks

func (e *Engine) AddTasks(tasks ...*Task)

Add tasks to crawl. This is an unsafe operation: the engine must not be running.

While the engine is running, use the RequestPool.AddTasks method instead.

func (*Engine) AddURLs

func (e *Engine) AddURLs(urls ...string)

Add URLs to crawl. This is an unsafe operation: the engine must not be running.

While the engine is running, use the RequestPool.AddURLs method instead.

func (*Engine) Run

func (e *Engine) Run(wg *sync.WaitGroup)

run crawl engine

func (*Engine) RunAndWait

func (e *Engine) RunAndWait()

run and wait it done

func (*Engine) UseMiddleware

func (e *Engine) UseMiddleware(middlewares ...Middleware)

add middleware

func (*Engine) UseTaskPool

func (e *Engine) UseTaskPool(taskPool TaskPool)

use taskPool

type EngineOption

// EngineOption is the engine configuration passed to NewEngine.
type EngineOption struct {
	// MaxRequest is the maximum number of tasks running at the same time.
	MaxRequest int

	// Daemon, when true, keeps the engine running until it is stopped
	// manually.
	Daemon bool
}

init engine config

type GlobalStore

// GlobalStore is the engine-wide key/value store.
type GlobalStore interface {
	// Init prepares the store and reports any error.
	Init() error
	// SetValue stores value under key.
	SetValue(key string, value interface{})
	// GetValue returns the value stored under key.
	GetValue(key string) interface{}
	// GetOrCreate returns a value for key.
	// NOTE(review): presumably stores and returns the supplied value when
	// the key is absent — confirm against the implementation.
	GetOrCreate(key string, value interface{}) interface{}
}

GlobalStore is the engine-wide global store.

type GlobalStorePipeline

// GlobalStorePipeline is a pipeline that saves the current item to the items
// kept in the GlobalStore.
type GlobalStorePipeline struct {
}

GlobalStorePipeline saves the current item to the global items.

func (*GlobalStorePipeline) Process

func (g *GlobalStorePipeline) Process(item interface{}, store GlobalStore) error

type HTMLParser

// HTMLParser is a parse callback. It receives the crawl Context and returns
// an error.
type HTMLParser func(ctx *Context) error

type ImageDownloadItem

// ImageDownloadItem is an item that carries image URLs. When the item passed
// to ImageDownloadPipeline has this type, GetUrls does not need to be set.
type ImageDownloadItem struct {
	// Urls lists the URLs carried by the item.
	Urls []string
}

type ImageDownloadPipeline

// ImageDownloadPipeline is a pipeline that downloads images. All callback
// fields receive the current item and the GlobalStore.
type ImageDownloadPipeline struct {
	// GetStoreFileFolder returns the folder to store files in.
	//
	// Defaults to ./download/image.
	GetStoreFileFolder func(item interface{}, store GlobalStore) string
	// GetSaveFileName returns the file name to save the image at rawURL under.
	//
	// Defaults to the same name as the image.
	GetSaveFileName func(item interface{}, store GlobalStore, rawURL string) string
	// GetUrls returns the URLs to download.
	//
	// It does not need to be set when the item is an ImageDownloadItem.
	GetUrls func(item interface{}, store GlobalStore) []string
	// MaxDownload is the maximum number of concurrent downloads.
	MaxDownload int
	// Middlewares are the request middlewares to use.
	Middlewares []Middleware
	// OnImageDownloadComplete is called each time an image download completes.
	OnImageDownloadComplete func(item interface{}, store GlobalStore, url string, downloadFilePath string)
	// OnDone is called once all image downloads have finished, regardless of
	// whether every download succeeded.
	OnDone func(item interface{}, store GlobalStore)
}

func (*ImageDownloadPipeline) Process

func (i *ImageDownloadPipeline) Process(item interface{}, store GlobalStore) error

type MemoryGlobalStore

// MemoryGlobalStore provides the Init, SetValue, GetValue and GetOrCreate
// methods required by GlobalStore.
// NOTE(review): it embeds sync.Map and also declares Content; which of the
// two actually holds the values is not visible here — confirm.
type MemoryGlobalStore struct {
	sync.Map
	Content map[string]interface{}
}

func (*MemoryGlobalStore) GetOrCreate

func (s *MemoryGlobalStore) GetOrCreate(key string, value interface{}) interface{}

func (*MemoryGlobalStore) GetValue

func (s *MemoryGlobalStore) GetValue(key string) interface{}

func (*MemoryGlobalStore) Init

func (s *MemoryGlobalStore) Init() error

func (*MemoryGlobalStore) SetValue

func (s *MemoryGlobalStore) SetValue(key string, value interface{})

type Middleware

// Middleware hooks into a request before and after it is made.
type Middleware interface {
	// Process is called before the request.
	Process(c *http.Client, r *http.Request, ctx *Context)
	// RequestCallback is called after the request.
	RequestCallback(c *http.Client, r *http.Request, ctx *Context)
}

type OutputCSVPostProcess

// OutputCSVPostProcess is a post-process constructed with
// NewOutputCSVPostProcess from an OutputCSVPostProcessOption.
type OutputCSVPostProcess struct {
	// contains filtered or unexported fields
}

func NewOutputCSVPostProcess

func NewOutputCSVPostProcess(option OutputCSVPostProcessOption) *OutputCSVPostProcess

func (*OutputCSVPostProcess) Process

func (o *OutputCSVPostProcess) Process(store GlobalStore) error

type OutputCSVPostProcessOption

// OutputCSVPostProcessOption is the option struct accepted by
// NewOutputCSVPostProcess.
type OutputCSVPostProcessOption struct {
	// OutputPath is the output path.
	// If not provided, `./output.csv` is used.
	OutputPath string
	// WithHeader controls whether a header is written.
	// Default: false.
	WithHeader bool
	// Keys are the keys to write.
	// If not provided, all keys are written.
	Keys []string
	// KeysMapping maps a key to its CSV column name.
	// If not provided, the key name is used as the column name.
	KeysMapping map[string]string
	// NotExistValue is written when a value does not exist in the item.
	// By default, the empty string is used.
	NotExistValue string
}

type OutputJsonPostProcess

// OutputJsonPostProcess is a post-process that writes data from the
// GlobalStore to a JSON file.
type OutputJsonPostProcess struct {
	// StorePath is the path of the JSON file to write.
	StorePath string
	// GetData returns a value derived from the store.
	// NOTE(review): presumably selects the data that is written, in place of
	// the default `items` field — confirm against Process.
	GetData   func(store GlobalStore) interface{}
}

func (*OutputJsonPostProcess) Process

func (p *OutputJsonPostProcess) Process(store GlobalStore) error

type Pipeline

// Pipeline processes an item, with access to the GlobalStore.
type Pipeline interface {
	// Process handles item and reports any error.
	Process(item interface{}, store GlobalStore) error
}

type Plugin

// Plugin is a component that is run with access to the Engine.
type Plugin interface {
	// Run executes the plugin against e.
	Run(e *Engine)
}

type PostProcess

// PostProcess operates on the GlobalStore.
// NOTE(review): presumably invoked after the crawl has finished — confirm
// against Engine.Run.
type PostProcess interface {
	// Process handles the store and reports any error.
	Process(store GlobalStore) error
}

type ProxyMiddleware

// ProxyMiddleware is a middleware that holds a list of proxies.
type ProxyMiddleware struct {
	// List holds the proxy entries.
	List []string
}

func NewProxyMiddleware

func NewProxyMiddleware(option ProxyMiddlewareOption) (*ProxyMiddleware, error)

func (*ProxyMiddleware) GetProxy

func (p *ProxyMiddleware) GetProxy() string

func (*ProxyMiddleware) Process

func (p *ProxyMiddleware) Process(c *http.Client, r *http.Request, ctx *Context)

func (*ProxyMiddleware) RequestCallback

func (p *ProxyMiddleware) RequestCallback(c *http.Client, r *http.Request, ctx *Context)

type ProxyMiddlewareOption

// ProxyMiddlewareOption is the option struct accepted by NewProxyMiddleware.
type ProxyMiddlewareOption struct {
	// ProxyList sets the proxy list.
	// If both ProxyList and ProxyFilePath are provided, the two lists are
	// combined.
	ProxyList []string
	// ProxyFilePath is the file to read proxies from; `./proxy.txt` is used
	// by default.
	// If both ProxyList and ProxyFilePath are provided, the two lists are
	// combined.
	ProxyFilePath string
}

type RequestPool

// RequestPool keeps tasks in the Tasks slice and provides every method
// required by TaskPool. It embeds a sync.RWMutex.
type RequestPool struct {
	Tasks         []Task
	Total         int
	CompleteCount int
	NextTask      *Task
	GetTaskChan   chan *Task
	DoneChan      chan struct{}
	CompleteChan  chan *Task
	PreventStop   bool
	Store         GlobalStore
	sync.RWMutex
}

func NewRequestPool

func NewRequestPool(option RequestPoolOption, store GlobalStore) *RequestPool

func (*RequestPool) AddTasks

func (p *RequestPool) AddTasks(tasks ...*Task)

add task

func (*RequestPool) AddURLs

func (p *RequestPool) AddURLs(urls ...string)

add task to task pool

func (*RequestPool) Close

func (p *RequestPool) Close()

func (*RequestPool) GetCompleteCount

func (p *RequestPool) GetCompleteCount() (int, error)

func (*RequestPool) GetDoneChan

func (p *RequestPool) GetDoneChan() chan struct{}

func (*RequestPool) GetOneTask

func (p *RequestPool) GetOneTask(e *Engine) <-chan *Task

func (*RequestPool) GetTotal

func (p *RequestPool) GetTotal() (int, error)

func (*RequestPool) GetUnRequestCount

func (p *RequestPool) GetUnRequestCount() (int, error)

func (*RequestPool) GetUnRequestedTask

func (p *RequestPool) GetUnRequestedTask() (target *Task)

find an unrequested task

func (*RequestPool) OnTaskDone

func (p *RequestPool) OnTaskDone(task *Task)

get task from pool task

func (*RequestPool) SetPrevent

func (p *RequestPool) SetPrevent(isPrevent bool)

type RequestPoolOption

// RequestPoolOption is the option struct accepted by NewRequestPool.
type RequestPoolOption struct {
	UseCookie   bool
	PreventStop bool
}

type StatusOutputPlugin

// StatusOutputPlugin is a plugin that logs the engine status.
type StatusOutputPlugin struct {
	// disable log output
	// NOTE(review): the field name suggests that true enables log output,
	// while the comment above says "disable" — confirm the intended polarity.
	LogOutput bool
}

log engine status plugin

func (*StatusOutputPlugin) Run

func (p *StatusOutputPlugin) Run(e *Engine)

type Task

// Task tracks a request task: its ID, URL and Context, plus the Requested
// and Completed flags.
type Task struct {
	ID        string
	Url       string
	Context   Context
	Requested bool
	Completed bool
}

tracking request task

func NewTask

func NewTask(url string, item interface{}) Task

type TaskPool

// TaskPool is the task pool used by the Engine; a custom implementation can
// be installed with Engine.UseTaskPool. RequestPool provides these methods.
type TaskPool interface {
	// AddURLs adds the URLs to the pool.
	AddURLs(urls ...string)
	// AddTasks adds tasks to the pool.
	AddTasks(task ...*Task)
	// GetOneTask returns a receive-only channel of tasks.
	GetOneTask(e *Engine) <-chan *Task
	// GetUnRequestedTask finds an unrequested task.
	GetUnRequestedTask() (target *Task)
	OnTaskDone(task *Task)
	GetDoneChan() chan struct{}
	Close()
	SetPrevent(isPrevent bool)
	GetTotal() (int, error)
	GetUnRequestCount() (int, error)
	GetCompleteCount() (int, error)
}

type UserAgentMiddleware

// UserAgentMiddleware is a middleware that holds a list of user agents.
type UserAgentMiddleware struct {
	// List holds the user agent strings.
	List []string
}

func NewUserAgentMiddleware

func NewUserAgentMiddleware(option UserAgentMiddlewareOption) (*UserAgentMiddleware, error)

func (*UserAgentMiddleware) GetUserAgent

func (p *UserAgentMiddleware) GetUserAgent() string

func (*UserAgentMiddleware) Process

func (p *UserAgentMiddleware) Process(c *http.Client, r *http.Request, ctx *Context)

func (*UserAgentMiddleware) RequestCallback

func (p *UserAgentMiddleware) RequestCallback(c *http.Client, r *http.Request, ctx *Context)

type UserAgentMiddlewareOption

// UserAgentMiddlewareOption is the option struct accepted by
// NewUserAgentMiddleware.
type UserAgentMiddlewareOption struct {
	// UserAgentList sets the user agent list.
	// If both UserAgentList and UserAgentFilePath are provided, the two
	// lists are combined.
	UserAgentList []string
	// UserAgentFilePath is the file to read user agents from; `./ua.txt` is
	// used by default.
	// If both UserAgentList and UserAgentFilePath are provided, the two
	// lists are combined.
	UserAgentFilePath string
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL