gotana

package module
v0.0.0-...-e505dc8
Published: Apr 20, 2018 License: MIT Imports: 28 Imported by: 0

README

===========
Gotana
===========

.. image:: https://cdn0.iconfinder.com/data/icons/antique-weapons/1000/weapon-12-128.png


:Info: An easy-to-use and extensible scraping framework written in Go.
:Repository: https://github.com/jnosal/gotana
:Author: Jacek Nosal
:Maintainer: Jacek Nosal


Documentation
=============
http://gotana.readthedocs.org/en/latest/


Examples
========
https://github.com/jnosal/gotana/tree/master/examples

Documentation

Index

Constants

const (
	STATE_INITIAL  = "INITIAL"
	STATE_RUNNING  = "RUNNING"
	STATE_STOPPING = "STOPPING"
)

const (
	EVENT_SCRAPER_OPENED     = "SCRAPER_OPENED"
	EVENT_SCRAPER_CLOSED     = "SCRAPER_CLOSED"
	EVENT_SAVEABLE_EXTRACTED = "SAVEABLE_EXTRACTED"
	STATUS_CODE_INITIAL      = 999
	TIMEOUT_DIALER           = time.Duration(time.Second * 30)
	TIMEOUT_REQUEST          = time.Duration(time.Second * 30)
	TIMEOUT_TLS              = time.Duration(time.Second * 10)
)

const (
	TCP_CONNECTION_READLINE_DEADLINE = 30
)

const TYPE_CONTAINS = "contains"
const TYPE_REGEXP = "regexp"

Variables

This section is empty.

Functions

func CommandExtensions

func CommandExtensions(message string, conn net.Conn, server *TCPServer)

func CommandHelp

func CommandHelp(message string, conn net.Conn, server *TCPServer)

func CommandItems

func CommandItems(message string, conn net.Conn, server *TCPServer)

func CommandList

func CommandList(message string, conn net.Conn, server *TCPServer)

func CommandMiddleware

func CommandMiddleware(message string, conn net.Conn, server *TCPServer)

func CommandStats

func CommandStats(message string, conn net.Conn, server *TCPServer)

func CommandStop

func CommandStop(message string, conn net.Conn, server *TCPServer)

func ContainsOneOf

func ContainsOneOf(s string, targets []string) bool

func DelAcceptEncodingMiddleware

func DelAcceptEncodingMiddleware(request *http.Request) *http.Request

func DescribeFunc

func DescribeFunc(f interface{}) string

func DescribeStruct

func DescribeStruct(v interface{}) string

func DisplayBytes

func DisplayBytes(bytes []byte)

func DisplayResponseBody

func DisplayResponseBody(r io.Reader)

func GetMapKeys

func GetMapKeys(m map[string]interface{}) []string

func Logger

func Logger() *logging.Logger

func NewHTTPClient

func NewHTTPClient() (client *http.Client)

func NewHTTPServer

func NewHTTPServer(address string, engine *Engine) (server *fury.Fury)

func ProcessFile

func ProcessFile(config interface{}, file string) error

func RandomUserAgentMiddleware

func RandomUserAgentMiddleware(request *http.Request) *http.Request

func SaveItem

func SaveItem(item SaveableItem, dao DAO)

func SilentRecover

func SilentRecover(name string)

func StripString

func StripString(s string) string
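
A minimal sketch of the string helpers above. The exact behaviour of StripString is an assumption based on its name (whitespace trimming); ContainsOneOf follows directly from its signature:

package main

import (
	"fmt"

	"github.com/jnosal/gotana"
)

func main() {
	// Presumably reports whether the string contains any of the targets.
	fmt.Println(gotana.ContainsOneOf("https://example.com/articles/1",
		[]string{"/articles/", "/posts/"}))

	// Assumed here to trim surrounding whitespace.
	fmt.Println(gotana.StripString("  padded  "))
}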

Types

type DAO

type DAO interface {
	SaveItem(name string, data []byte) error
	GetItems(name string) []string
	CountItems(name string) int64
	ProcessItem(item string) genericStruct
	ProcessItems(items []string) []genericStruct
}

func GetDAO

func GetDAO(engine *Engine) DAO

type DisplayExtension

type DisplayExtension struct {
}

func (*DisplayExtension) ItemScraped

func (d *DisplayExtension) ItemScraped(scraper *Scraper, item SaveableItem)

func (*DisplayExtension) ScraperStarted

func (d *DisplayExtension) ScraperStarted(scraper *Scraper)

func (*DisplayExtension) ScraperStopped

func (d *DisplayExtension) ScraperStopped(scraper *Scraper)

type Engine

type Engine struct {
	Meta   *EngineMeta
	Config *ScraperConfig
	// contains filtered or unexported fields
}

func NewEngine

func NewEngine() (r *Engine)

func (*Engine) AddScrapers

func (engine *Engine) AddScrapers(scrapers ...*Scraper) *Engine

func (*Engine) Cleanup

func (engine *Engine) Cleanup()

func (Engine) Done

func (engine Engine) Done() bool

func (*Engine) FromConfig

func (engine *Engine) FromConfig(config *ScraperConfig) *Engine

func (*Engine) GetScraper

func (engine *Engine) GetScraper(name string) *Scraper

func (*Engine) HasScraper

func (engine *Engine) HasScraper(name string) bool

func (*Engine) IncrFinishedCounter

func (engine *Engine) IncrFinishedCounter()

func (*Engine) PrepareRequest

func (engine *Engine) PrepareRequest(request *http.Request) *http.Request

func (*Engine) SetHandler

func (engine *Engine) SetHandler(handler ScrapingHandlerFunc) *Engine

func (*Engine) Start

func (engine *Engine) Start()

func (*Engine) Stop

func (engine *Engine) Stop()

func (*Engine) UseExtension

func (engine *Engine) UseExtension(extensions ...Extension) *Engine

func (*Engine) UseMiddleware

func (engine *Engine) UseMiddleware(middleware ...RequestMiddlewareFunc) *Engine
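
A minimal sketch of wiring an engine from the constructors and chainable methods above. The URL and request limit are placeholders, and whether Start blocks the calling goroutine is not specified here:

package main

import "github.com/jnosal/gotana"

func main() {
	scraper := gotana.NewScraper(gotana.ScraperParams{
		Name:         "example",
		Url:          "https://example.com",
		RequestLimit: 100,
		Extractor:    &gotana.LinkExtractor{},
	})

	// Each configuration method returns the *Engine, so calls chain.
	engine := gotana.NewEngine().
		AddScrapers(scraper).
		UseMiddleware(gotana.RandomUserAgentMiddleware).
		UseExtension(&gotana.DisplayExtension{})

	engine.Start()
}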

type EngineMeta

type EngineMeta struct {
	ScraperStats  map[string]*ScraperMeta
	Started       time.Time
	RequestsTotal int
	LastRequest   *http.Request
	LastResponse  *http.Response
	// contains filtered or unexported fields
}

func NewEngineMeta

func NewEngineMeta() (m *EngineMeta)

func (*EngineMeta) IncrSaved

func (meta *EngineMeta) IncrSaved(scraper *Scraper)

func (*EngineMeta) IncrScraped

func (meta *EngineMeta) IncrScraped(scraper *Scraper)

func (*EngineMeta) UpdateRequestStats

func (meta *EngineMeta) UpdateRequestStats(scraper *Scraper, isSuccessful bool, request *http.Request, response *http.Response)

type Extension

type Extension interface {
	ScraperStarted(scraper *Scraper)
	ScraperStopped(scraper *Scraper)
	ItemScraped(scraper *Scraper, item SaveableItem)
}
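
A sketch of a custom Extension. CountingExtension is hypothetical; it simply tallies items as the engine reports lifecycle events:

package main

import (
	"log"

	"github.com/jnosal/gotana"
)

// CountingExtension is a hypothetical Extension that tallies scraped items.
type CountingExtension struct {
	count int
}

func (e *CountingExtension) ScraperStarted(s *gotana.Scraper) {
	log.Printf("scraper started: %s", s.Name)
}

func (e *CountingExtension) ScraperStopped(s *gotana.Scraper) {
	log.Printf("scraper stopped: %s after %d items", s.Name, e.count)
}

func (e *CountingExtension) ItemScraped(s *gotana.Scraper, item gotana.SaveableItem) {
	e.count++
}

func main() {
	gotana.NewEngine().UseExtension(&CountingExtension{})
}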

type Extractable

type Extractable interface {
	Extract(io.ReadCloser, func(string))
}
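
A sketch of a custom Extractable, using goquery (which this package already depends on, per the HTMLDocument signature below) to parse the body. TitleExtractor is hypothetical:

package main

import (
	"io"

	"github.com/PuerkitoBio/goquery"
	"github.com/jnosal/gotana"
)

// TitleExtractor is a hypothetical Extractable that emits the text of
// every <title> element found in a response body.
type TitleExtractor struct{}

func (t *TitleExtractor) Extract(body io.ReadCloser, callback func(string)) {
	defer body.Close()
	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		return
	}
	doc.Find("title").Each(func(_ int, sel *goquery.Selection) {
		callback(sel.Text())
	})
}

func main() {
	gotana.NewScraper(gotana.ScraperParams{
		Name:      "titles",
		Url:       "https://example.com",
		Extractor: &TitleExtractor{},
	})
}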

type HealthCheckResource

type HealthCheckResource struct {
	// contains filtered or unexported fields
}

func (HealthCheckResource) Get

func (resource HealthCheckResource) Get(meta *fury.Meta)

type LinkExtractor

type LinkExtractor struct {
	Extractable
}

func (*LinkExtractor) Extract

func (extractor *LinkExtractor) Extract(r io.ReadCloser, callback func(string))

type ListByScraperResource

type ListByScraperResource struct {
	// contains filtered or unexported fields
}

func (ListByScraperResource) Get

func (resource ListByScraperResource) Get(meta *fury.Meta)

type RedisDAO

type RedisDAO struct {
	// contains filtered or unexported fields
}

func NewRedisDao

func NewRedisDao(address string) (dao RedisDAO)

func (RedisDAO) CountItems

func (r RedisDAO) CountItems(name string) int64

func (RedisDAO) GetItems

func (r RedisDAO) GetItems(name string) []string

func (RedisDAO) GetLatestItem

func (r RedisDAO) GetLatestItem(name string) error

func (RedisDAO) KeyPrefixed

func (r RedisDAO) KeyPrefixed(key string) string

func (RedisDAO) ProcessItem

func (r RedisDAO) ProcessItem(item string) genericStruct

func (RedisDAO) ProcessItems

func (r RedisDAO) ProcessItems(items []string) []genericStruct

func (RedisDAO) SaveItem

func (r RedisDAO) SaveItem(name string, data []byte) error

func (RedisDAO) String

func (r RedisDAO) String() string
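
A sketch of using the Redis-backed DAO directly, assuming a Redis server listening on the default local port:

package main

import (
	"fmt"

	"github.com/jnosal/gotana"
)

func main() {
	dao := gotana.NewRedisDao("127.0.0.1:6379")

	// Store a raw record under the "articles" collection, then read back.
	if err := dao.SaveItem("articles", []byte(`{"title":"hello"}`)); err != nil {
		panic(err)
	}
	fmt.Println(dao.CountItems("articles"))
	fmt.Println(dao.GetItems("articles"))
}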

type RequestMiddlewareFunc

type RequestMiddlewareFunc func(request *http.Request) *http.Request
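
Middleware runs on each outgoing request before the engine dispatches it. A sketch combining a hypothetical header-stamping middleware with the bundled DelAcceptEncodingMiddleware:

package main

import (
	"net/http"

	"github.com/jnosal/gotana"
)

// AuthHeaderMiddleware is a hypothetical RequestMiddlewareFunc that stamps
// an API-key header on every outgoing request.
func AuthHeaderMiddleware(request *http.Request) *http.Request {
	request.Header.Set("X-Api-Key", "secret") // hypothetical credential
	return request
}

func main() {
	gotana.NewEngine().UseMiddleware(
		AuthHeaderMiddleware,
		gotana.DelAcceptEncodingMiddleware,
	)
}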

type SaveInRedisExtension

type SaveInRedisExtension struct {
}

func (*SaveInRedisExtension) ItemScraped

func (d *SaveInRedisExtension) ItemScraped(scraper *Scraper, item SaveableItem)

func (*SaveInRedisExtension) ScraperStarted

func (d *SaveInRedisExtension) ScraperStarted(scraper *Scraper)

func (*SaveInRedisExtension) ScraperStopped

func (d *SaveInRedisExtension) ScraperStopped(scraper *Scraper)

type SaveableItem

type SaveableItem interface {
	Scraper() *Scraper
	Validate() bool
	RecordData() ([]byte, error)
}
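
A sketch of a custom item. Embedding ScraperMixin (below) supplies the Scraper() method, leaving Validate and RecordData to implement; Article and its fields are hypothetical:

package main

import (
	"encoding/json"

	"github.com/jnosal/gotana"
)

// Article is a hypothetical SaveableItem.
type Article struct {
	gotana.ScraperMixin
	Title string `json:"title"`
	Url   string `json:"url"`
}

func (a Article) Validate() bool {
	return a.Title != "" && a.Url != ""
}

func (a Article) RecordData() ([]byte, error) {
	return json.Marshal(a)
}

func main() {
	// Compile-time check that Article satisfies SaveableItem.
	var _ gotana.SaveableItem = Article{}
}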

type ScrapedItem

type ScrapedItem struct {
	Url      string
	FinalUrl string `json:"-"`

	BodyBytes []byte `json:"-"`
	// contains filtered or unexported fields
}

func NewScrapedItem

func NewScrapedItem(url string, scraper *Scraper, resp *http.Response) ScrapedItem

func (ScrapedItem) CheckIfRedirected

func (proxy ScrapedItem) CheckIfRedirected() bool

func (ScrapedItem) CheckURLPatterns

func (proxy ScrapedItem) CheckURLPatterns() (result bool)

func (ScrapedItem) FinalResponseBody

func (proxy ScrapedItem) FinalResponseBody() (io.ReadCloser, error)

func (ScrapedItem) HTMLDocument

func (proxy ScrapedItem) HTMLDocument() (document *goquery.Document, err error)

func (ScrapedItem) ScheduleScraperStop

func (proxy ScrapedItem) ScheduleScraperStop()

func (ScrapedItem) String

func (proxy ScrapedItem) String() (result string)

type Scraper

type Scraper struct {
	Name       string
	Domain     string
	Scheme     string
	BaseUrl    string
	CurrentUrl string
	// contains filtered or unexported fields
}

func NewScraper

func NewScraper(params ScraperParams) (s *Scraper)

func (*Scraper) AddPatterns

func (scraper *Scraper) AddPatterns(urlPatterns ...URLPattern) *Scraper

func (*Scraper) CheckIfFetched

func (scraper *Scraper) CheckIfFetched(url string) (ok bool)

func (*Scraper) CheckIfShouldStop

func (scraper *Scraper) CheckIfShouldStop() (ok bool)

func (*Scraper) CheckUrl

func (scraper *Scraper) CheckUrl(sourceUrl string) (ok bool, url string)

func (*Scraper) Fetch

func (scraper *Scraper) Fetch(url string) (resp *http.Response, err error)

func (*Scraper) MarkAsFetched

func (scraper *Scraper) MarkAsFetched(url string)

func (*Scraper) Notify

func (scraper *Scraper) Notify(url string, resp *http.Response)

func (*Scraper) RunExtractor

func (scraper *Scraper) RunExtractor(resp *http.Response)

func (*Scraper) SetHandler

func (scraper *Scraper) SetHandler(handler ScrapingHandlerFunc) *Scraper

func (*Scraper) Start

func (scraper *Scraper) Start()

func (*Scraper) Stop

func (scraper *Scraper) Stop()

func (*Scraper) String

func (scraper *Scraper) String() (result string)

type ScraperConfig

type ScraperConfig struct {
	Project      string `required:"true"`
	HttpAddress  string
	TcpAddress   string
	RedisAddress string
	Scrapers     []struct {
		RequestLimit int `required:"true"`
		Extractor    string
		Name         string `required:"true"`
		Url          string `required:"true"`
		Patterns     []struct {
			Type    string `required:"true"`
			Pattern string `required:"true"`
		}
	}
}

func NewSpiderConfig

func NewSpiderConfig(file string) (config *ScraperConfig)
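
A sketch of config-driven setup. The file path is a placeholder; the on-disk format follows the examples directory linked in the README:

package main

import "github.com/jnosal/gotana"

func main() {
	// Load a scraper definition file and build an engine from it.
	config := gotana.NewSpiderConfig("scraper.yml") // hypothetical path
	gotana.NewEngine().FromConfig(config).Start()
}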

type ScraperMeta

type ScraperMeta struct {
	// contains filtered or unexported fields
}

func NewScraperMeta

func NewScraperMeta() (m *ScraperMeta)

type ScraperMixin

type ScraperMixin struct {
	Proxy ScrapedItem
}

func (ScraperMixin) Scraper

func (item ScraperMixin) Scraper() *Scraper

func (*ScraperMixin) SetProxy

func (s *ScraperMixin) SetProxy(proxy ScrapedItem) *ScraperMixin

type ScraperParams

type ScraperParams struct {
	Name         string
	Url          string
	RequestLimit int
	Extractor    Extractable
}

type ScrapingHandlerFunc

type ScrapingHandlerFunc func(ScrapedItem, chan<- SaveableItem)
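
A sketch of a handler. logHandler is hypothetical and only logs each scraped page; a real handler would build validated SaveableItems and send them on the channel for saving:

package main

import (
	"log"

	"github.com/jnosal/gotana"
)

func logHandler(item gotana.ScrapedItem, saver chan<- gotana.SaveableItem) {
	doc, err := item.HTMLDocument()
	if err != nil {
		return
	}
	log.Printf("%s -> %q", item.Url, doc.Find("title").Text())
}

func main() {
	gotana.NewEngine().SetHandler(logHandler)
}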

type StatsResource

type StatsResource struct {
	// contains filtered or unexported fields
}

func (StatsResource) Get

func (resource StatsResource) Get(meta *fury.Meta)

type TCPCommand

type TCPCommand func(message string, conn net.Conn, server *TCPServer)

type TCPMessage

type TCPMessage struct {
	// contains filtered or unexported fields
}

type TCPServer

type TCPServer struct {
	// contains filtered or unexported fields
}

func NewTCPServer

func NewTCPServer(address string, engine *Engine) (server *TCPServer)

func (*TCPServer) AddCommand

func (server *TCPServer) AddCommand(name string, handler TCPCommand)

func (*TCPServer) Start

func (server *TCPServer) Start()

func (*TCPServer) Stop

func (server *TCPServer) Stop()
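
A sketch of running the TCP control server with a hypothetical extra command; the address and command name are placeholders:

package main

import (
	"net"

	"github.com/jnosal/gotana"
)

// pingCommand is a hypothetical TCPCommand that answers any message
// routed to it with PONG.
func pingCommand(message string, conn net.Conn, server *gotana.TCPServer) {
	conn.Write([]byte("PONG\n"))
}

func main() {
	engine := gotana.NewEngine()
	server := gotana.NewTCPServer("127.0.0.1:7654", engine)
	server.AddCommand("PING", pingCommand)
	server.Start()
}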

type URLPattern

type URLPattern struct {
	Type    string `required:"true"`
	Pattern string `required:"true"`
}

func NewURLPattern

func NewURLPattern(kind string, pattern string) (instance URLPattern)

func (URLPattern) String

func (item URLPattern) String() (result string)

func (*URLPattern) Validate

func (item *URLPattern) Validate(url string) (result bool)
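
A sketch of the two pattern types (TYPE_CONTAINS and TYPE_REGEXP from the constants above) and of attaching patterns to a scraper. Whether Validate expects a full URL or only a path is not specified here:

package main

import (
	"fmt"

	"github.com/jnosal/gotana"
)

func main() {
	// TYPE_CONTAINS matches by substring, TYPE_REGEXP by regular expression.
	contains := gotana.NewURLPattern(gotana.TYPE_CONTAINS, "/articles/")
	regexp := gotana.NewURLPattern(gotana.TYPE_REGEXP, `/articles/\d+$`)

	fmt.Println(contains.Validate("https://example.com/articles/42"))
	fmt.Println(regexp.Validate("https://example.com/articles/42"))

	// Patterns restrict which discovered URLs a scraper will follow.
	scraper := gotana.NewScraper(gotana.ScraperParams{Name: "example", Url: "https://example.com"})
	scraper.AddPatterns(contains, regexp)
}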
