filia

Published: Sep 28, 2014 · License: MIT

Documentation

Constants

const (
	DocumentFile = DocumentType(iota)
	DocumentDirectory
	DocumentLink
	DocumentSpecial
)

Variables

var (
	DefaultSettings = Settings{
		Protos: map[string]Proto{
			"http":  HTTPProto{},
			"https": HTTPProto{},
			"ftp":   NewFTPProto(),
			"sftp":  NewSFTPProto(),
		},
		Decoders: map[string]Decoder{
			"text/html":             DefaultHTMLDecoder,
			"application/xhtml+xml": DefaultHTMLDecoder,
			"application/pdf":       DefaultPDFDecoder,
			"image/png":             DefaultImageDecoder,
			"image/jpeg":            DefaultImageDecoder,
			"image/gif":             DefaultImageDecoder,
			"video/webm":            DefaultMediaDecoder,
			"audio/mpeg":            DefaultMediaDecoder,
			"application/ogg":       DefaultMediaDecoder,
			"application/zip":       DefaultZIPDecoder,
			"application/x-gzip":    DefaultGzipDecoder,
		},
	}
	DefaultCrawler = Crawler{
		Settings: DefaultSettings,
		Queue:    make(StdCrawlerQueue),
		Set:      *set.New(),
		Output:   make(chan Document),
	}
)

var (
	DefaultHTMLDecoder  = HTMLDecoder{}
	DefaultPDFDecoder   = PDFDecoder{}
	DefaultImageDecoder = ImageDecoder{}
	DefaultMediaDecoder = MediaDecoder{}
	DefaultZIPDecoder   = ZIPDecoder{}
	DefaultGzipDecoder  = GzipDecoder{}
)
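
The default tables are plain maps keyed by URL scheme (Protos) and MIME type (Decoders), so they can be extended before a crawl starts. Below is a minimal sketch, written as if compiled inside the package (the module's import path is truncated on this page); the "image/webp" entry is purely illustrative and not part of the package:

package filia

// extendDefaults reuses DefaultImageDecoder for an additional content
// type. Whether ImageDecoder extracts anything useful from WebP is an
// assumption; the point is only the map-based registration.
func extendDefaults() {
	DefaultSettings.Decoders["image/webp"] = DefaultImageDecoder
}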

Functions

This section is empty.

Types

type Crawler

type Crawler struct {
	Settings
	Queue CrawlerQueue
	Set   set.Set

	Output chan Document
	ErrC   chan error
	CloseC chan struct{}
}

func NewCrawler

func NewCrawler() *Crawler

func (*Crawler) Close

func (c *Crawler) Close()

func (*Crawler) Crawl

func (c *Crawler) Crawl()

func (*Crawler) Emit

func (c *Crawler) Emit(urls ...string)

func (*Crawler) Errors

func (c *Crawler) Errors() <-chan error
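
A minimal usage sketch of the Crawler API, assuming Crawl blocks while it processes the queue, Output carries decoded Documents, and Close stops the crawl; none of this behaviour is stated on this page beyond what the names suggest:

package filia

import "fmt"

// crawlExample wires NewCrawler, Emit, Errors and Output together.
// It is a sketch under the assumptions described above.
func crawlExample() {
	c := NewCrawler()
	go c.Crawl()

	go func() {
		for err := range c.Errors() {
			fmt.Println("crawl error:", err)
		}
	}()

	c.Emit("https://example.com/")

	for doc := range c.Output {
		fmt.Println(doc.URL, doc.ContentType, doc.Title)
	}
}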

type CrawlerQueue

type CrawlerQueue interface {
	// Send sends the list of urls in given order to the queue
	Send(urls ...string)
	// Recv receives one url from the queue and returns it. It may block.
	Recv() (url string)
}

A CrawlerQueue is, in most cases, simply a channel for sending and receiving URL strings, but it is expressed as two methods so that external queue systems such as Redis and RabbitMQ can be plugged in.
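
As a hedged sketch of what such an external queue could look like, the type below implements CrawlerQueue with an in-memory slice guarded by a condition variable; every name in it is hypothetical, and a real implementation would talk to Redis or RabbitMQ instead:

package filia

import "sync"

// memQueue is a hypothetical CrawlerQueue backed by a slice.
type memQueue struct {
	mu    sync.Mutex
	cond  *sync.Cond
	items []string
}

func newMemQueue() *memQueue {
	q := &memQueue{}
	q.cond = sync.NewCond(&q.mu)
	return q
}

// Send appends the URLs in the given order and wakes any waiting Recv.
func (q *memQueue) Send(urls ...string) {
	q.mu.Lock()
	q.items = append(q.items, urls...)
	q.mu.Unlock()
	q.cond.Broadcast()
}

// Recv blocks until a URL is available and returns it.
func (q *memQueue) Recv() (url string) {
	q.mu.Lock()
	defer q.mu.Unlock()
	for len(q.items) == 0 {
		q.cond.Wait()
	}
	url = q.items[0]
	q.items = q.items[1:]
	return url
}

A value of this type could then be assigned to Crawler.Queue in place of the default StdCrawlerQueue.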

type Decoder

type Decoder interface {
	Decode(doc *Document, rc io.ReadCloser) error
}
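
A Decoder receives the partially filled Document together with the response body and fills in whatever it understands. A minimal hypothetical sketch, not part of the package, that simply stores the body as Content (whether a Decoder is responsible for closing the body is not documented here; this sketch closes it to be safe):

package filia

import "io"

// plainDecoder is a hypothetical Decoder; it reads the whole body into
// doc.Content and does nothing else.
type plainDecoder struct{}

func (plainDecoder) Decode(doc *Document, rc io.ReadCloser) error {
	defer rc.Close()
	b, err := io.ReadAll(rc)
	if err != nil {
		return err
	}
	doc.Content = string(b)
	return nil
}

It could be registered under a MIME-type key in Settings.Decoders, just like the defaults above.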

type Document

type Document struct {
	URL         *url.URL
	Type        DocumentType
	ContentType string
	Time        time.Time
	Size        int64

	Links []string

	Title        string
	Version      string
	Album        string
	Artist       string
	Performer    string
	Copyright    string
	License      string
	Organisation string
	Genre        string
	Date         string
	ISRC         string
	Author       string
	Description  string

	Content string

	NoIndex  bool
	NoFollow bool
}

func (Document) AbsLinks

func (d Document) AbsLinks() (r []string)

func (*Document) Init

func (d *Document) Init()
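
A sketch of consuming a Document, assuming AbsLinks resolves the entries of Links against URL and that NoFollow records a robots directive; both are guesses from the names, not statements from this page:

package filia

import "fmt"

// printDoc prints a crawled document's headline data and its outgoing
// links, under the assumptions described above.
func printDoc(d Document) {
	fmt.Printf("%s (%s, %d bytes)\n", d.Title, d.ContentType, d.Size)
	if d.NoFollow {
		return
	}
	for _, link := range d.AbsLinks() {
		fmt.Println("  ->", link)
	}
}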

type DocumentType

type DocumentType int
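
The DocumentType constants above distinguish regular files, directories, links and special entries, as far as their names indicate. A small sketch that maps them to labels:

package filia

import "fmt"

// describeType maps the DocumentType constants to labels derived from
// the constant names.
func describeType(t DocumentType) string {
	switch t {
	case DocumentFile:
		return "file"
	case DocumentDirectory:
		return "directory"
	case DocumentLink:
		return "link"
	case DocumentSpecial:
		return "special"
	default:
		return fmt.Sprintf("unknown (%d)", int(t))
	}
}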

type FTPProto

type FTPProto struct {
	// contains filtered or unexported fields
}

func NewFTPProto

func NewFTPProto() *FTPProto

func (*FTPProto) Get

func (p *FTPProto) Get(url_ *url.URL) (doc Document, body io.ReadCloser, err error)

type FileProto

type FileProto struct {
}

func (*FileProto) Get

func (p *FileProto) Get(url_ *url.URL) (doc Document, body io.ReadCloser, err error)

type GzipDecoder

type GzipDecoder struct {
	Tar TarDecoder
}

func (GzipDecoder) Decode

func (g GzipDecoder) Decode(doc *Document, rc io.ReadCloser) error

type HTMLDecoder

type HTMLDecoder struct{}

func (HTMLDecoder) Decode

func (h HTMLDecoder) Decode(doc *Document, rc io.ReadCloser) error

type HTTPProto

type HTTPProto struct {
	Client http.Client
}

func (HTTPProto) Get

func (p HTTPProto) Get(url_ *url.URL) (doc Document, body io.ReadCloser, err error)

type ImageDecoder

type ImageDecoder struct{}

func (ImageDecoder) Decode

func (i ImageDecoder) Decode(doc *Document, rc io.ReadCloser) error

type MediaDecoder

type MediaDecoder struct{}

func (MediaDecoder) Decode

func (m MediaDecoder) Decode(doc *Document, rc io.ReadCloser) error

type PDFDecoder

type PDFDecoder struct{}

func (PDFDecoder) Decode

func (p PDFDecoder) Decode(doc *Document, rc io.ReadCloser) error

type Proto

type Proto interface {
	Get(url *url.URL) (doc Document, body io.ReadCloser, err error)
}
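
A Proto fetches a URL for one scheme and returns the partially filled Document plus its body. The sketch below is hypothetical: an in-memory proto for a made-up "mem" scheme, useful only to show the shape of an implementation:

package filia

import (
	"io"
	"net/url"
	"strings"
	"time"
)

// memProto is a hypothetical Proto that serves fixed content from a map
// keyed by URL string; it is not part of the package.
type memProto struct {
	pages map[string]string
}

func (p memProto) Get(u *url.URL) (doc Document, body io.ReadCloser, err error) {
	content := p.pages[u.String()]
	doc = Document{
		URL:         u,
		Type:        DocumentFile,
		ContentType: "text/html",
		Time:        time.Now(),
		Size:        int64(len(content)),
	}
	return doc, io.NopCloser(strings.NewReader(content)), nil
}

An instance could then be registered under its scheme key in Settings.Protos.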

type SFTPProto

type SFTPProto struct {
	// contains filtered or unexported fields
}

func NewSFTPProto

func NewSFTPProto() *SFTPProto

func (*SFTPProto) Get

func (s *SFTPProto) Get(url_ *url.URL) (doc Document, body io.ReadCloser, err error)

type Settings

type Settings struct {
	Protos   map[string]Proto
	Decoders map[string]Decoder
}

func (*Settings) CrawlURL

func (c *Settings) CrawlURL(url string) (doc Document, err error)

func (*Settings) Fetch

func (c *Settings) Fetch(urls string) (Document, io.ReadCloser, error)
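
Settings can also be used without a Crawler. A minimal sketch, assuming CrawlURL both fetches the URL with the matching Proto and runs the matching Decoder, which is what the surrounding types suggest:

package filia

import "fmt"

// crawlOne is a hedged sketch of the lower-level Settings API.
func crawlOne() {
	doc, err := DefaultSettings.CrawlURL("https://example.com/")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(doc.Title, len(doc.Links), "links")
}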

type StdCrawlerQueue

type StdCrawlerQueue chan string

StdCrawlerQueue is a string channel with the methods required by CrawlerQueue.

func (StdCrawlerQueue) Recv

func (s StdCrawlerQueue) Recv() string

Recv receives one string and is just a wrapper for <-s, but is needed to fulfill the CrawlerQueue interface.

func (StdCrawlerQueue) Send

func (s StdCrawlerQueue) Send(urls ...string)

Send sends the urls to the string channel in order. It is just a wrapper for s <- url, but is needed to fulfill the CrawlerQueue interface.
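
A short usage sketch; the buffer size is arbitrary and only chosen so that Send does not block here:

package filia

import "fmt"

// stdQueueExample shows StdCrawlerQueue used as a buffered channel.
func stdQueueExample() {
	q := make(StdCrawlerQueue, 2)
	q.Send("https://example.com/", "https://example.org/")
	fmt.Println(q.Recv())
	fmt.Println(q.Recv())
}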

type TarDecoder

type TarDecoder struct{}

func (TarDecoder) Decode

func (t TarDecoder) Decode(doc *Document, rc io.ReadCloser) error

type ZIPDecoder

type ZIPDecoder struct{}

func (ZIPDecoder) Decode

func (z ZIPDecoder) Decode(doc *Document, rc io.ReadCloser) error
