crawler

package
v0.0.0-...-1e84083 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 25, 2021 License: MIT Imports: 11 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Cache

// Cache is the key-value store interface accepted by New and NewCollector.
type Cache interface {
	// Add inserts a new key-value pair into the cache.
	// The method has no return value, so nothing (e.g. an "evicted" flag)
	// is reported back to the caller when the key already exists.
	// NOTE(review): behavior for an already-present key is up to the
	// implementation — confirm whether it overwrites or keeps the old value.
	Add(key string, value interface{})
	// Get returns the value stored under the corresponding key.
	// ok == true if the object exists in the cache, otherwise ok == false.
	Get(key string) (value interface{}, ok bool)
}

type Collector

// Collector has no exported fields; obtain one with NewCollector and run it
// through its Work method, which returns a *Graph.
type Collector struct {
	// contains filtered or unexported fields
}

func NewCollector

func NewCollector(log *zap.SugaredLogger, c Cache) *Collector

func (*Collector) Work

func (c *Collector) Work(ctx context.Context, URL url.URL, maxDepth int) *Graph

type Crawler

// Crawler is created with New and exposes a single exported field, Graph.
type Crawler struct {
	// Graph is presumably the link graph built up by the Scrape* methods —
	// TODO confirm against the implementation.
	Graph *Graph
	// contains filtered or unexported fields
}

func New

func New(log *zap.SugaredLogger, c Cache) *Crawler

func (*Crawler) ParseWebsite

func (c *Crawler) ParseWebsite(ctx context.Context, websiteURL url.URL) ([]link, error)

ParseWebsite parses the HTML at the given URL and returns the href values of all <a href> elements it finds.

func (*Crawler) Scrape

func (c *Crawler) Scrape(ctx context.Context, URL url.URL, maxDepth int) (*Graph, error)

func (*Crawler) ScrapeChan

func (c *Crawler) ScrapeChan(ctx context.Context, jobs <-chan Job, results chan<- Job, errCh chan<- error, maxDepth int)

func (*Crawler) ScrapeRec

func (c *Crawler) ScrapeRec(ctx context.Context, sourceURL url.URL, depth int, maxDepth int) error

type Edges

// Edges maps a Node to a slice of Nodes.
// NOTE(review): presumably key = source node and value = the destination
// nodes it links to (cf. Graph.AddEdge(src, dst)) — confirm.
type Edges map[Node][]Node

func (*Edges) MarshalJSON

func (e *Edges) MarshalJSON() ([]byte, error)

type Graph

// Graph holds a list of nodes and the edges between them, plus unexported
// state not shown here.
type Graph struct {
	// Nodes is tagged with the JSON key "nodes".
	Nodes []Node `json:"nodes"`
	// Edges is tagged with the JSON key "links" (not "edges").
	Edges Edges  `json:"links"`
	// contains filtered or unexported fields
}

func (*Graph) AddEdge

func (g *Graph) AddEdge(src url.URL, dst url.URL)

func (*Graph) AddNode

func (g *Graph) AddNode(URL url.URL)

func (*Graph) EdgeExists

func (g *Graph) EdgeExists(src url.URL, dst url.URL) bool

type Job

// Job pairs a URL with a depth value; it is the element type of the jobs and
// results channels taken by Crawler.ScrapeChan.
type Job struct {
	// URL is held by value (url.URL, not *url.URL).
	URL   url.URL
	// Depth is presumably the crawl depth associated with URL, compared
	// against maxDepth — TODO confirm.
	Depth int
}

type Node

// Node wraps a single URL.
type Node struct {
	// URL is tagged with the JSON key "id".
	// NOTE(review): *Node also defines MarshalJSON, so the emitted JSON may
	// not follow this tag directly — confirm against that method.
	URL url.URL `json:"id"`
}

func (*Node) MarshalJSON

func (n *Node) MarshalJSON() ([]byte, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL