discover

package
v0.0.0-...-e72d39b Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 14, 2018 License: AGPL-3.0 Imports: 8 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func CompressSpace

func CompressSpace(s string) string

CompressSpace reduces every whitespace sequence (spaces, tabs, newlines, etc.) in a string to a single space. Leading and trailing whitespace is trimmed. This has the effect of converting multiline strings to one line.

func GetAttr

func GetAttr(n *html.Node, attr string) string

GetAttr retrieves the value of an attribute on a node. It returns an empty string if the attribute doesn't exist.

func GetTextContent

func GetTextContent(n *html.Node) string

GetTextContent recursively fetches the text for a node.

Types

type DiscoverStats

type DiscoverStats struct {
	ErrorCount int
	FetchCount int
}

type Discoverer

type Discoverer struct {
	Name               string
	StartURL           url.URL
	ArtPats            []*regexp.Regexp
	NavLinkSel         cascadia.Selector
	BaseErrorThreshold int
	StripFragments     bool
	StripQuery         bool
	HostPat            *regexp.Regexp

	ErrorLog Logger
	InfoLog  Logger
	Stats    DiscoverStats
}

func NewDiscoverer

func NewDiscoverer(cfg DiscovererDef) (*Discoverer, error)

func (*Discoverer) CookArticleURL

func (disc *Discoverer) CookArticleURL(baseURL *url.URL, artLink string) (*url.URL, error)

func (*Discoverer) Run

func (disc *Discoverer) Run(client *http.Client) (LinkSet, error)

type DiscovererDef

type DiscovererDef struct {
	Name   string
	URL    string
	ArtPat []string
	NavSel string
	// BaseErrorThreshold is the starting number of HTTP errors to accept
	// before bailing out.
	// Error threshold formula: base + 10% of the successful request count.
	BaseErrorThreshold int

	// HostPat is a regex matching accepted domains.
	// If empty, everything on a different domain is rejected.
	HostPat string

	// If NoStripQuery is set, article URLs keep their query part instead of having it stripped.
	NoStripQuery bool
}

type LinkSet

type LinkSet map[url.URL]bool

LinkSet is a thin map wrapper providing some set operations.

func (*LinkSet) Add

func (s *LinkSet) Add(link url.URL)

func (*LinkSet) Merge

func (s *LinkSet) Merge(other LinkSet)

Merge merges the contents of other into this set.

func (*LinkSet) Pop

func (s *LinkSet) Pop() url.URL

Pop removes and returns a single item from the set.

func (*LinkSet) Remove

func (s *LinkSet) Remove(link url.URL)

type Logger

type Logger interface {
	Printf(format string, v ...interface{})
}

type NullLogger

type NullLogger struct{}

func (NullLogger) Printf

func (l NullLogger) Printf(format string, v ...interface{})

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL