discover

package
v0.0.0-...-23e6414 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 20, 2022 License: AGPL-3.0 Imports: 9 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ErrQuit is a sentinel error value carrying the message "quit requested".
// NOTE(review): presumably returned by Discoverer.Run when its quit channel
// fires - confirm against the implementation.
var ErrQuit = errors.New("quit requested")

Functions

func CompressSpace

func CompressSpace(s string) string

CompressSpace reduces every whitespace sequence (spaces, tabs, newlines, etc.) in a string to a single space. Leading and trailing whitespace is trimmed. This has the effect of converting multiline strings to a single line.

func GetAttr

func GetAttr(n *html.Node, attr string) string

GetAttr retrieves the value of an attribute on a node. It returns the empty string if the attribute doesn't exist.

func GetTextContent

func GetTextContent(n *html.Node) string

GetTextContent recursively fetches the text for a node.

Types

type DiscoverStats

// DiscoverStats holds two integer counters. Discoverer exposes a value of
// this type as its Stats field.
// NOTE(review): the field meanings below are inferred from the names only;
// confirm against Discoverer.Run.
type DiscoverStats struct {
	// ErrorCount is presumably the number of failed HTTP requests.
	ErrorCount int
	// FetchCount is presumably the number of fetches performed.
	FetchCount int
}

type Discoverer

// Discoverer holds the compiled configuration and running state for a link
// discovery run. It is created by NewDiscoverer from a DiscovererDef.
// NOTE(review): the per-field notes below are matched by name to the
// commented fields of DiscovererDef; the mapping itself lives in
// NewDiscoverer, which is not visible here - confirm.
type Discoverer struct {
	// Name presumably mirrors DiscovererDef.Name.
	Name               string
	// StartURL is presumably the parsed form of DiscovererDef.URL.
	StartURL           url.URL
	// ArtPats are regexes for article URLs to include.
	ArtPats            []*regexp.Regexp
	// XArtPats are regexes for article URLs to exclude.
	XArtPats           []*regexp.Regexp
	// NavLinkSel is the CSS selector that identifies navigation links.
	NavLinkSel         cascadia.Selector
	// XNavPats are regexes for pages to skip during link discovery.
	XNavPats           []*regexp.Regexp
	// CruftSel is the CSS selector for elements to cull during article discovery.
	CruftSel           cascadia.Selector
	// BaseErrorThreshold is the starting number of HTTP errors to accept
	// before bailing out (see DiscovererDef.BaseErrorThreshold).
	BaseErrorThreshold int
	// StripFragments presumably removes the #fragment from article URLs - TODO confirm.
	StripFragments     bool
	// StripQuery presumably removes the query part from article URLs; likely
	// the inverse of DiscovererDef.NoStripQuery - TODO confirm.
	StripQuery         bool
	// HostPat is a regex matching accepted domains.
	HostPat            *regexp.Regexp
	// UserAgent is the User-Agent string to use in HTTP requests.
	UserAgent          string

	// ErrorLog and InfoLog are Logger sinks.
	// NOTE(review): presumably for error and informational messages respectively.
	ErrorLog Logger
	InfoLog  Logger
	// Stats holds the DiscoverStats counters for this Discoverer.
	Stats    DiscoverStats
}

func NewDiscoverer

func NewDiscoverer(cfg DiscovererDef) (*Discoverer, error)

func (*Discoverer) CookArticleURL

func (disc *Discoverer) CookArticleURL(baseURL *url.URL, artLink string) (*url.URL, error)

func (*Discoverer) Run

func (disc *Discoverer) Run(client *http.Client, quit <-chan struct{}) (LinkSet, error)

type DiscovererDef

// DiscovererDef is the plain-data (strings, ints and bools) definition of a
// discoverer. NewDiscoverer takes one and returns a *Discoverer or an error.
type DiscovererDef struct {
	// Name of this discoverer.
	Name string
	// URL is presumably the page where discovery starts (compare
	// Discoverer.StartURL) - TODO confirm.
	URL  string
	// ArtPat lists regexes matching article URLs to include.
	ArtPat []string
	// XArtPat lists regexes matching article URLs to exclude.
	XArtPat []string

	// ArtForm lists article URL forms to include (e.g. "/YYYY/MM/SLUG.html").
	ArtForm []string
	// XArtForm lists article URL forms to exclude.
	XArtForm []string

	// NavSel is a CSS selector identifying navigation links.
	NavSel string
	// XNavPat lists regexp patterns of pages to skip during link discovery.
	XNavPat []string

	// CruftSel is a CSS selector for elements to cull during article discovery.
	CruftSel string

	// BaseErrorThreshold is the starting number of HTTP errors to accept
	// before bailing out. The default is 5; 0 is treated as unset, so the
	// default is applied.
	// Error threshold formula: base + 10% of successful request count.
	BaseErrorThreshold int

	// HostPat is a regex matching accepted domains.
	// If empty, everything on a different domain is rejected.
	HostPat string

	// If NoStripQuery is set, article URLs won't have their query part removed.
	NoStripQuery bool

	// UserAgent is the User-Agent string to use in HTTP requests.
	UserAgent string
}

type LinkSet

// LinkSet is a set of URLs represented as a map keyed by url.URL values
// (not pointers). Map keys are compared with ==, so two URLs count as the
// same member only when every field, including the User pointer, is equal.
type LinkSet map[url.URL]bool

LinkSet is a thin map wrapper providing some set operations.

func (*LinkSet) Add

func (s *LinkSet) Add(link url.URL)

func (*LinkSet) Merge

func (s *LinkSet) Merge(other LinkSet)

Merge adds the contents of other into this set.

func (*LinkSet) Pop

func (s *LinkSet) Pop() url.URL

Pop removes and returns a single item from the set.

func (*LinkSet) Remove

func (s *LinkSet) Remove(link url.URL)

type Logger

// Logger is the minimal logging interface used for Discoverer's ErrorLog and
// InfoLog fields. Its single method has the same signature as the standard
// library's (*log.Logger).Printf, so a *log.Logger satisfies it.
type Logger interface {
	// Printf takes a fmt-style format string and its arguments.
	Printf(format string, v ...interface{})
}

type NullLogger

// NullLogger is an empty struct type that declares a Printf method with the
// Logger signature.
// NOTE(review): presumably it discards everything it is given; the method
// body is not visible here - confirm.
type NullLogger struct{}

func (NullLogger) Printf

func (l NullLogger) Printf(format string, v ...interface{})

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL