core

package
v2.0.18 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 23, 2024 License: GPL-3.0 Imports: 15 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// GeneralRegex is a regular-expression pattern whose single capture group
// matches an absolute http:// or https:// URL with a dotted host name.
var GeneralRegex = `((?:https?)://[\w\-]+(?:\.[\w\-]+)+[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])`

// HrefRegex is a regular-expression pattern whose single capture group holds
// the non-empty value of a single- or double-quoted href attribute.
var HrefRegex = `href=["']([^"']+)["']`
View Source
// UnreadableExtensions lists lower-case file extensions, each with its
// leading dot, for binary formats such as images, office documents,
// archives, media files and fonts.
// NOTE(review): presumably the crawler uses this list to skip URLs whose
// content cannot be scanned as text — confirm against the call sites.
var UnreadableExtensions = []string{
	// Raster images and PDF.
	".png", ".jpg", ".jpeg", ".gif", ".pdf",
	// Office documents.
	".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
	// Archives and executables.
	".zip", ".rar", ".tar", ".gz", ".exe",
	// Audio, video and Flash.
	".mp3", ".mp4", ".avi", ".mov", ".wmv", ".flv", ".wav", ".mpeg", ".mpg", ".m4v", ".swf",
	// Vector images and icons.
	".svg", ".ico",
	// Fonts.
	".ttf", ".woff", ".woff2", ".eot", ".otf",
	// Design and raw image files.
	".psd", ".ai", ".eps", ".indd", ".raw",
	// Further audio and video containers.
	".webm", ".m4a", ".m4p", ".m4b", ".m4r",
}

Functions

This section is empty.

Types

type Cache

// Cache pairs a map of string keys to booleans with a mutex.
// NOTE(review): Visited is presumably keyed by URL and Lock presumably
// guards it inside AddVisited, IsVisited and Flush — confirm against the
// method bodies, which are not shown here.
//
// Because Cache contains a sync.Mutex, it must not be copied after first
// use; pass *Cache instead.
type Cache struct {
	Visited map[string]bool
	Lock    sync.Mutex
}

func (*Cache) AddVisited

func (c *Cache) AddVisited(url string)

func (*Cache) Flush

func (c *Cache) Flush()

func (*Cache) IsVisited

func (c *Cache) IsVisited(url string) bool

type Crawler

// Crawler groups the settings and state used by the crawl methods: a root
// URL, a level, an export file name, a map of regex strings, a list of
// excluded status codes, a list of included URLs, an HTTP client, a
// User-Agent string, a Cache, a worker count and a delay.
//
// NOTE(review): the field meanings below are inferred from names only and
// must be confirmed against NewCrawler and Crawl:
//   - Level is presumably the maximum crawl depth.
//   - RegexMap presumably maps a label to a pattern string.
//   - Delay's unit (seconds, milliseconds, ...) is not visible here.
//
// Cache is held by value and contains a sync.Mutex, so a Crawler should not
// be copied after first use; pass *Crawler instead.
type Crawler struct {
	RootURL        string
	Level          int
	ExportFile     string
	RegexMap       map[string]string
	ExcludedStatus []int
	IncludedUrls   []string
	Client         *http.Client
	UserAgent      string
	Cache          Cache
	Workers        int
	Delay          int
}

func NewCrawler

func NewCrawler(options *shared.Options) *Crawler

func (*Crawler) AddMatches

func (c *Crawler) AddMatches(page *webtree.Page)

func (*Crawler) Crawl

func (c *Crawler) Crawl()

func (*Crawler) CrawlNodeBlock

func (c *Crawler) CrawlNodeBlock(w *webtree.Node, levelChangedChan chan int)

func (*Crawler) CrawlNodeLive

func (c *Crawler) CrawlNodeLive(w *webtree.Node)

func (*Crawler) Export

func (c *Crawler) Export(tree *webtree.Node, format string, filename string) error

func (*Crawler) ExportJSON

func (c *Crawler) ExportJSON(root *webtree.Node, filename string) error

func (*Crawler) ExportTXT

func (c *Crawler) ExportTXT(root *webtree.Node, filename string) error

func (*Crawler) ExportXML

func (c *Crawler) ExportXML(tree *webtree.Node, filename string) error

func (*Crawler) ExtractLinks

func (c *Crawler) ExtractLinks(page *webtree.Page) (links []string)

func (*Crawler) Fetch

func (c *Crawler) Fetch(page *webtree.Page)

func (*Crawler) IsSkipablePage

func (c *Crawler) IsSkipablePage(page *webtree.Page) bool

func (*Crawler) ProcessANode added in v2.0.14

func (c *Crawler) ProcessANode(node *webtree.Node)

func (*Crawler) SaveResults

func (c *Crawler) SaveResults(root *webtree.Node)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL