webcrawler

package
v1.0.2
Published: Apr 18, 2024 License: MIT Imports: 19 Imported by: 0

Documentation

Constants

const Label = "Webcrawler"

Variables

var DefaultDownloadContentTypes = []string{
	"application/pdf", "application/msword", "application/vnd.ms-excel", "vnd.ms-excel.addin.macroEnabled.12",
	"vnd.ms-excel.sheet.binary.macroEnabled.12", "vnd.ms-excel.sheet.macroEnabled.12",
	"vnd.ms-excel.template.macroEnabled.12", "application/vnd.ms-word.document.macroEnabled.12",
	"vnd.ms-word.template.macroEnabled.12", "application/vnd.ms-word.template.macroEnabled.12",
}

var DefaultFollowContentTypes = []string{
	"text/html", "text/plain", "text/javascript", "application/javascript", "application/json", "application/atom+xml",
	"application/rss+xml", "application/xhtml+xml", "application/x-latex", "application/xml", "application/xml-dtd",
	"application/x-sh", "application/x-tex", "application/x-texinfo", "text/cache-manifest", "text/calendar",
	"text/css", "text/csv", "text/csv-schema", "text/directory", "text/dns", "text/ecmascript", "text/encaprtp",
	"text/example", "text/fwdred", "text/grammar-ref-list", "text/jcr-cnd", "text/markdown", "text/mizar", "text/n3",
	"text/parameters", "text/provenance-notation", "text/prs.fallenstein.rst", "text/prs.lines.tag", "text/raptorfec",
	"text/RED", "text/rfc822-headers", "text/rtf", "text/rtp-enc-aescm128", "text/rtploopback", "text/rtx", "text/SGML",
	"text/t140", "text/tab-separated-values", "text/troff", "text/turtle", "text/ulpfec", "text/uri-list", "text/vcard",
	"text/vnd.abc", "text/vnd.debian.copyright", "text/vnd.DMClientScript", "text/vnd.dvb.subtitle",
	"text/vnd.esmertec.theme-descriptor", "text/vnd.fly", "text/vnd.fmi.flexstor", "text/vnd.graphviz",
	"text/vnd.in3d.3dml", "text/vnd.in3d.spot", "text/vnd.IPTC.NewsML", "text/vnd.IPTC.NITF", "text/vnd.latex-z",
	"text/vnd.motorola.reflex", "text/vnd.ms-mediapackage", "text/vnd.net2phone.commcenter.command",
	"text/vnd.radisys.msml-basic-layout", "text/vnd.si.uricatalogue", "text/vnd.sun.j2me.app-descriptor",
	"text/vnd.trolltech.linguist", "text/vnd.wap.si", "text/vnd.wap.sl", "text/vnd.wap.wmlscript", "text/vnd.wap-wml",
	"text/vnd-a", "text/vnd-curl", "text/xml", "text/xml-external-parsed-entity",
}

Functions

func CheckSetup

func CheckSetup() error

CheckSetup verifies that Setup() executed successfully. Scan arguments should be validated by the scanner.

func Setup

func Setup(logger utils.Logger) error

Setup configures the environment as required, if the scan module has special requirements. A successful setup is required before a scan can be started.
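
A minimal sketch of the intended startup sequence follows. The initWebcrawler wrapper and the logger value are assumptions; supply any implementation of this module's utils.Logger interface.

var logger utils.Logger // supply any implementation of this module's Logger interface

func initWebcrawler() error {
	// Prepare the environment once, before any scan is started.
	if err := webcrawler.Setup(logger); err != nil {
		return err
	}
	// Verify the setup actually took effect before scans are created.
	return webcrawler.CheckSetup()
}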

Types

type CrawlResult

type CrawlResult struct {
	Vhost               string // The vhost this content was discovered with
	Status              string
	FaviconHash         string
	AuthMethod          string // Authentication methods seen while crawling this target
	AuthSuccess         bool   // Authentication success, if authentication-required was discovered
	RequestsTotal       int    // Total number of HTTP requests (excluding www-authenticate round-trips)
	RequestsRedirect    int    // Number of HTTP requests that were answered with a redirect
	RequestsPartial     int    // Number of HTTP requests where only the response headers were read
	RequestsComplete    int    // Number of HTTP requests where the full response was read
	DiscoveredVhosts    []string
	DiscoveredDownloads []string
	Pages               []*Page
}

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(
	logger utils.Logger,
	baseUrl url.URL,
	vhost string,
	https bool,
	depth int,
	followQS bool,
	storeRoot bool,
	download bool,
	downloadFolder string,
	ntlmDomain string,
	ntlmUser string,
	ntlmPassword string,
	userAgent string,
	proxy *url.URL,
	requestTimeout time.Duration,
	followTypes []string,
	downloadTypes []string,
	maxThreads int,
	deadline time.Time,
) (*Crawler, error)

func (*Crawler) Crawl

func (c *Crawler) Crawl() *CrawlResult
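
A usage sketch based on the signature above; all argument values are illustrative assumptions, and logger is any utils.Logger implementation as in the setup sketch:

base, err := url.Parse("https://www.example.com/")
if err != nil {
	log.Fatal(err)
}
crawler, err := webcrawler.NewCrawler(
	logger,                         // utils.Logger implementation
	*base,                          // baseUrl
	"www.example.com",              // vhost
	true,                           // https
	3,                              // depth
	false,                          // followQS
	true,                           // storeRoot
	false,                          // download
	"",                             // downloadFolder (unused while download is false)
	"", "", "",                     // ntlmDomain, ntlmUser, ntlmPassword
	"Mozilla/5.0",                  // userAgent
	nil,                            // proxy
	10*time.Second,                 // requestTimeout
	webcrawler.DefaultFollowContentTypes,
	webcrawler.DefaultDownloadContentTypes,
	4,                              // maxThreads
	time.Now().Add(30*time.Minute), // deadline
)
if err != nil {
	log.Fatal(err)
}
result := crawler.Crawl()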

type Page

type Page struct {
	Depth               int
	Url                 *url.URL
	RedirectUrl         string // Final URL the request was redirected to. May be inside or outside of the original endpoint.
	RedirectCount       int    // Number of redirects that happened until the final URL was reached
	AuthMethod          string
	AuthSuccess         bool
	ResponseCode        int
	ResponseMessage     string
	ResponseContentType string
	ResponseHeaders     string
	ResponseEncoding    string // The encoding used to decode the response body returned by the server. Decided based on response headers, meta tags and trial and error.
	HtmlTitle           string
	HtmlContent         []byte   // Raw byte array, to be converted by the consumer as required
	RawLinks            []string // URLs found on that page
}
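
Continuing the sketch above, the pages of a CrawlResult might be consumed like this (purely illustrative):

for _, page := range result.Pages {
	fmt.Printf("%d %s (%s)\n", page.ResponseCode, page.Url, page.ResponseContentType)
	if page.RedirectCount > 0 {
		fmt.Printf("  redirected %d time(s), final URL: %s\n", page.RedirectCount, page.RedirectUrl)
	}
	// HtmlContent holds raw bytes; ResponseEncoding names the encoding the
	// crawler decided on when decoding the response body.
	fmt.Printf("  title %q, %d bytes (%s), %d raw links\n",
		page.HtmlTitle, len(page.HtmlContent), page.ResponseEncoding, len(page.RawLinks))
}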

type Result

type Result struct {
	Data      []*CrawlResult
	Status    string // Final scan status (success or graceful error). Should be stored along with the scan results.
	Exception bool   // Indicates if something went wrong badly and results shall be discarded. This should never be the case.
}

type Scanner

type Scanner struct {
	Label    string
	Started  time.Time
	Finished time.Time
	// contains filtered or unexported fields
}

func NewScanner

func NewScanner(
	logger utils.Logger,
	target string,
	port int,
	vhosts []string,
	https bool,
	depth int,
	maxThreads int,
	followQS bool,
	storeRoot bool,
	download bool,
	outputFolder string,
	ntlmDomain string,
	ntlmUser string,
	ntlmPassword string,
	userAgent string,
	proxy string,
	requestTimeout time.Duration,
) (*Scanner, error)

func (*Scanner) Run

func (s *Scanner) Run(timeout time.Duration) (res *Result)

Run starts scan execution. It must either be executed as a goroutine, or another goroutine must be actively listening on the scan's result channel, in order to avoid a deadlock.
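
A sketch of the non-blocking pattern described above: execute Run in a goroutine and hand the Result back over a channel. All scanner arguments are illustrative assumptions:

scanner, err := webcrawler.NewScanner(
	logger,                      // utils.Logger implementation
	"www.example.com",           // target
	443,                         // port
	[]string{"www.example.com"}, // vhosts
	true,                        // https
	3,                           // depth
	4,                           // maxThreads
	false,                       // followQS
	true,                        // storeRoot
	false,                       // download
	"",                          // outputFolder (unused while download is false)
	"", "", "",                  // ntlmDomain, ntlmUser, ntlmPassword
	"Mozilla/5.0",               // userAgent
	"",                          // proxy (assumed: empty for none)
	10*time.Second,              // requestTimeout
)
if err != nil {
	log.Fatal(err)
}
results := make(chan *webcrawler.Result, 1)
go func() { results <- scanner.Run(30 * time.Minute) }()
res := <-results
if res.Exception {
	log.Fatalf("scan failed badly, results should be discarded (status %q)", res.Status)
}
fmt.Println("scan finished with status:", res.Status)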

func (*Scanner) SetDownloadContentTypes

func (s *Scanner) SetDownloadContentTypes(responseContentTypes []string) error

SetDownloadContentTypes allows setting a custom, non-default list of response content types to be downloaded while crawling a website.

func (*Scanner) SetFollowContentTypes

func (s *Scanner) SetFollowContentTypes(responseContentTypes []string) error

SetFollowContentTypes allows setting a custom, non-default list of response content types to be followed while crawling a website.
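
Both setters would presumably be applied between NewScanner and Run; a brief sketch with arbitrarily narrowed lists:

// Follow only HTML and JSON responses instead of the long default list.
if err := scanner.SetFollowContentTypes([]string{"text/html", "application/json"}); err != nil {
	log.Fatal(err)
}
// Restrict downloads to PDF documents.
if err := scanner.SetDownloadContentTypes([]string{"application/pdf"}); err != nil {
	log.Fatal(err)
}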
