Documentation ¶
Constants ¶
const Label = "Webcrawler"
Variables ¶
var DefaultDownloadContentTypes = []string{
"application/pdf", "application/msword", "application/vnd.ms-excel", "vnd.ms-excel.addin.macroEnabled.12",
"vnd.ms-excel.sheet.binary.macroEnabled.12", "vnd.ms-excel.sheet.macroEnabled.12",
"vnd.ms-excel.template.macroEnabled.12", "application/vnd.ms-word.document.macroEnabled.12",
"vnd.ms-word.template.macroEnabled.12", "application/vnd.ms-word.template.macroEnabled.12",
}
var DefaultFollowContentTypes = []string{
"text/html", "text/plain", "text/javascript", "application/javascript", "application/json", "application/atom+xml",
"application/rss+xml", "application/xhtml+xml", "application/x-latex", "application/xml", "application/xml-dtd",
"application/x-sh", "application/x-tex", "application/x-texinfo", "text/cache-manifest", "text/calendar",
"text/css", "text/csv", "text/csv-schema", "text/directory", "text/dns", "text/ecmascript", "text/encaprtp",
"text/example", "text/fwdred", "text/grammar-ref-list", "text/jcr-cnd", "text/markdown", "text/mizar", "text/n3",
"text/parameters", "text/provenance-notation", "text/prs.fallenstein.rst", "text/prs.lines.tag", "text/raptorfec",
"text/RED", "text/rfc822-headers", "text/rtf", "text/rtp-enc-aescm128", "text/rtploopback", "text/rtx", "text/SGML",
"text/t140", "text/tab-separated-values", "text/troff", "text/turtle", "text/ulpfec", "text/uri-list", "text/vcard",
"text/vnd.abc", "text/vnd.debian.copyright", "text/vnd.DMClientScript", "text/vnd.dvb.subtitle",
"text/vnd.esmertec.theme-descriptor", "text/vnd.fly", "text/vnd.fmi.flexstor", "text/vnd.graphviz",
"text/vnd.in3d.3dml", "text/vnd.in3d.spot", "text/vnd.IPTC.NewsML", "text/vnd.IPTC.NITF", "text/vnd.latex-z",
"text/vnd.motorola.reflex", "text/vnd.ms-mediapackage", "text/vnd.net2phone.commcenter.command",
"text/vnd.radisys.msml-basic-layout", "text/vnd.si.uricatalogue", "text/vnd.sun.j2me.app-descriptor",
"text/vnd.trolltech.linguist", "text/vnd.wap.si", "text/vnd.wap.sl", "text/vnd.wap.wmlscript", "text/vnd.wap-wml",
"text/vnd-a", "text/vnd-curl", "text/xml", "text/xml-external-parsed-entity",
}
Functions ¶
func CheckSetup ¶
func CheckSetup() error
CheckSetup checks whether Setup() was executed as required. Scan arguments should be checked by the scanner itself.
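A minimal sketch of guarding scan execution with CheckSetup. Only the CheckSetup signature above is taken from this documentation; the import path is an assumption.

package main

import (
    "log"

    "example.com/goscans/webcrawler" // assumed import path, not part of this documentation
)

func main() {
    // Abort early if Setup() did not leave the module in a working state.
    if err := webcrawler.CheckSetup(); err != nil {
        log.Fatalf("webcrawler setup check failed: %v", err)
    }
}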
Types ¶
type CrawlResult ¶
type CrawlResult struct {
    Vhost               string // The vhost this content was discovered with
    Status              string
    FaviconHash         string
    AuthMethod          string // Authentication methods seen while crawling this target
    AuthSuccess         bool   // Authentication success, if authentication-required was discovered
    RequestsTotal       int    // Amount of HTTP requests in total (except www-authenticate round-trips)
    RequestsRedirect    int    // Amount of HTTP requests, where only the headers were read
    RequestsPartial     int    // Amount of HTTP requests, where only the headers were read
    RequestsComplete    int    // Amount of HTTP requests, where the full response was read
    DiscoveredVhosts    []string
    DiscoveredDownloads []string
    Pages               []*Page
}
type Crawler ¶
type Crawler struct {
// contains filtered or unexported fields
}
func NewCrawler ¶
func NewCrawler(
    logger utils.Logger,
    baseUrl url.URL,
    vhost string,
    https bool,
    depth int,
    followQS bool,
    storeRoot bool,
    download bool,
    downloadFolder string,
    ntlmDomain string,
    ntlmUser string,
    ntlmPassword string,
    userAgent string,
    proxy *url.URL,
    requestTimeout time.Duration,
    followTypes []string,
    downloadTypes []string,
    maxThreads int,
    deadline time.Time,
) (*Crawler, error)
func (*Crawler) Crawl ¶
func (c *Crawler) Crawl() *CrawlResult
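A minimal sketch wiring NewCrawler and Crawl together. The argument order follows the signature above; the import paths, the parameter values, and the runCrawl helper are illustrative assumptions, not part of this documentation.

import (
    "fmt"
    "net/url"
    "time"

    "example.com/goscans/utils"      // assumed import path for the utils.Logger interface
    "example.com/goscans/webcrawler" // assumed import path for this package
)

// runCrawl is a hypothetical helper; the logger must satisfy the utils.Logger
// interface expected by NewCrawler.
func runCrawl(logger utils.Logger) error {
    base, err := url.Parse("https://target.example.com/")
    if err != nil {
        return err
    }

    crawler, err := webcrawler.NewCrawler(
        logger,
        *base,                // baseUrl
        "target.example.com", // vhost
        true,                 // https
        3,                    // depth
        false,                // followQS
        true,                 // storeRoot
        false,                // download
        "",                   // downloadFolder (unused when download is false)
        "",                   // ntlmDomain
        "",                   // ntlmUser
        "",                   // ntlmPassword
        "Mozilla/5.0 (compatible; example)", // userAgent
        nil,                  // proxy
        10*time.Second,       // requestTimeout
        webcrawler.DefaultFollowContentTypes,   // followTypes
        webcrawler.DefaultDownloadContentTypes, // downloadTypes
        4,                    // maxThreads
        time.Now().Add(10*time.Minute), // deadline
    )
    if err != nil {
        return err
    }

    result := crawler.Crawl()
    for _, page := range result.Pages {
        fmt.Println(page.ResponseCode, page.Url)
    }
    return nil
}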
type Page ¶
type Page struct {
    Depth               int
    Url                 *url.URL
    RedirectUrl         string // Final URL the request was redirected to. Might be inside or outside of the original endpoint.
    RedirectCount       int    // Number of redirects that happened until the final URL was reached
    AuthMethod          string
    AuthSuccess         bool
    ResponseCode        int
    ResponseMessage     string
    ResponseContentType string
    ResponseHeaders     string
    ResponseEncoding    string // The encoding used to decode the response body returned by the server. Decided based on response headers, meta tags and trial and error.
    HtmlTitle           string
    HtmlContent         []byte   // Byte array, to be converted by the consumer as required
    RawLinks            []string // URLs found on that page
}
type Result ¶
type Result struct {
    Data      []*CrawlResult
    Status    string // Final scan status (success or graceful error). Should be stored along with the scan results.
    Exception bool   // Indicates if something went wrong badly and results shall be discarded. This should never be
}
type Scanner ¶
type Scanner struct {
    Label    string
    Started  time.Time
    Finished time.Time
    // contains filtered or unexported fields
}
func NewScanner ¶
func NewScanner(
    logger utils.Logger,
    target string,
    port int,
    vhosts []string,
    https bool,
    depth int,
    maxThreads int,
    followQS bool,
    storeRoot bool,
    download bool,
    outputFolder string,
    ntlmDomain string,
    ntlmUser string,
    ntlmPassword string,
    userAgent string,
    proxy string,
    requestTimeout time.Duration,
) (*Scanner, error)
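A minimal sketch of a NewScanner call, following the argument order in the signature above. The import paths, the helper name, and all parameter values are illustrative assumptions.

import (
    "time"

    "example.com/goscans/utils"      // assumed import path for the utils.Logger interface
    "example.com/goscans/webcrawler" // assumed import path for this package
)

// newWebcrawlerScanner is a hypothetical helper; all values are placeholders.
func newWebcrawlerScanner(logger utils.Logger) (*webcrawler.Scanner, error) {
    return webcrawler.NewScanner(
        logger,
        "192.0.2.10",                // target
        443,                         // port
        []string{"www.example.com"}, // vhosts
        true,                        // https
        3,                           // depth
        4,                           // maxThreads
        false,                       // followQS
        true,                        // storeRoot
        false,                       // download
        "",                          // outputFolder (unused when download is false)
        "",                          // ntlmDomain
        "",                          // ntlmUser
        "",                          // ntlmPassword
        "Mozilla/5.0 (compatible; example)", // userAgent
        "",             // proxy (empty string assumed to mean no proxy)
        10*time.Second, // requestTimeout
    )
}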
func (*Scanner) Run ¶
Run starts the scan execution. It must either be executed as a goroutine, or another thread must be actively listening on the scan's result channel, in order to avoid a deadlock.
func (*Scanner) SetDownloadContentTypes ¶
SetDownloadContentTypes sets a custom, non-default list of response content types to be downloaded while crawling a website.
func (*Scanner) SetFollowContentTypes ¶
SetFollowContentTypes sets a custom, non-default list of response content types to be followed while crawling a website.
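The setter signatures are not reproduced in this excerpt; the sketch below assumes each takes a []string and reuses the exported default lists as a starting point.

// configureContentTypes is a hypothetical helper; the setters are assumed to
// accept a []string, which this excerpt does not confirm.
func configureContentTypes(s *webcrawler.Scanner) {
    // Copy the exported defaults and extend them instead of replacing them outright.
    followTypes := append([]string{}, webcrawler.DefaultFollowContentTypes...)
    followTypes = append(followTypes, "application/wasm") // illustrative extra type

    s.SetFollowContentTypes(followTypes)
    s.SetDownloadContentTypes(webcrawler.DefaultDownloadContentTypes)
}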