webcrawler

package
v1.0.2
Published: Apr 18, 2024 License: MIT Imports: 19 Imported by: 0

Documentation

Constants

const Label = "Webcrawler"

Variables

var DefaultDownloadContentTypes = []string{
	"application/pdf", "application/msword", "application/vnd.ms-excel", "vnd.ms-excel.addin.macroEnabled.12",
	"vnd.ms-excel.sheet.binary.macroEnabled.12", "vnd.ms-excel.sheet.macroEnabled.12",
	"vnd.ms-excel.template.macroEnabled.12", "application/vnd.ms-word.document.macroEnabled.12",
	"vnd.ms-word.template.macroEnabled.12", "application/vnd.ms-word.template.macroEnabled.12",
}

var DefaultFollowContentTypes = []string{
	"text/html", "text/plain", "text/javascript", "application/javascript", "application/json", "application/atom+xml",
	"application/rss+xml", "application/xhtml+xml", "application/x-latex", "application/xml", "application/xml-dtd",
	"application/x-sh", "application/x-tex", "application/x-texinfo", "text/cache-manifest", "text/calendar",
	"text/css", "text/csv", "text/csv-schema", "text/directory", "text/dns", "text/ecmascript", "text/encaprtp",
	"text/example", "text/fwdred", "text/grammar-ref-list", "text/jcr-cnd", "text/markdown", "text/mizar", "text/n3",
	"text/parameters", "text/provenance-notation", "text/prs.fallenstein.rst", "text/prs.lines.tag", "text/raptorfec",
	"text/RED", "text/rfc822-headers", "text/rtf", "text/rtp-enc-aescm128", "text/rtploopback", "text/rtx", "text/SGML",
	"text/t140", "text/tab-separated-values", "text/troff", "text/turtle", "text/ulpfec", "text/uri-list", "text/vcard",
	"text/vnd.abc", "text/vnd.debian.copyright", "text/vnd.DMClientScript", "text/vnd.dvb.subtitle",
	"text/vnd.esmertec.theme-descriptor", "text/vnd.fly", "text/vnd.fmi.flexstor", "text/vnd.graphviz",
	"text/vnd.in3d.3dml", "text/vnd.in3d.spot", "text/vnd.IPTC.NewsML", "text/vnd.IPTC.NITF", "text/vnd.latex-z",
	"text/vnd.motorola.reflex", "text/vnd.ms-mediapackage", "text/vnd.net2phone.commcenter.command",
	"text/vnd.radisys.msml-basic-layout", "text/vnd.si.uricatalogue", "text/vnd.sun.j2me.app-descriptor",
	"text/vnd.trolltech.linguist", "text/vnd.wap.si", "text/vnd.wap.sl", "text/vnd.wap.wmlscript", "text/vnd.wap-wml",
	"text/vnd-a", "text/vnd-curl", "text/xml", "text/xml-external-parsed-entity",
}

Functions

func CheckSetup

func CheckSetup() error

CheckSetup verifies that Setup() executed successfully. Scan arguments should be validated by the scanner.

func Setup

func Setup(logger utils.Logger) error

Setup configures the environment as required, if the scan module has special requirements. A successful setup is required before a scan can be started.
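
A minimal sketch of the intended startup sequence follows. The initWebcrawler wrapper and the logger value are assumptions; supply any implementation of this module's utils.Logger interface.

var logger utils.Logger // supply any implementation of this module's Logger interface

func initWebcrawler() error {
	// Prepare the environment once, before any scan is started.
	if err := webcrawler.Setup(logger); err != nil {
		return err
	}
	// Verify the setup actually took effect before scans are created.
	return webcrawler.CheckSetup()
}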

Types

type CrawlResult

type CrawlResult struct {
	Vhost               string // The vhost this content was discovered with
	Status              string
	FaviconHash         string
	AuthMethod          string // Authentication methods seen while crawling this target
	AuthSuccess         bool   // Authentication success, if authentication-required was discovered
	RequestsTotal       int    // Total number of HTTP requests (excluding www-authenticate round-trips)
	RequestsRedirect    int    // Number of HTTP requests that were answered with a redirect
	RequestsPartial     int    // Number of HTTP requests where only the response headers were read
	RequestsComplete    int    // Number of HTTP requests where the full response was read
	DiscoveredVhosts    []string
	DiscoveredDownloads []string
	Pages               []*Page
}

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(
	logger utils.Logger,
	baseUrl url.URL,
	vhost string,
	https bool,
	depth int,
	followQS bool,
	storeRoot bool,
	download bool,
	downloadFolder string,
	ntlmDomain string,
	ntlmUser string,
	ntlmPassword string,
	userAgent string,
	proxy *url.URL,
	requestTimeout time.Duration,
	followTypes []string,
	downloadTypes []string,
	maxThreads int,
	deadline time.Time,
) (*Crawler, error)

func (*Crawler) Crawl

func (c *Crawler) Crawl() *CrawlResult
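
A usage sketch based on the signature above; all argument values are illustrative assumptions, and logger is any utils.Logger implementation as in the setup sketch:

base, err := url.Parse("https://www.example.com/")
if err != nil {
	log.Fatal(err)
}
crawler, err := webcrawler.NewCrawler(
	logger,                         // utils.Logger implementation
	*base,                          // baseUrl
	"www.example.com",              // vhost
	true,                           // https
	3,                              // depth
	false,                          // followQS
	true,                           // storeRoot
	false,                          // download
	"",                             // downloadFolder (unused while download is false)
	"", "", "",                     // ntlmDomain, ntlmUser, ntlmPassword
	"Mozilla/5.0",                  // userAgent
	nil,                            // proxy
	10*time.Second,                 // requestTimeout
	webcrawler.DefaultFollowContentTypes,
	webcrawler.DefaultDownloadContentTypes,
	4,                              // maxThreads
	time.Now().Add(30*time.Minute), // deadline
)
if err != nil {
	log.Fatal(err)
}
result := crawler.Crawl()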

type Page

type Page struct {
	Depth               int
	Url                 *url.URL
	RedirectUrl         string // Final URL the request was redirected to. May be inside or outside of the original endpoint.
	RedirectCount       int    // Number of redirects that happened until the final URL was reached
	AuthMethod          string
	AuthSuccess         bool
	ResponseCode        int
	ResponseMessage     string
	ResponseContentType string
	ResponseHeaders     string
	ResponseEncoding    string // The encoding used to decode the response body returned by the server. Decided based on response headers, meta tags and trial and error.
	HtmlTitle           string
	HtmlContent         []byte   // Raw byte array, to be converted by the consumer as required
	RawLinks            []string // URLs found on that page
}
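
Continuing the sketch above, the pages of a CrawlResult might be consumed like this (purely illustrative):

for _, page := range result.Pages {
	fmt.Printf("%d %s (%s)\n", page.ResponseCode, page.Url, page.ResponseContentType)
	if page.RedirectCount > 0 {
		fmt.Printf("  redirected %d time(s), final URL: %s\n", page.RedirectCount, page.RedirectUrl)
	}
	// HtmlContent holds raw bytes; ResponseEncoding names the encoding the
	// crawler decided on when decoding the response body.
	fmt.Printf("  title %q, %d bytes (%s), %d raw links\n",
		page.HtmlTitle, len(page.HtmlContent), page.ResponseEncoding, len(page.RawLinks))
}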

type Result

type Result struct {
	Data      []*CrawlResult
	Status    string // Final scan status (success or graceful error). Should be stored along with the scan results.
	Exception bool   // Indicates if something went wrong badly and results shall be discarded. This should never be the case.
}

type Scanner

type Scanner struct {
	Label    string
	Started  time.Time
	Finished time.Time
	// contains filtered or unexported fields
}

func NewScanner

func NewScanner(
	logger utils.Logger,
	target string,
	port int,
	vhosts []string,
	https bool,
	depth int,
	maxThreads int,
	followQS bool,
	storeRoot bool,
	download bool,
	outputFolder string,
	ntlmDomain string,
	ntlmUser string,
	ntlmPassword string,
	userAgent string,
	proxy string,
	requestTimeout time.Duration,
) (*Scanner, error)

func (*Scanner) Run

func (s *Scanner) Run(timeout time.Duration) (res *Result)

Run starts scan execution. It must either be executed as a goroutine, or another goroutine must be actively listening on the scan's result channel, in order to avoid a deadlock.
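
A sketch of the non-blocking pattern described above: execute Run in a goroutine and hand the Result back over a channel. All scanner arguments are illustrative assumptions:

scanner, err := webcrawler.NewScanner(
	logger,                      // utils.Logger implementation
	"www.example.com",           // target
	443,                         // port
	[]string{"www.example.com"}, // vhosts
	true,                        // https
	3,                           // depth
	4,                           // maxThreads
	false,                       // followQS
	true,                        // storeRoot
	false,                       // download
	"",                          // outputFolder (unused while download is false)
	"", "", "",                  // ntlmDomain, ntlmUser, ntlmPassword
	"Mozilla/5.0",               // userAgent
	"",                          // proxy (assumed: empty for none)
	10*time.Second,              // requestTimeout
)
if err != nil {
	log.Fatal(err)
}
results := make(chan *webcrawler.Result, 1)
go func() { results <- scanner.Run(30 * time.Minute) }()
res := <-results
if res.Exception {
	log.Fatalf("scan failed badly, results should be discarded (status %q)", res.Status)
}
fmt.Println("scan finished with status:", res.Status)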

func (*Scanner) SetDownloadContentTypes

func (s *Scanner) SetDownloadContentTypes(responseContentTypes []string) error

SetDownloadContentTypes allows setting a custom, non-default list of response content types to be downloaded while crawling a website.

func (*Scanner) SetFollowContentTypes

func (s *Scanner) SetFollowContentTypes(responseContentTypes []string) error

SetFollowContentTypes allows setting a custom, non-default list of response content types to be followed while crawling a website.
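
Both setters would presumably be applied between NewScanner and Run; a brief sketch with arbitrarily narrowed lists:

// Follow only HTML and JSON responses instead of the long default list.
if err := scanner.SetFollowContentTypes([]string{"text/html", "application/json"}); err != nil {
	log.Fatal(err)
}
// Restrict downloads to PDF documents.
if err := scanner.SetDownloadContentTypes([]string{"application/pdf"}); err != nil {
	log.Fatal(err)
}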
