base

package
v0.0.0-...-1d26606 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 11, 2022 License: MIT Imports: 12 Imported by: 1

Documentation

Overview

This package contains the base/root components of MIDA. Other MIDA packages import this package, but this package should not depend on any other MIDA packages.

Index

Constants

View Source
// Default values for MIDA output, browser, task-completion, interaction,
// and data-gathering parameters.
const (
	// Output Parameters
	DefaultLocalOutputPath        = "results"
	DefaultPostQueue              = ""
	DefaultResourceSubdir         = "resources"
	DefaultScriptSubdir           = "scripts"
	DefaultCoverageSubdir         = "coverage"
	DefaultScreenshotFileName     = "screenshot.png"
	DefaultCookieFileName         = "cookies.json"
	DefaultDomFileName            = "dom.json"
	DefaultMetadataFile           = "metadata.json"
	DefaultCovBVFileName          = "coverage.bv"
	DefaultCovTreeSummaryFileName = "cov_tree.csv"
	DefaultResourceMetadataFile   = "resource_metadata.json"
	DefaultScriptMetadataFile     = "script_metadata.json"
	// NOTE(review): Go's os package does not expand "~"; presumably the
	// consumer of this value resolves the home directory -- confirm.
	DefaultSftpPrivKeyFile        = "~/.ssh/id_rsa"
	DefaultTaskLogFile            = "task.log"

	// NOTE(review): units for DefaultNavTimeout are not stated here;
	// presumably seconds, like DefaultTimeout -- confirm.
	DefaultNavTimeout           = 30 // How long to wait when connecting to a web server
	DefaultSSHBackoffMultiplier = 5  // Exponential increase in time between tries when connecting for SFTP storage
	DefaultTaskPriority         = 5  // Queue priority when creating new tasks -- Value should be 1-10

	DefaultEventChannelBufferSize = 10000

	// Browser-Related Parameters
	DefaultOSXChromePath       = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
	DefaultOSXChromiumPath     = "/Applications/Chromium.app/Contents/MacOS/Chromium"
	DefaultLinuxChromePath     = "/usr/bin/google-chrome-stable"
	DefaultLinuxChromiumPath   = "/usr/bin/chromium-browser"
	DefaultWindowsChromePath   = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
	// NOTE(review): this value starts with a backslash and contains the
	// unexpanded %LocalAppData% variable; presumably the consumer expands
	// it before use -- confirm.
	DefaultWindowsChromiumPath = "\\%LocalAppData%\\chromium\\Application\\chrome.exe"
	DefaultHeadless            = false

	// RawTask completion
	// NOTE(review): DefaultTimeAfterLoad does not state its units;
	// presumably seconds, like DefaultTimeout -- confirm.
	DefaultTimeAfterLoad       = 5  // Default time to stay on a page after load event is fired (in TimeAfterLoad mode)
	DefaultTimeout             = 10 // Default time (in seconds) to remain on a page before exiting browser
	DefaultCompletionCondition = TimeoutOnly

	// Default Interaction Settings
	DefaultNavLockAfterLoad      = true
	DefaultBasicInteraction      = false
	DefaultGremlins              = false
	DefaultTriggerEventListeners = false

	// Defaults for data gathering settings
	DefaultAllResources     = true
	DefaultAllScripts       = false
	DefaultCookies          = true
	DefaultDOM              = false
	DefaultResourceMetadata = true
	DefaultScreenshot       = true
	DefaultScriptMetadata   = false
	DefaultBrowserCoverage  = false
	DefaultRawCovFiles      = false
	DefaultCovTxtFile       = false
	DefaultCovTreeSummary   = false

	DefaultShuffle = true // Whether to shuffle order of task processing

	DefaultProtocolPrefix = "https://" // If no protocol is provided, we use https for the crawl
)

Variables

View Source
var (

	// Flags we apply by default to Chrome/Chromium-based browsers.
	// BrowserSettings documents fields for adding to, removing from, or
	// overriding these default flags.
	DefaultChromiumBrowserFlags = []string{
		"--enable-features=NetworkService",
		"--disable-background-networking",
		"--disable-background-timer-throttling",
		"--disable-backgrounding-occluded-windows",
		"--disable-client-side-phishing-detection",
		"--disable-extensions",
		"--disable-features=IsolateOrigins,site-per-process",
		"--disable-hang-monitor",
		"--disable-ipc-flooding-protection",
		"--disable-infobars",
		"--disable-popup-blocking",
		"--disable-prompt-on-repost",
		"--disable-renderer-backgrounding",
		"--disable-sync",
		"--disk-cache-size=0",
		"--incognito",
		"--new-window",
		"--no-default-browser-check",
		"--no-first-run",
		"--no-sandbox",
		"--safebrowsing-disable-auto-update",
	}
)

Functions

func WriteCompressedTaskSetToBytes

func WriteCompressedTaskSetToBytes(tasks CompressedTaskSet) ([]byte, error)

WriteCompressedTaskSetToBytes takes a CompressedTaskSet and converts it to corresponding JSON bytes to transfer somewhere.

func WriteCompressedTaskSetToFile

func WriteCompressedTaskSetToFile(cts *CompressedTaskSet, filename string, overwrite bool) error

WriteCompressedTaskSetToFile takes a CompressedTaskSet and writes a JSON representation of it out to a file

func WriteTaskSliceToBytes

func WriteTaskSliceToBytes(tasks []RawTask) ([]byte, error)

WriteTaskSliceToBytes takes a slice of tasks and converts it to corresponding JSON bytes to transfer somewhere.

func WriteTaskSliceToFile

func WriteTaskSliceToFile(tasks []RawTask, filename string) error

WriteTaskSliceToFile takes a RawTask slice and writes it out as a JSON file to a given filename.

Types

type BrowserCoverageMetadata

// BrowserCoverageMetadata carries browser code-coverage metadata and is
// serialized to JSON using the keys in the struct tags below.
type BrowserCoverageMetadata struct {
	RawCoverageFilenames []string `json:"raw_coverage_filenames"` // Names of the raw coverage files
	TotalRegions         int      `json:"total_regions"`          // Total region count -- presumably code regions; confirm
	CoveredRegions       int      `json:"covered_regions"`        // Covered region count (tag previously had a stray trailing quote)
}

type BrowserSettings

// BrowserSettings describes the way in which a browser will be opened.
// NOTE(review): every field is a pointer; presumably nil means "not
// specified" so that a default can be applied -- confirm against the consumer.
type BrowserSettings struct {
	BrowserBinary       *string              `json:"browser_binary,omitempty"`       // The binary for the browser (e.g., "/path/to/chrome.exe")
	UserDataDirectory   *string              `json:"user_data_directory,omitempty"`  // Path to user data directory to use
	AddBrowserFlags     *[]string            `json:"add_browser_flags,omitempty"`    // Flags to be added to default browser flags
	RemoveBrowserFlags  *[]string            `json:"remove_browser_flags,omitempty"` // Flags to be removed from default browser flags
	SetBrowserFlags     *[]string            `json:"set_browser_flags,omitempty"`    // Flags to use to override default browser flags
	Extensions          *[]string            `json:"extensions,omitempty"`           // Paths to browser extensions to be used for the crawl
	InteractionSettings *InteractionSettings `json:"interaction_settings"`           // Settings describing how the browser will interact with the page
}

Settings describing the way in which a browser will be opened

func AllocateNewBrowserSettings

func AllocateNewBrowserSettings() *BrowserSettings

AllocateNewBrowserSettings allocates a new BrowserSettings struct, initializing everything to zero values

type CompletionCondition

// CompletionCondition names a condition under which a crawl will complete
// successfully. It is a string type, so values serialize as their names.
type CompletionCondition string

Conditions under which a crawl will complete successfully

// Defined CompletionCondition values.
const (
	TimeoutOnly   CompletionCondition = "TimeoutOnly"   // Complete only when the timeout is reached
	TimeAfterLoad CompletionCondition = "TimeAfterLoad" // Wait a given number of seconds after the load event
	LoadEvent     CompletionCondition = "LoadEvent"     // Terminate crawl immediately when load event fires
)

type CompletionSettings

// CompletionSettings describes how a particular crawl will terminate.
// NOTE(review): Timeout and TimeAfterLoad are presumably in seconds, matching
// the DefaultTimeout comment -- confirm.
type CompletionSettings struct {
	CompletionCondition *CompletionCondition `json:"completion_condition"`      // Condition under which crawl will complete
	Timeout             *int                 `json:"timeout,omitempty"`         // Maximum amount of time the browser will remain open
	TimeAfterLoad       *int                 `json:"time_after_load,omitempty"` // Maximum amount of time the browser will remain open after page load
}

Settings describing how a particular crawl will terminate

func AllocateNewCompletionSettings

func AllocateNewCompletionSettings() *CompletionSettings

AllocateNewCompletionSettings allocates a new CompletionSettings struct, initializing everything to zero values

type CompressedTaskSet

// CompressedTaskSet is a grouping of tasks for multiple URLs that may be
// repeated. It holds one list of URLs plus a single set of browser,
// completion, data, and output settings.
type CompressedTaskSet struct {
	URL *[]string `json:"url"` // List of URLs to be visited

	Browser    *BrowserSettings    `json:"browser_settings"`    // Settings for launching the browser
	Completion *CompletionSettings `json:"completion_settings"` // Settings for when the site visit will complete
	Data       *DataSettings       `json:"data_settings"`       // Settings for what data will be collected from the site
	Output     *OutputSettings     `json:"output_settings"`     // Settings for what/how results will be saved

	Repeat *int `json:"repeat"` // Number of times to repeat the crawl after it finishes successfully
}

A grouping of tasks for multiple URLs that may be repeated

func AllocateNewCompressedTaskSet

func AllocateNewCompressedTaskSet() *CompressedTaskSet

type CrawlerInfo

// CrawlerInfo holds information about the infrastructure used to perform
// the crawl.
type CrawlerInfo struct {
	Browser        string `json:"browser"`         // Name of the browser itself
	BrowserVersion string `json:"browser_version"` // Version of the browser we are using
	UserAgent      string `json:"user_agent"`      // User agent we are using
	JSVersion      string `json:"js_version"`      // JS version
}

Information about the infrastructure used to perform the crawl

type DTResource

// DTResource groups the DevTools request-sent events with the single
// response-received event recorded for one resource.
type DTResource struct {
	Requests []*network.EventRequestWillBeSent `json:"requests"`  // All requests sent for this particular request
	Response *network.EventResponseReceived    `json:"responses"` // The response received for this particular request (one event, although the JSON key is plural)
}

type DataSettings

// DataSettings describes which data MIDA will capture from the crawl.
type DataSettings struct {
	AllResources     *bool `json:"all_resources,omitempty"`     // Save all resource files
	AllScripts       *bool `json:"all_scripts,omitempty"`       // Save all scripts parsed by browser
	Cookies          *bool `json:"cookies,omitempty"`           // Save cookies set by page
	DOM              *bool `json:"dom,omitempty"`               // Collect JSON representation of the DOM
	ResourceMetadata *bool `json:"resource_metadata,omitempty"` // Save extensive metadata about each resource
	Screenshot       *bool `json:"screenshot,omitempty"`        // Save a screenshot from the web page
	ScriptMetadata   *bool `json:"script_metadata,omitempty"`   // Save metadata on scripts parsed by browser

	// Coverage-related settings.
	// NOTE(review): unlike the fields above, these tags have no omitempty,
	// so nil values are written as null -- confirm this is intentional.
	BrowserCoverage *bool `json:"browser_coverage"` // Whether to gather code coverage data from the browser
	RawCovFiles     *bool `json:"raw_cov_files"`    // Raw profraw files from browser
	CovTxtFile      *bool `json:"cov_txt_file"`     // llvm-cov-custom generated text file containing coverage
	CovTreeSummary  *bool `json:"cov_tree_summary"` // CSV summary of code coverage for file tree
}

Settings describing which data MIDA will capture from the crawl

func AllocateNewDataSettings

func AllocateNewDataSettings() *DataSettings

AllocateNewDataSettings allocates a new DataSettings struct, initializing everything to zero values

type DevToolsNetworkRawData

// DevToolsNetworkRawData holds raw DevTools network events in string-keyed maps.
// NOTE(review): the keys are presumably DevTools request IDs -- confirm.
type DevToolsNetworkRawData struct {
	RequestWillBeSent map[string][]*network.EventRequestWillBeSent // Request-sent events; several may share one key
	ResponseReceived  map[string]*network.EventResponseReceived // At most one response-received event per key
}

type DevToolsRawData

// DevToolsRawData aggregates the raw network, cookie, DOM, and script data
// gathered from a DevTools site visit.
type DevToolsRawData struct {
	Network DevToolsNetworkRawData // Raw network request/response events
	Cookies []*network.Cookie // Cookies reported by DevTools
	DOM     *cdp.Node // DOM node reported by DevTools
	Scripts DevToolsScriptRawData // Script-parsed events
}

type DevToolsScriptRawData

// DevToolsScriptRawData is a list of DevTools script-parsed events.
type DevToolsScriptRawData []*debugger.EventScriptParsed

type FinalResult

// FinalResult bundles a task's summary with the cookies, DOM, resource
// metadata, and script metadata collected via DevTools, in JSON-serializable
// form. Note that Summary is serialized under the key "stats".
type FinalResult struct {
	Summary            TaskSummary                            `json:"stats"`   // Statistics on timing and resource usage for the crawl
	DTCookies          []*network.Cookie                      `json:"cookies"` // Cookies collected from DevTools protocol
	DTDOM              *cdp.Node                              `json:"dom"` // DOM node collected from DevTools protocol
	DTResourceMetadata map[string]DTResource                  `json:"resource_metadata"` // Metadata on each resource loaded
	DTScriptMetadata   map[string]*debugger.EventScriptParsed `json:"script_metadata"`   // Metadata on each script parsed
}

type InteractionSettings

// InteractionSettings describes how MIDA will interact with a page.
// NOTE(review): the meaning of each flag is not documented here; the
// matching defaults appear to be DefaultNavLockAfterLoad,
// DefaultBasicInteraction, DefaultGremlins, and DefaultTriggerEventListeners
// -- confirm.
type InteractionSettings struct {
	LockNavigation        *bool `json:"lock_navigation"`
	BasicInteraction      *bool `json:"basic_interaction"`
	Gremlins              *bool `json:"gremlins"`
	TriggerEventListeners *bool `json:"event_listeners"` // JSON key differs from the field name
}

Settings describing how MIDA will interact with a page

func AllocateNewInteractionSettings

func AllocateNewInteractionSettings() *InteractionSettings

AllocateNewInteractionSettings allocates a new InteractionSettings struct specifying if/how the browser will interact with pages it visits as part of the task

type LocalOutputSettings

// LocalOutputSettings describes output of results to the local filesystem.
// All fields are optional in JSON and are omitted when nil.
type LocalOutputSettings struct {
	Enable *bool         `json:"enable,omitempty"`        // Whether this storage method is enabled (tag option was misspelled "omitmepty", which encoding/json ignores)
	Path   *string       `json:"path,omitempty"`          // Path of the overarching results directory to be written
	DS     *DataSettings `json:"data_settings,omitempty"` // Data settings for output to local filesystem
}

Settings describing output of results to the local filesystem

func AllocateNewLocalOutputSettings

func AllocateNewLocalOutputSettings() *LocalOutputSettings

type OutputSettings

// OutputSettings is an aggregation of the output settings for a task or
// task-set.
type OutputSettings struct {
	LocalOut  *LocalOutputSettings `json:"local_output_settings,omitempty"` // Output settings for the local filesystem
	SftpOut   *SftpOutputSettings  `json:"sftp_output_settings,omitempty"`  // Output settings for the remote filesystem
	PostQueue *string              `json:"post_queue,omitempty"`            // AMQP queue in which we should put metadata for crawl once complete
}

An aggregation of the output settings for a task or task-set

func AllocateNewOutputSettings

func AllocateNewOutputSettings() *OutputSettings

AllocateNewOutputSettings allocates a new OutputSettings struct, initializing everything to zero values

type RawResult

// RawResult holds the results MIDA gathers before they are post-processed.
// It embeds a sync.Mutex, so a RawResult must not be copied after first use;
// pass it by pointer.
type RawResult struct {
	TaskSummary TaskSummary     // Summary information about the task, not necessarily complete in RawResult
	DevTools    DevToolsRawData // Struct Containing Raw Data gathered from a DevTools site visit
	sync.Mutex // Embedded lock; presumably guards concurrent updates to the fields above -- confirm
}

The results MIDA gathers before they are post-processed

type RawTask

// RawTask is a raw MIDA task. This is the struct that is read from/written
// to file when tasks are stored as JSON.
type RawTask struct {
	URL *string `json:"url"` // The URL to be visited

	Browser    *BrowserSettings    `json:"browser_settings"`    // Settings for launching the browser
	Completion *CompletionSettings `json:"completion_settings"` // Settings for when the site visit will complete
	Data       *DataSettings       `json:"data_settings"`       // Settings for what data will be collected from the site
	Output     *OutputSettings     `json:"output_settings"`     // Settings for what/how results will be saved
}

A raw MIDA task. This is the struct that is read from/written to file when tasks are stored as JSON.

func AllocateNewTask

func AllocateNewTask() *RawTask

AllocateNewTask allocates a new RawTask struct, initializing everything to zero values

func ExpandCompressedTaskSet

func ExpandCompressedTaskSet(ts CompressedTaskSet) []RawTask

ExpandCompressedTaskSet takes a CompressedTaskSet object and converts it into a slice of regular Tasks.

func ReadTasksFromBytes

func ReadTasksFromBytes(data []byte) ([]RawTask, error)

ReadTasksFromBytes reads in tasks from a byte array. It will read them whether they are formatted as individual tasks or as a CompressedTaskSet.

func ReadTasksFromFile

func ReadTasksFromFile(filename string) ([]RawTask, error)

ReadTasksFromFile is a wrapper function that reads single tasks, full task sets, or compressed task sets from file.

type SanitizedTask

// SanitizedTask is an internal type built from the process of sanitizing a
// RawTask. It should contain all the parameters needed for a crawl without
// the need to re-access the raw task, and should not contain information
// that cannot be deduced from the raw task (and system parameters).
type SanitizedTask struct {
	URL string // URL for the task

	BrowserBinaryPath string   // Full path to the browser binary we use for the crawl
	BrowserFlags      []string // List of flags we will use when opening the browser (does not include --remote-debugging-port or similar)
	UserDataDirectory string   // Full path to the user data directory for the task

	CS  CompletionSettings  // Task completion settings for the task
	DS  DataSettings        // Data Gathering Settings for the task
	IS  InteractionSettings // Settings on how the browser will interact with the page
	OPS OutputSettings      // Output settings for the task
}

Internal type built from the process of sanitizing a RawTask. Should contain all the parameters needed for a crawl without the need to re-access the raw task. SanitizedTask should not contain information that cannot be deduced based on the raw task (and system parameters).

type SftpOutputSettings

// SftpOutputSettings describes results output via SSH/SFTP.
// All fields are optional in JSON and are omitted when nil.
type SftpOutputSettings struct {
	Enable         *bool         `json:"enable,omitempty"`           // Whether this storage method is enabled
	Host           *string       `json:"host,omitempty"`             // IP address or domain name of host to store to
	Port           *int          `json:"port,omitempty"`             // Port to initiate SSH/SFTP connection
	Path           *string       `json:"path,omitempty"`             // Path of the overarching results directory to be written
	UserName       *string       `json:"user_name,omitempty"`        // User name we should use for accessing the host
	PrivateKeyFile *string       `json:"private_key_file,omitempty"` // Path to the private key file we should use for accessing the host
	DS             *DataSettings `json:"data_settings,omitempty"`    // Data settings for output via SSH/SFTP
}

Settings describing results output via SSH/SFTP

func AllocateNewSftpOutputSettings

func AllocateNewSftpOutputSettings() *SftpOutputSettings

type TaskSet

// TaskSet is a slice of MIDA tasks, ready to be enqueued.
type TaskSet []RawTask

A slice of MIDA tasks, ready to be enqueued

type TaskSummary

// TaskSummary holds statistics gathered about a specific task.
type TaskSummary struct {
	NavURL string `json:"nav_url"` // NOTE(review): presumably the URL navigated to -- confirm
	UUID   string `json:"uuid"`    // Task identifier in string form

	Success       bool   `json:"success"`                  // True if the task did not fail
	FailureReason string `json:"failure_reason,omitempty"` // Holds the failure code for the task

	TaskWrapper *TaskWrapper `json:"-"`            // Wrapper containing the full task (excluded from JSON)
	TaskTiming  TaskTiming   `json:"task_timing"`  // Timing data for the task
	CrawlerInfo CrawlerInfo  `json:"crawler_info"` // Information about the infrastructure used to visit the site

	OutputHost string `json:"output_host,omitempty"` // Host to which results were stored via SFTP
	OutputPath string `json:"output_path,omitempty"` // Path to the results of the crawl on the applicable host (after storage)

	NumResources int `json:"num_resources"` // Number of resources the browser downloaded
	NumScripts   int `json:"num_scripts"`   // Number of scripts the browser parsed

	NavHistory []page.NavigationEntry `json:"nav_history"` // Navigation history entries

	BrowserCovData BrowserCoverageMetadata `json:"browser_cov_data"` // Browser code-coverage metadata
}

Statistics gathered about a specific task

type TaskTiming

// TaskTiming contains timing data for the processing of a particular task.
// Each field is a timestamp; the field names suggest the stage it marks.
type TaskTiming struct {
	BrowserOpen           time.Time `json:"browser_open"`
	ConnectionEstablished time.Time `json:"connection_established"`
	LoadEvent             time.Time `json:"load_event"`
	BrowserClose          time.Time `json:"browser_close"`
	BeginPostprocess      time.Time `json:"begin_postprocess"`
	BeginStorage          time.Time `json:"begin_storage"`
	EndPostprocess        time.Time `json:"end_postprocess"`
	// NOTE(review): EndStorage is the only field excluded from JSON;
	// presumably because metadata is written before storage ends -- confirm.
	EndStorage            time.Time `json:"-"`
}

TaskTiming contains timing data for the processing of a particular task

type TaskWrapper

// TaskWrapper is a wrapper struct which contains a task, along with some
// dynamic metadata. This is an internal struct only -- it should not be
// exported/stored.
type TaskWrapper struct {
	RawTask       RawTask       // The raw MIDA task (held by value, not by pointer)
	SanitizedTask SanitizedTask // A sanitized MIDA task

	UUID    uuid.UUID // Identifier for the task
	TempDir string // Temporary directory where results are stored. Can be the same as the UserDataDir in some cases.

	// Dynamic fields
	Log     *logrus.Logger // Logger for the task
	LogFile *os.File // Open file handle; presumably backs Log (see DefaultTaskLogFile) -- confirm
}

Wrapper struct which contains a task, along with some dynamic metadata. This is an internal struct only -- It should not be exported/stored.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL