Documentation ¶
Overview ¶
This package contains the base/root components of MIDA. Other MIDA packages import this package, but this package should not depend on any other MIDA packages.
Index ¶
- Constants
- Variables
- func WriteCompressedTaskSetToBytes(tasks CompressedTaskSet) ([]byte, error)
- func WriteCompressedTaskSetToFile(cts *CompressedTaskSet, filename string, overwrite bool) error
- func WriteTaskSliceToBytes(tasks []RawTask) ([]byte, error)
- func WriteTaskSliceToFile(tasks []RawTask, filename string) error
- type BrowserCoverageMetadata
- type BrowserSettings
- type CompletionCondition
- type CompletionSettings
- type CompressedTaskSet
- type CrawlerInfo
- type DTResource
- type DataSettings
- type DevToolsNetworkRawData
- type DevToolsRawData
- type DevToolsScriptRawData
- type FinalResult
- type InteractionSettings
- type LocalOutputSettings
- type OutputSettings
- type RawResult
- type RawTask
- type SanitizedTask
- type SftpOutputSettings
- type TaskSet
- type TaskSummary
- type TaskTiming
- type TaskWrapper
Constants ¶
const ( // Output Parameters DefaultLocalOutputPath = "results" DefaultPostQueue = "" DefaultResourceSubdir = "resources" DefaultScriptSubdir = "scripts" DefaultCoverageSubdir = "coverage" DefaultScreenshotFileName = "screenshot.png" DefaultCookieFileName = "cookies.json" DefaultDomFileName = "dom.json" DefaultMetadataFile = "metadata.json" DefaultCovBVFileName = "coverage.bv" DefaultCovTreeSummaryFileName = "cov_tree.csv" DefaultResourceMetadataFile = "resource_metadata.json" DefaultScriptMetadataFile = "script_metadata.json" DefaultSftpPrivKeyFile = "~/.ssh/id_rsa" DefaultTaskLogFile = "task.log" DefaultSSHBackoffMultiplier = 5 // Exponential increase in time between tries when connecting for SFTP storage DefaultTaskPriority = 5 // Queue priority when creating new tasks -- Value should be 1-10 DefaultEventChannelBufferSize = 10000 // Browser-Related Parameters DefaultOSXChromePath = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" DefaultOSXChromiumPath = "/Applications/Chromium.app/Contents/MacOS/Chromium" DefaultLinuxChromePath = "/usr/bin/google-chrome-stable" DefaultLinuxChromiumPath = "/usr/bin/chromium-browser" DefaultWindowsChromePath = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" DefaultWindowsChromiumPath = "\\%LocalAppData%\\chromium\\Application\\chrome.exe" DefaultHeadless = false // RawTask completion DefaultTimeAfterLoad = 5 // Default time to stay on a page after load event is fired (in TimeAfterLoad mode) DefaultTimeout = 10 // Default time (in seconds) to remain on a page before exiting browser DefaultCompletionCondition = TimeoutOnly // Default Interaction Settings DefaultBasicInteraction = false DefaultGremlins = false DefaultTriggerEventListeners = false // Defaults for data gathering settings DefaultAllResources = true DefaultAllScripts = false DefaultCookies = true DefaultDOM = false DefaultResourceMetadata = true DefaultScreenshot = true DefaultScriptMetadata = false DefaultBrowserCoverage = false 
DefaultRawCovFiles = false DefaultCovTxtFile = false DefaultCovTreeSummary = false DefaultShuffle = true // Whether to shuffle order of task processing DefaultProtocolPrefix = "https://" // If no protocol is provided, we use https for the crawl )
Variables ¶
var CompletionConditions = [...]CompletionCondition{TimeoutOnly, TimeAfterLoad, LoadEvent}
var ( // Flags we apply by default to Chrome/Chromium-based browsers DefaultChromiumBrowserFlags = []string{ "--enable-features=NetworkService", "--disable-background-networking", "--disable-background-timer-throttling", "--disable-backgrounding-occluded-windows", "--disable-client-side-phishing-detection", "--disable-extensions", "--disable-features=IsolateOrigins,site-per-process", "--disable-hang-monitor", "--disable-ipc-flooding-protection", "--disable-infobars", "--disable-popup-blocking", "--disable-prompt-on-repost", "--disable-renderer-backgrounding", "--disable-sync", "--disk-cache-size=0", "--incognito", "--new-window", "--no-default-browser-check", "--no-first-run", "--no-sandbox", "--safebrowsing-disable-auto-update", } )
Functions ¶
func WriteCompressedTaskSetToBytes ¶
func WriteCompressedTaskSetToBytes(tasks CompressedTaskSet) ([]byte, error)
WriteCompressedTaskSetToBytes takes a CompressedTaskSet and converts it to corresponding JSON bytes to transfer somewhere.
func WriteCompressedTaskSetToFile ¶
func WriteCompressedTaskSetToFile(cts *CompressedTaskSet, filename string, overwrite bool) error
WriteCompressedTaskSetToFile takes a CompressedTaskSet and writes a JSON representation of it out to a file.
func WriteTaskSliceToBytes ¶
WriteTaskSliceToBytes takes a slice of tasks and converts it to corresponding JSON bytes to transfer somewhere.
func WriteTaskSliceToFile ¶
WriteTaskSliceToFile takes a RawTask slice and writes it out as a JSON file to a given filename.
Types ¶
type BrowserCoverageMetadata ¶
type BrowserSettings ¶
type BrowserSettings struct { BrowserBinary *string `json:"browser_binary,omitempty"` // The binary for the browser (e.g., "/path/to/chrome.exe") UserDataDirectory *string `json:"user_data_directory,omitempty"` // Path to user data directory to use AddBrowserFlags *[]string `json:"add_browser_flags,omitempty"` // Flags to be added to default browser flags RemoveBrowserFlags *[]string `json:"remove_browser_flags,omitempty"` // Flags to be removed from default browser flags SetBrowserFlags *[]string `json:"set_browser_flags,omitempty"` // Flags to use to override default browser flags Extensions *[]string `json:"extensions,omitempty"` // Paths to browser extensions to be used for the crawl InteractionSettings *InteractionSettings `json:"interaction_settings"` // Settings describing how the browser will interact with the page }
Settings describing the way in which a browser will be opened
func AllocateNewBrowserSettings ¶
func AllocateNewBrowserSettings() *BrowserSettings
AllocateNewBrowserSettings allocates a new BrowserSettings struct, initializing everything to zero values.
type CompletionCondition ¶
type CompletionCondition string
Conditions under which a crawl will complete successfully
const ( TimeoutOnly CompletionCondition = "TimeoutOnly" // Complete only when the timeout is reached TimeAfterLoad CompletionCondition = "TimeAfterLoad" // Wait a given number of seconds after the load event LoadEvent CompletionCondition = "LoadEvent" // Terminate crawl immediately when load event fires )
type CompletionSettings ¶
type CompletionSettings struct { CompletionCondition *CompletionCondition `json:"completion_condition"` // Condition under which crawl will complete Timeout *int `json:"timeout,omitempty"` // Maximum amount of time the browser will remain open TimeAfterLoad *int `json:"time_after_load,omitempty"` // Maximum amount of time the browser will remain open after page load }
Settings describing how a particular crawl will terminate
func AllocateNewCompletionSettings ¶
func AllocateNewCompletionSettings() *CompletionSettings
AllocateNewCompletionSettings allocates a new CompletionSettings struct, initializing everything to zero values.
type CompressedTaskSet ¶
type CompressedTaskSet struct { URL *[]string `json:"url"` // List of URLs to be visited Browser *BrowserSettings `json:"browser_settings"` // Settings for launching the browser Completion *CompletionSettings `json:"completion_settings"` // Settings for when the site visit will complete Data *DataSettings `json:"data_settings"` // Settings for what data will be collected from the site Output *OutputSettings `json:"output_settings"` // Settings for what/how results will be saved Repeat *int `json:"repeat"` // Number of times to repeat the crawl after it finishes successfully }
A grouping of tasks for multiple URLs that may be repeated
func AllocateNewCompressedTaskSet ¶
func AllocateNewCompressedTaskSet() *CompressedTaskSet
type CrawlerInfo ¶
type CrawlerInfo struct { Browser string `json:"browser"` // Name of the browser itself BrowserVersion string `json:"browser_version"` // Version of the browser we are using UserAgent string `json:"user_agent"` // User agent we are using JSVersion string `json:"js_version"` // JS version }
Information about the infrastructure used to perform the crawl
type DTResource ¶
type DTResource struct { Requests []*network.EventRequestWillBeSent `json:"requests"` // All requests sent for this particular request Response *network.EventResponseReceived `json:"responses"` // All responses received for this particular request }
type DataSettings ¶
type DataSettings struct { AllResources *bool `json:"all_resources,omitempty"` // Save all resource files AllScripts *bool `json:"all_scripts,omitempty"` // Save all scripts parsed by browser Cookies *bool `json:"cookies,omitempty"` // Save cookies set by page DOM *bool `json:"dom,omitempty"` // Collect JSON representation of the DOM ResourceMetadata *bool `json:"resource_metadata,omitempty"` // Save extensive metadata about each resource Screenshot *bool `json:"screenshot,omitempty"` // Save a screenshot from the web page ScriptMetadata *bool `json:"script_metadata,omitempty"` // Save metadata on scripts parsed by browser BrowserCoverage *bool `json:"browser_coverage"` // Whether to gather code coverage data from the browser RawCovFiles *bool `json:"raw_cov_files"` // Raw profraw files from browser CovTxtFile *bool `json:"cov_txt_file"` // llvm-cov-custom generated text file containing coverage CovTreeSummary *bool `json:"cov_tree_summary"` // CSV summary of code coverage for file tree }
Settings describing which data MIDA will capture from the crawl
func AllocateNewDataSettings ¶
func AllocateNewDataSettings() *DataSettings
AllocateNewDataSettings allocates a new DataSettings struct, initializing everything to zero values.
type DevToolsNetworkRawData ¶
type DevToolsNetworkRawData struct { RequestWillBeSent map[string][]*network.EventRequestWillBeSent ResponseReceived map[string]*network.EventResponseReceived }
type DevToolsRawData ¶
type DevToolsRawData struct { Network DevToolsNetworkRawData Cookies []*network.Cookie DOM *cdp.Node Scripts DevToolsScriptRawData }
type DevToolsScriptRawData ¶
type DevToolsScriptRawData []*debugger.EventScriptParsed
type FinalResult ¶
type FinalResult struct { Summary TaskSummary `json:"stats"` // Statistics on timing and resource usage for the crawl DTCookies []*network.Cookie `json:"cookies"` // Cookies collected from DevTools protocol DTDOM *cdp.Node `json:"dom"` DTResourceMetadata map[string]DTResource `json:"resource_metadata"` // Metadata on each resource loaded DTScriptMetadata map[string]*debugger.EventScriptParsed `json:"script_metadata"` // Metadata on each script parsed }
type InteractionSettings ¶
type InteractionSettings struct { BasicInteraction *bool `json:"basic_interaction"` Gremlins *bool `json:"gremlins"` TriggerEventListeners *bool `json:"event_listeners"` }
Settings describing how MIDA will interact with a page
func AllocateNewInteractionSettings ¶
func AllocateNewInteractionSettings() *InteractionSettings
AllocateNewInteractionSettings allocates a new InteractionSettings struct specifying if/how the browser will interact with pages it visits as part of the task.
type LocalOutputSettings ¶
type LocalOutputSettings struct { Enable *bool `json:"enable,omitmepty"` // Whether this storage method is enabled Path *string `json:"path,omitempty"` // Path over the overarching results directory to be written DS *DataSettings `json:"data_settings,omitempty"` // Data settings for output to local filesystem }
Settings describing output of results to the local filesystem
func AllocateNewLocalOutputSettings ¶
func AllocateNewLocalOutputSettings() *LocalOutputSettings
type OutputSettings ¶
type OutputSettings struct { LocalOut *LocalOutputSettings `json:"local_output_settings,omitempty"` // Output settings for the local filesystem SftpOut *SftpOutputSettings `json:"sftp_output_settings,omitempty"` // Output settings for the remote filesystem PostQueue *string `json:"post_queue,omitempty"` // AMQP queue in which we should put metadata for crawl once complete }
An aggregation of the output settings for a task or task-set
func AllocateNewOutputSettings ¶
func AllocateNewOutputSettings() *OutputSettings
AllocateNewOutputSettings allocates a new OutputSettings struct, initializing everything to zero values.
type RawResult ¶
type RawResult struct { TaskSummary TaskSummary // Summary information about the task, not necessarily complete in RawResult DevTools DevToolsRawData // Struct Containing Raw Data gathered from a DevTools site visit sync.Mutex }
The results MIDA gathers before they are post-processed
type RawTask ¶
type RawTask struct { URL *string `json:"url"` // The URL to be visited Browser *BrowserSettings `json:"browser_settings"` // Settings for launching the browser Completion *CompletionSettings `json:"completion_settings"` // Settings for when the site visit will complete Data *DataSettings `json:"data_settings"` // Settings for what data will be collected from the site Output *OutputSettings `json:"output_settings"` // Settings for what/how results will be saved }
A raw MIDA task. This is the struct that is read from/written to file when tasks are stored as JSON.
func AllocateNewTask ¶
func AllocateNewTask() *RawTask
AllocateNewTask allocates a new RawTask struct, initializing everything to zero values.
func ExpandCompressedTaskSet ¶
func ExpandCompressedTaskSet(ts CompressedTaskSet) []RawTask
ExpandCompressedTaskSet takes a CompressedTaskSet object and converts it into a slice of regular RawTask structs.
func ReadTasksFromBytes ¶
ReadTasksFromBytes reads in tasks from a byte array. It will read them whether they are formatted as individual tasks or as a CompressedTaskSet.
func ReadTasksFromFile ¶
ReadTasksFromFile is a wrapper function that reads single tasks, full task sets, or compressed task sets from file.
type SanitizedTask ¶
type SanitizedTask struct { URL string BrowserBinaryPath string // Full path to the browser binary we use for the crawl BrowserFlags []string // List of flags we will use when opening the browser (does not include --remote-debugging-port or similar) UserDataDirectory string // Full path to the user data directory for the task CS CompletionSettings // Task completion settings for the task DS DataSettings // Data Gathering Settings for the task IS InteractionSettings // Settings on how the browser will interact with the page OPS OutputSettings // Output settings for the task }
Internal type built from the process of sanitizing a RawTask. Should contain all the parameters needed for a crawl without the need to re-access the raw task. SanitizedTask should not contain information that cannot be deduced based on the raw task (and system parameters).
type SftpOutputSettings ¶
type SftpOutputSettings struct { Enable *bool `json:"enable,omitempty"` // Whether this storage method is enabled Host *string `json:"host,omitempty"` // IP address or domain name of host to store to Port *int `json:"port,omitempty"` // Port to initiate SSH/SFTP connection Path *string `json:"path,omitempty"` // Path of the overarching results directory to be written UserName *string `json:"user_name,omitempty"` // User name we should use for accessing the host PrivateKeyFile *string `json:"private_key_file,omitempty"` // Path to the private key file we should use for accessing the host DS *DataSettings `json:"data_settings,omitempty"` // Data settings for output via SSH/SFTP }
Settings describing results output via SSH/SFTP
func AllocateNewSftpOutputSettings ¶
func AllocateNewSftpOutputSettings() *SftpOutputSettings
type TaskSummary ¶
type TaskSummary struct { UUID string `json:"uuid"` Success bool `json:"success"` // True if the task did not fail FailureReason string `json:"failure_reason,omitempty"` // Holds the failure code for the task TaskWrapper *TaskWrapper `json:"-"` // Wrapper containing the full task TaskTiming TaskTiming `json:"task_timing"` // Timing data for the task CrawlerInfo CrawlerInfo `json:"crawler_info"` // Information about the infrastructure used to visit the site OutputHost string `json:"output_host,omitempty"` // Host to which results were stored via SFTP OutputPath string `json:"output_path,omitempty"` // Path to the results of the crawl on the applicable host (after storage) NumResources int `json:"num_resources"` // Number of resources the browser downloaded NumScripts int `json:"num_scripts"` // Number of scripts the browser parsed BrowserCovData BrowserCoverageMetadata `json:"browser_cov_data"` }
Statistics gathered about a specific task
type TaskTiming ¶
type TaskTiming struct { BrowserOpen time.Time `json:"browser_open"` ConnectionEstablished time.Time `json:"connection_established"` LoadEvent time.Time `json:"load_event"` BrowserClose time.Time `json:"browser_close"` BeginPostprocess time.Time `json:"begin_postprocess"` EndPostprocess time.Time `json:"end_postprocess"` BeginStorage time.Time `json:"begin_storage"` EndStorage time.Time `json:"-"` }
TaskTiming contains timing data for the processing of a particular task
type TaskWrapper ¶
type TaskWrapper struct { RawTask RawTask // A pointer to a MIDA task SanitizedTask SanitizedTask // A sanitized MIDA task UUID uuid.UUID TempDir string // Temporary directory where results are stored. Can be the same as the UserDataDir in some cases. // Dynamic fields Log *logrus.Logger LogFile *os.File }
Wrapper struct which contains a task, along with some dynamic metadata. This is an internal struct only -- It should not be exported/stored.