Documentation ¶
Overview ¶
This package contains the base/root components of MIDA. Other MIDA packages import this package, but this package should not depend on any other MIDA packages.
Index ¶
- Constants
- Variables
- func WriteCompressedTaskSetToBytes(tasks CompressedTaskSet) ([]byte, error)
- func WriteCompressedTaskSetToFile(cts *CompressedTaskSet, filename string, overwrite bool) error
- func WriteTaskSliceToBytes(tasks []RawTask) ([]byte, error)
- func WriteTaskSliceToFile(tasks []RawTask, filename string) error
- type BrowserCoverageMetadata
- type BrowserSettings
- type CompletionCondition
- type CompletionSettings
- type CompressedTaskSet
- type CrawlerInfo
- type DTResource
- type DataSettings
- type DevToolsNetworkRawData
- type DevToolsRawData
- type DevToolsScriptRawData
- type FinalResult
- type InteractionSettings
- type LocalOutputSettings
- type OutputSettings
- type RawResult
- type RawTask
- type SanitizedTask
- type SftpOutputSettings
- type TaskSet
- type TaskSummary
- type TaskTiming
- type TaskWrapper
Constants ¶
const ( // Output Parameters DefaultLocalOutputPath = "results" DefaultPostQueue = "" DefaultResourceSubdir = "resources" DefaultScriptSubdir = "scripts" DefaultCoverageSubdir = "coverage" DefaultScreenshotFileName = "screenshot.png" DefaultCookieFileName = "cookies.json" DefaultDomFileName = "dom.json" DefaultMetadataFile = "metadata.json" DefaultCovBVFileName = "coverage.bv" DefaultCovTreeSummaryFileName = "cov_tree.csv" DefaultResourceMetadataFile = "resource_metadata.json" DefaultScriptMetadataFile = "script_metadata.json" DefaultSftpPrivKeyFile = "~/.ssh/id_rsa" DefaultTaskLogFile = "task.log" DefaultSSHBackoffMultiplier = 5 // Exponential increase in time between tries when connecting for SFTP storage DefaultTaskPriority = 5 // Queue priority when creating new tasks -- Value should be 1-10 DefaultEventChannelBufferSize = 10000 // Browser-Related Parameters DefaultOSXChromePath = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" DefaultOSXChromiumPath = "/Applications/Chromium.app/Contents/MacOS/Chromium" DefaultLinuxChromePath = "/usr/bin/google-chrome-stable" DefaultLinuxChromiumPath = "/usr/bin/chromium-browser" DefaultWindowsChromePath = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" DefaultWindowsChromiumPath = "\\%LocalAppData%\\chromium\\Application\\chrome.exe" DefaultHeadless = false // RawTask completion DefaultTimeAfterLoad = 5 // Default time to stay on a page after load event is fired (in TimeAfterLoad mode) DefaultTimeout = 10 // Default time (in seconds) to remain on a page before exiting browser DefaultCompletionCondition = TimeoutOnly // Default Interaction Settings DefaultBasicInteraction = false DefaultGremlins = false DefaultTriggerEventListeners = false // Defaults for data gathering settings DefaultAllResources = true DefaultAllScripts = false DefaultCookies = true DefaultDOM = false DefaultResourceMetadata = true DefaultScreenshot = true DefaultScriptMetadata = false DefaultBrowserCoverage = false 
DefaultRawCovFiles = false DefaultCovTxtFile = false DefaultCovTreeSummary = false DefaultShuffle = true // Whether to shuffle order of task processing DefaultProtocolPrefix = "https://" // If no protocol is provided, we use https for the crawl )
Variables ¶
var CompletionConditions = [...]CompletionCondition{TimeoutOnly, TimeAfterLoad, LoadEvent}
var ( // Flags we apply by default to Chrome/Chromium-based browsers DefaultChromiumBrowserFlags = []string{ "--enable-features=NetworkService", "--disable-background-networking", "--disable-background-timer-throttling", "--disable-backgrounding-occluded-windows", "--disable-client-side-phishing-detection", "--disable-extensions", "--disable-features=IsolateOrigins,site-per-process", "--disable-hang-monitor", "--disable-ipc-flooding-protection", "--disable-infobars", "--disable-popup-blocking", "--disable-prompt-on-repost", "--disable-renderer-backgrounding", "--disable-sync", "--disk-cache-size=0", "--incognito", "--new-window", "--no-default-browser-check", "--no-first-run", "--no-sandbox", "--safebrowsing-disable-auto-update", } )
Functions ¶
func WriteCompressedTaskSetToBytes ¶
func WriteCompressedTaskSetToBytes(tasks CompressedTaskSet) ([]byte, error)
WriteCompressedTaskSetToBytes takes a CompressedTaskSet and converts it to corresponding JSON bytes to transfer somewhere.
func WriteCompressedTaskSetToFile ¶
func WriteCompressedTaskSetToFile(cts *CompressedTaskSet, filename string, overwrite bool) error
WriteCompressedTaskSetToFile takes a CompressedTaskSet and writes a JSON representation of it out to a file.
func WriteTaskSliceToBytes ¶
WriteTaskSliceToBytes takes a slice of tasks and converts it to corresponding JSON bytes to transfer somewhere.
func WriteTaskSliceToFile ¶
WriteTaskSliceToFile takes a RawTask slice and writes it out as a JSON file to a given filename.
Types ¶
type BrowserCoverageMetadata ¶
type BrowserSettings ¶
type BrowserSettings struct { BrowserBinary *string `json:"browser_binary,omitempty"` // The binary for the browser (e.g., "/path/to/chrome.exe") UserDataDirectory *string `json:"user_data_directory,omitempty"` // Path to user data directory to use AddBrowserFlags *[]string `json:"add_browser_flags,omitempty"` // Flags to be added to default browser flags RemoveBrowserFlags *[]string `json:"remove_browser_flags,omitempty"` // Flags to be removed from default browser flags SetBrowserFlags *[]string `json:"set_browser_flags,omitempty"` // Flags to use to override default browser flags Extensions *[]string `json:"extensions,omitempty"` // Paths to browser extensions to be used for the crawl InteractionSettings *InteractionSettings `json:"interaction_settings"` // Settings describing how the browser will interact with the page }
Settings describing the way in which a browser will be opened
func AllocateNewBrowserSettings ¶
func AllocateNewBrowserSettings() *BrowserSettings
AllocateNewBrowserSettings allocates a new BrowserSettings struct, initializing everything to zero values.
type CompletionCondition ¶
type CompletionCondition string
Conditions under which a crawl will complete successfully
const ( TimeoutOnly CompletionCondition = "TimeoutOnly" // Complete only when the timeout is reached TimeAfterLoad CompletionCondition = "TimeAfterLoad" // Wait a given number of seconds after the load event LoadEvent CompletionCondition = "LoadEvent" // Terminate crawl immediately when load event fires )
type CompletionSettings ¶
type CompletionSettings struct { CompletionCondition *CompletionCondition `json:"completion_condition"` // Condition under which crawl will complete Timeout *int `json:"timeout,omitempty"` // Maximum amount of time the browser will remain open TimeAfterLoad *int `json:"time_after_load,omitempty"` // Maximum amount of time the browser will remain open after page load }
Settings describing how a particular crawl will terminate
func AllocateNewCompletionSettings ¶
func AllocateNewCompletionSettings() *CompletionSettings
AllocateNewCompletionSettings allocates a new CompletionSettings struct, initializing everything to zero values.
type CompressedTaskSet ¶
type CompressedTaskSet struct { URL *[]string `json:"url"` // List of URLs to be visited Browser *BrowserSettings `json:"browser_settings"` // Settings for launching the browser Completion *CompletionSettings `json:"completion_settings"` // Settings for when the site visit will complete Data *DataSettings `json:"data_settings"` // Settings for what data will be collected from the site Output *OutputSettings `json:"output_settings"` // Settings for what/how results will be saved Repeat *int `json:"repeat"` // Number of times to repeat the crawl after it finishes successfully }
A grouping of tasks for multiple URLs that may be repeated
func AllocateNewCompressedTaskSet ¶
func AllocateNewCompressedTaskSet() *CompressedTaskSet
type CrawlerInfo ¶
type CrawlerInfo struct { Browser string `json:"browser"` // Name of the browser itself BrowserVersion string `json:"browser_version"` // Version of the browser we are using UserAgent string `json:"user_agent"` // User agent we are using JSVersion string `json:"js_version"` // JS version }
Information about the infrastructure used to perform the crawl
type DTResource ¶
type DTResource struct { Requests []*network.EventRequestWillBeSent `json:"requests"` // All requests sent for this particular request Response *network.EventResponseReceived `json:"responses"` // All responses received for this particular request }
type DataSettings ¶
type DataSettings struct { AllResources *bool `json:"all_resources,omitempty"` // Save all resource files AllScripts *bool `json:"all_scripts,omitempty"` // Save all scripts parsed by browser Cookies *bool `json:"cookies,omitempty"` // Save cookies set by page DOM *bool `json:"dom,omitempty"` // Collect JSON representation of the DOM ResourceMetadata *bool `json:"resource_metadata,omitempty"` // Save extensive metadata about each resource Screenshot *bool `json:"screenshot,omitempty"` // Save a screenshot from the web page ScriptMetadata *bool `json:"script_metadata,omitempty"` // Save metadata on scripts parsed by browser BrowserCoverage *bool `json:"browser_coverage"` // Whether to gather code coverage data from the browser RawCovFiles *bool `json:"raw_cov_files"` // Raw profraw files from browser CovTxtFile *bool `json:"cov_txt_file"` // llvm-cov-custom generated text file containing coverage CovTreeSummary *bool `json:"cov_tree_summary"` // CSV summary of code coverage for file tree }
Settings describing which data MIDA will capture from the crawl
func AllocateNewDataSettings ¶
func AllocateNewDataSettings() *DataSettings
AllocateNewDataSettings allocates a new DataSettings struct, initializing everything to zero values.
type DevToolsNetworkRawData ¶
type DevToolsNetworkRawData struct { RequestWillBeSent map[string][]*network.EventRequestWillBeSent ResponseReceived map[string]*network.EventResponseReceived }
type DevToolsRawData ¶
type DevToolsRawData struct { Network DevToolsNetworkRawData Cookies []*network.Cookie DOM *cdp.Node Scripts DevToolsScriptRawData }
type DevToolsScriptRawData ¶
type DevToolsScriptRawData []*debugger.EventScriptParsed
type FinalResult ¶
type FinalResult struct { Summary TaskSummary `json:"stats"` // Statistics on timing and resource usage for the crawl DTCookies []*network.Cookie `json:"cookies"` // Cookies collected from DevTools protocol DTDOM *cdp.Node `json:"dom"` DTResourceMetadata map[string]DTResource `json:"resource_metadata"` // Metadata on each resource loaded DTScriptMetadata map[string]*debugger.EventScriptParsed `json:"script_metadata"` // Metadata on each script parsed }
type InteractionSettings ¶
type InteractionSettings struct { BasicInteraction *bool `json:"basic_interaction"` Gremlins *bool `json:"gremlins"` TriggerEventListeners *bool `json:"event_listeners"` }
Settings describing how MIDA will interact with a page
func AllocateNewInteractionSettings ¶
func AllocateNewInteractionSettings() *InteractionSettings
AllocateNewInteractionSettings allocates a new InteractionSettings struct specifying if/how the browser will interact with pages it visits as part of the task.
type LocalOutputSettings ¶
type LocalOutputSettings struct { Enable *bool `json:"enable,omitmepty"` // Whether this storage method is enabled Path *string `json:"path,omitempty"` // Path over the overarching results directory to be written DS *DataSettings `json:"data_settings,omitempty"` // Data settings for output to local filesystem }
Settings describing output of results to the local filesystem
func AllocateNewLocalOutputSettings ¶
func AllocateNewLocalOutputSettings() *LocalOutputSettings
type OutputSettings ¶
type OutputSettings struct { LocalOut *LocalOutputSettings `json:"local_output_settings,omitempty"` // Output settings for the local filesystem SftpOut *SftpOutputSettings `json:"sftp_output_settings,omitempty"` // Output settings for the remote filesystem PostQueue *string `json:"post_queue,omitempty"` // AMQP queue in which we should put metadata for crawl once complete }
An aggregation of the output settings for a task or task-set
func AllocateNewOutputSettings ¶
func AllocateNewOutputSettings() *OutputSettings
AllocateNewOutputSettings allocates a new OutputSettings struct, initializing everything to zero values.
type RawResult ¶
type RawResult struct { TaskSummary TaskSummary // Summary information about the task, not necessarily complete in RawResult DevTools DevToolsRawData // Struct Containing Raw Data gathered from a DevTools site visit sync.Mutex }
The results MIDA gathers before they are post-processed
type RawTask ¶
type RawTask struct { URL *string `json:"url"` // The URL to be visited Browser *BrowserSettings `json:"browser_settings"` // Settings for launching the browser Completion *CompletionSettings `json:"completion_settings"` // Settings for when the site visit will complete Data *DataSettings `json:"data_settings"` // Settings for what data will be collected from the site Output *OutputSettings `json:"output_settings"` // Settings for what/how results will be saved }
A raw MIDA task. This is the struct that is read from/written to file when tasks are stored as JSON.
func AllocateNewTask ¶
func AllocateNewTask() *RawTask
AllocateNewTask allocates a new RawTask struct, initializing everything to zero values.
func ExpandCompressedTaskSet ¶
func ExpandCompressedTaskSet(ts CompressedTaskSet) []RawTask
ExpandCompressedTaskSet takes a CompressedTaskSet object and converts it into a slice of regular RawTask structs.
func ReadTasksFromBytes ¶
ReadTasksFromBytes reads in tasks from a byte array. It will read them whether they are formatted as individual tasks or as a CompressedTaskSet.
func ReadTasksFromFile ¶
ReadTasksFromFile is a wrapper function that reads single tasks, full task sets, or compressed task sets from file.
type SanitizedTask ¶
type SanitizedTask struct { URL string BrowserBinaryPath string // Full path to the browser binary we use for the crawl BrowserFlags []string // List of flags we will use when opening the browser (does not include --remote-debugging-port or similar) UserDataDirectory string // Full path to the user data directory for the task CS CompletionSettings // Task completion settings for the task DS DataSettings // Data Gathering Settings for the task IS InteractionSettings // Settings on how the browser will interact with the page OPS OutputSettings // Output settings for the task }
Internal type built from the process of sanitizing a RawTask. Should contain all the parameters needed for a crawl without the need to re-access the raw task. SanitizedTask should not contain information that cannot be deduced based on the raw task (and system parameters).
type SftpOutputSettings ¶
type SftpOutputSettings struct { Enable *bool `json:"enable,omitempty"` // Whether this storage method is enabled Host *string `json:"host,omitempty"` // IP address or domain name of host to store to Port *int `json:"port,omitempty"` // Port to initiate SSH/SFTP connection Path *string `json:"path,omitempty"` // Path of the overarching results directory to be written UserName *string `json:"user_name,omitempty"` // User name we should use for accessing the host PrivateKeyFile *string `json:"private_key_file,omitempty"` // Path to the private key file we should use for accessing the host DS *DataSettings `json:"data_settings,omitempty"` // Data settings for output via SSH/SFTP }
Settings describing results output via SSH/SFTP
func AllocateNewSftpOutputSettings ¶
func AllocateNewSftpOutputSettings() *SftpOutputSettings
type TaskSummary ¶
type TaskSummary struct { UUID string `json:"uuid"` Success bool `json:"success"` // True if the task did not fail FailureReason string `json:"failure_reason,omitempty"` // Holds the failure code for the task TaskWrapper *TaskWrapper `json:"-"` // Wrapper containing the full task TaskTiming TaskTiming `json:"task_timing"` // Timing data for the task CrawlerInfo CrawlerInfo `json:"crawler_info"` // Information about the infrastructure used to visit the site OutputHost string `json:"output_host,omitempty"` // Host to which results were stored via SFTP OutputPath string `json:"output_path,omitempty"` // Path to the results of the crawl on the applicable host (after storage) NumResources int `json:"num_resources"` // Number of resources the browser downloaded NumScripts int `json:"num_scripts"` // Number of scripts the browser parsed BrowserCovData BrowserCoverageMetadata `json:"browser_cov_data"` }
Statistics gathered about a specific task
type TaskTiming ¶
type TaskTiming struct { BrowserOpen time.Time `json:"browser_open"` ConnectionEstablished time.Time `json:"connection_established"` LoadEvent time.Time `json:"load_event"` BrowserClose time.Time `json:"browser_close"` BeginPostprocess time.Time `json:"begin_postprocess"` EndPostprocess time.Time `json:"end_postprocess"` BeginStorage time.Time `json:"begin_storage"` EndStorage time.Time `json:"-"` }
TaskTiming contains timing data for the processing of a particular task
type TaskWrapper ¶
type TaskWrapper struct { RawTask RawTask // A pointer to a MIDA task SanitizedTask SanitizedTask // A sanitized MIDA task UUID uuid.UUID TempDir string // Temporary directory where results are stored. Can be the same as the UserDataDir in some cases. // Dynamic fields Log *logrus.Logger LogFile *os.File }
Wrapper struct which contains a task, along with some dynamic metadata. This is an internal struct only -- It should not be exported/stored.