common

package
v1.1.3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 4, 2023 License: MIT Imports: 12 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// Status500Error is the error value describing a 500 status response from the
// server; its message advises the caller to slow down.
var Status500Error = errors.New("Server returned 500 status response. (Slow down)")
View Source
// Status503Error is the error value describing a 503 status response from the
// server.
var Status503Error = errors.New("Server returned 503 status response")

Functions

func DoRequest

func DoRequest(url string, timeout int, headers map[string]string) ([]byte, error)

func Get

func Get(url string, timeout int, maxRetries int) ([]byte, error)

Get performs an HTTP GET request and returns the response bytes.

func GetFileExtenstion

func GetFileExtenstion(file *[]byte) (string, error)

func SaveFile

func SaveFile(data []byte, path string) error

SaveFile saves data to the file at the given full path.

func SaveFiles

func SaveFiles(results <-chan []*CdxResponse, outputDir string, errors chan error, downloadRate float32)

SaveFiles saves files from the CDX response channel into the output directory.

Types

type CdxResponse

// CdxResponse is the CDX API response structure. Each tagged field is a string
// that maps to the JSON key named in its tag and is omitted from encoded JSON
// when empty. Note that MimeType, Original and StatusCode use the JSON keys
// "mime", "url" and "status", which differ from their field names.
type CdxResponse struct {
	Urlkey       string `json:"urlkey,omitempty"`
	Timestamp    string `json:"timestamp,omitempty"`
	Charset      string `json:"charset,omitempty"`
	MimeType     string `json:"mime,omitempty"`
	Languages    string `json:"languages,omitempty"`
	MimeDetected string `json:"mimedetected,omitempty"`
	Digest       string `json:"digest,omitempty"`
	Offset       string `json:"offset,omitempty"`
	Original     string `json:"url,omitempty"` // Original URL
	Length       string `json:"length,omitempty"`
	StatusCode   string `json:"status,omitempty"`
	Filename     string `json:"filename,omitempty"`
	// Source has no json tag. NOTE(review): presumably the Source that
	// produced this record — confirm against the code that populates it.
	Source       Source
}

CdxResponse is the CDX API response structure used by WebArchive and Common Crawl (index.commoncrawl.org).

type RequestConfig

// RequestConfig holds the parameters of a CDX server request.
type RequestConfig struct {
	URL            string   // URL to parse
	Filters        []string // Extension to search
	Limit          uint     // Max number of results per page
	CollapseColumn string   // Which column to use to collapse results
	SinglePage     bool     // Get results only from 1st page (mostly used for tests)
	FromDate       string   // Filter results from Date
	ToDate         string   // Filter results to Date
}

func (RequestConfig) GetUrl added in v1.1.3

func (config RequestConfig) GetUrl(serverURL string, page int) string

GetUrl composes a URL with CDX server request parameters.

type Source

// Source is a source of web archive data. The per-method notes below are
// derived from the signatures only; the implementations are not visible here.
type Source interface {
	// Name returns a string naming the source.
	Name() string
	// ParseResponse takes raw response bytes and returns CdxResponse values
	// or an error.
	ParseResponse(resp []byte) ([]*CdxResponse, error)
	// GetNumPages returns a page count for the given url, or an error.
	GetNumPages(url string) (int, error)
	// GetPages returns CdxResponse values for the given request
	// configuration, or an error.
	GetPages(config RequestConfig) ([]*CdxResponse, error)
	// FetchPages takes a request configuration plus a results channel and an
	// errors channel. NOTE(review): presumably it sends fetched pages and
	// failures on those channels — confirm who closes them.
	FetchPages(config RequestConfig, results chan []*CdxResponse, errors chan error)
	// GetFile returns bytes for the given CdxResponse, or an error.
	GetFile(*CdxResponse) ([]byte, error)
}

Source of web archive data

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL