Documentation ¶
Index ¶
- Constants
- type CommonCrawl
- func (cc *CommonCrawl) FetchPages(config common.RequestConfig, results chan []*common.CdxResponse, ...)
- func (cc *CommonCrawl) GetFile(page *common.CdxResponse) ([]byte, error)
- func (cc *CommonCrawl) GetIndexes() ([]latestIndex, error)
- func (cc *CommonCrawl) GetNumPages(url string) (int, error)
- func (cc *CommonCrawl) GetNumPagesIndex(url, index string) (int, error)
- func (cc *CommonCrawl) GetPages(config common.RequestConfig) ([]*common.CdxResponse, error)
- func (cc *CommonCrawl) GetPagesIndex(config common.RequestConfig, index string) ([]*common.CdxResponse, error)
- func (CommonCrawl) Name() string
- func (cc *CommonCrawl) ParseResponse(resp []byte) ([]*common.CdxResponse, error)
Constants ¶
const CRAWL_STORAGE = "https://data.commoncrawl.org/" // https://commoncrawl.s3.amazonaws.com/
const INDEX_SERVER = "https://index.commoncrawl.org/"
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CommonCrawl ¶
type CommonCrawl struct {
	MaxTimeout int // Request timeout
	MaxRetries int // Max number of request retries if timed out
	// contains filtered or unexported fields
}
func New ¶
func New(timeout, retries int) (*CommonCrawl, error)
func (*CommonCrawl) FetchPages ¶
func (cc *CommonCrawl) FetchPages(config common.RequestConfig, results chan []*common.CdxResponse, errors chan error)
FetchPages is the concurrent counterpart of GetPages. It makes requests to the CommonCrawl index API and sends observations to the results channel; errors are sent to the errors channel.
index: needs to be set manually here
func (*CommonCrawl) GetFile ¶
func (cc *CommonCrawl) GetFile(page *common.CdxResponse) ([]byte, error)
Gets a file from CommonCrawl storage using the info from a CdxResponse returned by the index server
page: info about the found web page, as a CdxResponse
timeout: timeout in seconds
func (*CommonCrawl) GetIndexes ¶
func (cc *CommonCrawl) GetIndexes() ([]latestIndex, error)
Get latest CDX indexes from http://index.commoncrawl.org/collinfo.json
func (*CommonCrawl) GetNumPages ¶
func (cc *CommonCrawl) GetNumPages(url string) (int, error)
Returns the number of pages located in CommonCrawl for the given url. Uses the latest index from http://index.commoncrawl.org/collinfo.json
func (*CommonCrawl) GetNumPagesIndex ¶
func (cc *CommonCrawl) GetNumPagesIndex(url, index string) (int, error)
Returns the number of pages located in CommonCrawl for given url
index: needs to be set manually here like "CC-MAIN-2023-14"
func (*CommonCrawl) GetPages ¶
func (cc *CommonCrawl) GetPages(config common.RequestConfig) ([]*common.CdxResponse, error)
Makes a request to the CommonCrawl index API to gather all offsets that contain the chosen URL.
Uses the latest CommonCrawl index.
func (*CommonCrawl) GetPagesIndex ¶
func (cc *CommonCrawl) GetPagesIndex(config common.RequestConfig, index string) ([]*common.CdxResponse, error)
GetPagesIndex makes a request to the CommonCrawl index API to gather all URL observations
index: needs to be set manually here like "CC-MAIN-2023-14"
func (CommonCrawl) Name ¶
func (CommonCrawl) Name() string
func (*CommonCrawl) ParseResponse ¶
func (cc *CommonCrawl) ParseResponse(resp []byte) ([]*common.CdxResponse, error)
Parses the response from the http://index.commoncrawl.org/[Index Version]-index index server