Documentation ¶
Overview ¶
Package sitemap contains the data structures and crawl engine implementations for creating a sitemap.
Index ¶
- Constants
- type AstraDB
- func (c *AstraDB) GetMaxDepthForSitemapID(sitemapID uuid.UUID) (int, error)
- func (c *AstraDB) GetSitemapDetails(sitemapID uuid.UUID) (*Details, error)
- func (c *AstraDB) GetSitemapIDForCrawlID(crawlID uuid.UUID) (*crawlJob, error)
- func (c *AstraDB) GetSitemapResults(sitemapID uuid.UUID) (*[]Result, error)
- func (c *AstraDB) HealthCheck() error
- func (c *AstraDB) URLExistsForSitemapID(sitemapID uuid.UUID, URL string) (bool, error)
- func (c *AstraDB) UpdateStatus(crawlID, sitemapID uuid.UUID, status string) error
- func (c *AstraDB) WriteCrawl(crawlID, sitemapID uuid.UUID, url string, depth, maxDepth int, status string) error
- func (c *AstraDB) WriteResults(sitemapID, crawlID uuid.UUID, URL string, links []string) error
- func (c *AstraDB) WriteSitemap(sitemapID string, url string, maxDepth int) error
- type ConcurrentCrawlEngine
- type ConcurrentLimitedCrawlEngine
- type CrawlEngine
- type CrawlManager
- type CrawlMessage
- type CrawlMessageHandlerFunc
- type Details
- type JobManager
- type Limiter
- type NATS
- func (n *NATS) SendCrawlMessage(crawlID, sitemapID uuid.UUID, URL string, depth int) error
- func (n *NATS) SendResultsMessage(crawlID uuid.UUID, results *[]Result) error
- func (n *NATS) SendStartMessage(sitemapID uuid.UUID, URL string, maxDepth int) error
- func (n *NATS) Stop()
- func (n *NATS) SubscribeCrawlSubject(f CrawlMessageHandlerFunc)
- func (n *NATS) SubscribeResultsSubject(f ResultsMessageHandlerFunc)
- func (n *NATS) SubscribeStartSubject(f StartMessageHandlerFunc)
- type Result
- type ResultContainer
- type ResultsMessage
- type ResultsMessageHandlerFunc
- type SiteMap
- type StartMessage
- type StartMessageHandlerFunc
- type SynchronousCrawlEngine
Constants ¶
const MAX_BACKOFF_MS int = 500
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AstraDB ¶
type AstraDB struct {
// contains filtered or unexported fields
}
func NewAstraDB ¶
func NewAstraDB() *AstraDB
func (*AstraDB) GetMaxDepthForSitemapID ¶
func (*AstraDB) GetSitemapDetails ¶
func (*AstraDB) GetSitemapIDForCrawlID ¶
func (*AstraDB) GetSitemapResults ¶
func (*AstraDB) HealthCheck ¶
func (*AstraDB) URLExistsForSitemapID ¶
func (*AstraDB) UpdateStatus ¶
func (*AstraDB) WriteCrawl ¶
func (*AstraDB) WriteResults ¶
type ConcurrentCrawlEngine ¶
type ConcurrentCrawlEngine struct { SynchronousCrawlEngine WG sync.WaitGroup }
A ConcurrentCrawlEngine recursively visits extracted URLs up to a specified tree depth, with each visit happening concurrently. A WaitGroup is used to monitor for crawl completion.
func NewConcurrentCrawlEngine ¶
func NewConcurrentCrawlEngine(sitemap *SiteMap, maxDepth int, startURL string) *ConcurrentCrawlEngine
NewConcurrentCrawlEngine returns a pointer to an instance of a ConcurrentCrawlEngine.
func (*ConcurrentCrawlEngine) Run ¶
func (c *ConcurrentCrawlEngine) Run()
Run begins the sitemap crawl activity for the ConcurrentCrawlEngine.
type ConcurrentLimitedCrawlEngine ¶
type ConcurrentLimitedCrawlEngine struct { ConcurrentCrawlEngine // contains filtered or unexported fields }
A ConcurrentLimitedCrawlEngine recursively visits extracted URLs up to a specified tree depth, with each visit happening concurrently, with a limit to the number of concurrent visits. A WaitGroup is used to monitor for crawl completion.
func NewConcurrentLimitedCrawlEngine ¶
func NewConcurrentLimitedCrawlEngine(sitemap *SiteMap, maxDepth int, startURL string, limiter *Limiter) *ConcurrentLimitedCrawlEngine
NewConcurrentLimitedCrawlEngine returns a pointer to an instance of a ConcurrentLimitedCrawlEngine.
func (*ConcurrentLimitedCrawlEngine) Run ¶
func (c *ConcurrentLimitedCrawlEngine) Run()
Run begins the sitemap crawl activity for the ConcurrentLimitedCrawlEngine.
type CrawlEngine ¶
type CrawlEngine interface {
Run()
}
CrawlEngine is the interface implemented by the various crawl engines.
type CrawlManager ¶
type CrawlManager struct { JobManager *JobManager CassDB *AstraDB NatsManager *NATS }
func (*CrawlManager) HandleCrawlMessage ¶
func (cm *CrawlManager) HandleCrawlMessage(c *CrawlMessage)
func (*CrawlManager) HandleResultsMessage ¶
func (cm *CrawlManager) HandleResultsMessage(r *ResultsMessage)
func (*CrawlManager) HandleStartMessage ¶
func (cm *CrawlManager) HandleStartMessage(s *StartMessage)
type CrawlMessage ¶
type CrawlMessageHandlerFunc ¶
type CrawlMessageHandlerFunc func(c *CrawlMessage)
type JobManager ¶
type JobManager struct {
// contains filtered or unexported fields
}
func NewJobManager ¶
func NewJobManager() *JobManager
type Limiter ¶
type Limiter struct {
// contains filtered or unexported fields
}
A Limiter provides a way of governing the number of concurrent goroutines using a buffered channel.
func NewLimiter ¶
NewLimiter returns an instance of Limiter and initialises the limiter channel, filling it with empty structs.
func (*Limiter) RunFunc ¶
RunFunc checks whether there is room to run an additional concurrent activity by reading from the Limiter's buffered channel. If a struct is read from the channel, the function is run. When the function is complete, a new empty struct is placed back in the channel. If there are no structs available to read from the channel, there is no room to run any additional activities, and an error is returned.
type NATS ¶
type NATS struct { CrawlMessageHandler CrawlMessageHandlerFunc ResultsMessageHandler ResultsMessageHandlerFunc StartMessageHandler StartMessageHandlerFunc // contains filtered or unexported fields }
func NewNATSManager ¶
func NewNATSManager() *NATS
func (*NATS) SendCrawlMessage ¶
func (*NATS) SendResultsMessage ¶
func (*NATS) SendStartMessage ¶
func (*NATS) SubscribeCrawlSubject ¶
func (n *NATS) SubscribeCrawlSubject(f CrawlMessageHandlerFunc)
func (*NATS) SubscribeResultsSubject ¶
func (n *NATS) SubscribeResultsSubject(f ResultsMessageHandlerFunc)
func (*NATS) SubscribeStartSubject ¶
func (n *NATS) SubscribeStartSubject(f StartMessageHandlerFunc)
type ResultContainer ¶
type ResultsMessage ¶
type ResultsMessageHandlerFunc ¶
type ResultsMessageHandlerFunc func(c *ResultsMessage)
type SiteMap ¶
type SiteMap struct {
// contains filtered or unexported fields
}
A SiteMap is the data structure used to store a list of links found at crawled URLs. A sync.RWMutex provides access control to the internal map.
func NewSiteMap ¶
func NewSiteMap() *SiteMap
NewSiteMap returns a SiteMap instance with an empty internal map, ready for URLs and links to be added.
func (*SiteMap) AddURL ¶
AddURL adds an entry to the internal map for a given URL and initialises the list of links with an empty map.
func (*SiteMap) GetLinks ¶
GetLinks returns the slice of links available for a given URL key. If the URL exists in the internal map the links are returned to the caller along with a boolean with a value of true. If the key is not found in the map a nil slice is returned along with a boolean value of false.
func (*SiteMap) MarshalJSON ¶
func (*SiteMap) UpdateURLWithLinks ¶
UpdateURLWithLinks associates the provided slice of links with the given parent URL.
type StartMessage ¶
type StartMessageHandlerFunc ¶
type StartMessageHandlerFunc func(c *StartMessage)
type SynchronousCrawlEngine ¶
type SynchronousCrawlEngine struct {
// contains filtered or unexported fields
}
A SynchronousCrawlEngine recursively visits extracted URLs one URL at a time up to a specified tree depth.
func NewSynchronousCrawlEngine ¶
func NewSynchronousCrawlEngine(sitemap *SiteMap, maxDepth int, startURL string) *SynchronousCrawlEngine
NewSynchronousCrawlEngine returns a pointer to an instance of a SynchronousCrawlEngine.
func (*SynchronousCrawlEngine) Run ¶
func (c *SynchronousCrawlEngine) Run()
Run begins the sitemap crawl activity for the SynchronousCrawlEngine.