crawler

package
v0.0.0-...-e082e2d Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 20, 2024 License: MIT Imports: 17 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// AlreadyRunningError is a sentinel error whose message is
// "processor already running".
// NOTE(review): presumably returned when processing is requested while a run
// is already in progress — confirm against the callers.
var AlreadyRunningError = errors.New("processor already running")
View Source
// ConfigNotLoadedError is a sentinel error whose message is
// "config not loaded".
// NOTE(review): presumably reported when the configuration is used before
// LoadConfig has succeeded — confirm against the callers.
var ConfigNotLoadedError = errors.New("config not loaded")
View Source
// CrawlerNotFoundError is a sentinel error whose message is
// "crawler not found".
// NOTE(review): presumably returned when no registered crawler supports a
// url — confirm against the callers.
var CrawlerNotFoundError = errors.New("crawler not found")
View Source
// NotRunningError is a sentinel error whose message is
// "processor not running".
// NOTE(review): presumably returned when a stop is requested while nothing is
// being processed — confirm against the callers.
var NotRunningError = errors.New("processor not running")
View Source
// NotStartedError is a sentinel error whose message is
// "processor not started".
// NOTE(review): presumably returned when the processor is used before Run has
// been called — confirm against the callers.
var NotStartedError = errors.New("processor not started")

Functions

func GetSelenium

func GetSelenium() (selenium.WebDriver, error)

func LoadConfig

func LoadConfig() error

Types

type Config

// Config groups the package's configuration values. The mapstructure tag on
// each field names the configuration key associated with that field.
type Config struct {
	// Database settings.
	DatabaseUrl  string `mapstructure:"DATABASE_URL"`
	DatabaseName string `mapstructure:"DATABASE_NAME"`

	// Crawler settings.
	// NOTE(review): CrawlerPeriod is presumably the interval between periodic
	// runs and CrawlerPagesCount a cap on crawled list pages — confirm.
	CrawlerPeriod     time.Duration `mapstructure:"CRAWLER_PERIOD"`
	CrawlerPagesCount int           `mapstructure:"CRAWLER_PAGES_COUNT"`

	// Selenium settings.
	// NOTE(review): presumably the address GetSelenium connects to — confirm.
	SeleniumUrl string `mapstructure:"SELENIUM_URL"`

	// Server settings.
	ServerEnabled bool   `mapstructure:"SERVER_ENABLED"`
	ServerPort    int    `mapstructure:"SERVER_PORT"`
	ServerIp      string `mapstructure:"SERVER_IP"`
}

func GetConfig

func GetConfig() Config

type Factory

// Factory is an interface for creating new crawlers and for determining
// whether a given url is supported.
type Factory interface {

	// NewPageCrawler should return a value implementing the PageCrawler interface.
	NewPageCrawler() PageCrawler

	// NewListCrawler should return a value implementing the ListCrawler interface.
	NewListCrawler() ListCrawler

	// MatchUrl should return the MatchType for the given url:
	//   - CrawlerMatchPage if the url is supported by NewPageCrawler
	//   - CrawlerMatchList if the url is supported by NewListCrawler
	//   - CrawlerNotMatch if the url is not supported
	MatchUrl(url string) MatchType
}

Factory is an interface for creating new crawlers and for determining whether a given URL is supported.

type FactoryRegistry

// FactoryRegistry exposes Register, which accepts a Factory, and GetCrawler,
// which returns a PageCrawler and a ListCrawler for a url. Instances are
// created with NewCrawlerFactoryRegistry.
type FactoryRegistry struct {
	// contains filtered or unexported fields
}

func NewCrawlerFactoryRegistry

func NewCrawlerFactoryRegistry() *FactoryRegistry

func (*FactoryRegistry) GetCrawler

func (r *FactoryRegistry) GetCrawler(url string) (PageCrawler, ListCrawler)

func (*FactoryRegistry) Register

func (r *FactoryRegistry) Register(factory Factory)

type ListCrawler

// ListCrawler is an interface for crawling a list of offers.
type ListCrawler interface {

	// GetUrls should return the list of urls found at the given url, or an error.
	GetUrls(wd selenium.WebDriver, url string) ([]string, error)

	// NextPage should return the url of the next page, or an error.
	NextPage(wd selenium.WebDriver, url string) (string, error)
}

ListCrawler is an interface for crawling a list of offers.

type MatchType

// MatchType is the result a Factory's MatchUrl reports for a url.
type MatchType int

// The MatchType values, numbered 0, 1 and 2 in declaration order.
const (
	// CrawlerNotMatch means the url is not supported.
	CrawlerNotMatch MatchType = iota
	// CrawlerMatchPage means the url is supported by the page crawler.
	CrawlerMatchPage
	// CrawlerMatchList means the url is supported by the list crawler.
	CrawlerMatchList
)

type NotificationSender

// NotificationSender is the notification dependency accepted by
// NewSitesProcessor as its sender argument.
type NotificationSender interface {
	// TrySendNotification receives a context, an offer and the action
	// associated with that offer, and returns an error.
	// NOTE(review): the "Try" prefix suggests the implementation may decide
	// not to send — confirm what a nil error means here.
	TrySendNotification(ctx context.Context, offer *model.Offer, action model.OfferAction) error
}

type Offer

// Offer is the record a PageCrawler's CrawlOffer returns for one offer url;
// SitesProcessor.MapOfferToDB accepts it for conversion to *model.Offer.
type Offer struct {
	// Where the offer comes from and its update timestamp.
	SiteId, Site string
	UpdateTime   time.Time

	// Textual description of the offer.
	Name, Description string

	// Numeric attributes.
	// NOTE(review): units (currency, square metres, ...) are not visible
	// here — confirm with the crawler implementations.
	Price int
	Area  float32

	Rooms, Floor, BuildingFloors, Year int

	// Remaining descriptive attributes.
	Heating, Market, Window string
	Elevator, Balcony       bool
	Media                   []string
}

type PageCrawler

// PageCrawler is an interface for crawling a single offer page.
type PageCrawler interface {

	// CrawlOffer should return an Offer populated with all the data from the
	// given url, or an error.
	CrawlOffer(wd selenium.WebDriver, url string) (*Offer, error)
}

PageCrawler is an interface for crawling a single offer page.

type SitesProcessor

// SitesProcessor is created with NewSitesProcessor. Its Process method runs
// the processor once; its Run method starts the processor with periodic jobs.
type SitesProcessor struct {
	// contains filtered or unexported fields
}

func NewSitesProcessor

func NewSitesProcessor(ctx context.Context, registry *FactoryRegistry, sender NotificationSender,
	watchUrlRepo db.WatchUrlRepository, offerRepo db.OfferRepository) *SitesProcessor

func (*SitesProcessor) GetProcessingState

func (s *SitesProcessor) GetProcessingState() (admin.ProcessorState, error)

func (*SitesProcessor) MapOfferToDB

func (s *SitesProcessor) MapOfferToDB(offer *Offer, url string) *model.Offer

func (*SitesProcessor) Process

func (s *SitesProcessor) Process() error

Process runs the processor once

func (*SitesProcessor) ProcessSite

func (s *SitesProcessor) ProcessSite(ctx context.Context, wd selenium.WebDriver, url string) error

func (*SitesProcessor) ProcessSiteList

func (s *SitesProcessor) ProcessSiteList(ctx context.Context, wd selenium.WebDriver, url string) error

func (*SitesProcessor) Run

func (s *SitesProcessor) Run() error

Run starts the processor with periodic jobs

func (*SitesProcessor) RunProcessing

func (s *SitesProcessor) RunProcessing() error

func (*SitesProcessor) StopProcessing

func (s *SitesProcessor) StopProcessing() error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL