scraper

package
v0.7.5 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 27, 2024 License: Unlicense Imports: 16 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type ChapterExtractor

type ChapterExtractor struct {
	// Selector: CSS selector used to find the chapter elements.
	Selector string
	// Title: Get the title from an element found by Selector.
	Title func(*goquery.Selection) string
	// ID: Get the id from the parsed URL string.
	ID func(string) string
	// URL: Get the URL from an element found by Selector.
	URL func(*goquery.Selection) string
	// Date: Get the published date of the chapter, if available.
	Date func(*goquery.Selection) libmangal.Date
	// ScanlationGroup: Get the scanlation group, if available.
	ScanlationGroup func(*goquery.Selection) string
	// Action: Something to execute on a headless browser after the page is loaded.
	Action rod.Action
}

ChapterExtractor: responsible for finding chapter elements by selector and extracting the data.

type Configuration added in v0.5.3

type Configuration struct {
	// Name: Name of the scraper. E.g. "mangapill"
	Name string
	// Delay: Delay between requests.
	Delay time.Duration
	// Parallelism: Parallelism of the scraper.
	Parallelism uint8

	// ReverseChapters: Whether chapters should be shown in reverse order.
	ReverseChapters bool

	// NeedsHeadlessBrowser: Whether a headless browser should be used to proxy any request.
	NeedsHeadlessBrowser bool
	// Cookies: Custom cookies to pass to the request. It is a string, as it is passed as a header.
	Cookies string

	// BaseURL: Base URL of the source.
	BaseURL string
	// GenerateSearchURL: Create the search URL from the base URL and the query.
	// E.g. "one piece" -> "https://manganelo.com/search/story/one%20piece"
	GenerateSearchURL func(baseUrl, query string) (string, error)

	// MangaExtractor: Responsible for finding manga elements and extracting the data.
	MangaExtractor *MangaExtractor
	// VolumeExtractor: Responsible for finding volume elements and extracting the data.
	VolumeExtractor *VolumeExtractor
	// ChapterExtractor: Responsible for finding chapter elements and extracting the data.
	ChapterExtractor *ChapterExtractor
	// PageExtractor: Responsible for finding page elements and extracting the required data.
	PageExtractor *PageExtractor
}

Configuration: Defines behavior of the scraper.

func (*Configuration) GetActions added in v0.7.0

func (c *Configuration) GetActions() map[rod.ActionType]rod.Action

GetActions: returns the extractor Actions as a map keyed by rod.ActionType.

type MangaExtractor

type MangaExtractor struct {
	// Selector: CSS selector used to find the manga elements.
	Selector string
	// Title: Get the title from an element found by Selector.
	Title func(*goquery.Selection) string
	// URL: Get the URL from an element found by Selector.
	URL func(*goquery.Selection) string
	// ID: Get the id from the parsed URL string.
	ID func(string) string
	// Cover: Get the cover from an element found by Selector.
	Cover func(*goquery.Selection) string
	// Action: Something to execute on a headless browser after the page is loaded.
	Action rod.Action
}

MangaExtractor: responsible for finding manga elements by selector and extracting the data.

type PageExtractor

type PageExtractor struct {
	// Selector: CSS selector used to find the page elements.
	Selector string
	// URL: Get the URL from an element found by Selector.
	URL func(*goquery.Selection) string
	// Action: Something to execute on a headless browser after the page is loaded.
	Action rod.Action
}

PageExtractor: responsible for finding page elements by selector and extracting the data.

type Scraper

type Scraper struct {
	// contains filtered or unexported fields
	// (no exported fields are listed here; a *Scraper is returned by NewScraper)
}

Scraper: Generic scraper that downloads HTML pages and parses them.

func NewScraper

func NewScraper(config *Configuration, options mango.Options) (scraper *Scraper, err error)

NewScraper: generates a new scraper with the given configuration and options.

func (*Scraper) ChapterPages

func (s *Scraper) ChapterPages(_ctx context.Context, store gokv.Store, chapter mango.Chapter) ([]libmangal.Page, error)

func (*Scraper) MangaVolumes

func (s *Scraper) MangaVolumes(_ctx context.Context, store gokv.Store, manga mango.Manga) ([]libmangal.Volume, error)

func (*Scraper) SearchMangas

func (s *Scraper) SearchMangas(_ctx context.Context, store gokv.Store, query string) ([]libmangal.Manga, error)

func (*Scraper) VolumeChapters

func (s *Scraper) VolumeChapters(_ctx context.Context, store gokv.Store, volume mango.Volume) ([]libmangal.Chapter, error)

type VolumeExtractor

type VolumeExtractor struct {
	// Selector: CSS selector used to find the volume elements.
	Selector string
	// Number: Get the number from an element found by Selector.
	Number func(*goquery.Selection) int
	// Action: Something to execute on a headless browser after the page is loaded.
	Action rod.Action
}

VolumeExtractor: responsible for finding volume elements by selector and extracting the data.

Directories

Path Synopsis
rod

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL