scraper

package
v0.0.0-...-03c5aec Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 13, 2024 License: MIT Imports: 12 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func IsFinal

func IsFinal(url string) bool

IsFinal checks if the URL is a folder-like path, like example.com/path/

func IsInSlice

func IsInSlice(search string, array []string) bool

IsInSlice checks if the given link is in a slice

func PrintUsage

func PrintUsage()

PrintUsage prints the usage message

func RemoveLastSlash

func RemoveLastSlash(url string) string

RemoveLastSlash removes the last slash

Types

type Config

type Config struct {
	// Original domain
	OldDomain string `long:"u" short:"u"`

	// New domain to rewrite the download HTML sites
	NewDomain string `long:"new" short:"new"`

	// URL prefixes/roots that should be included in the scraper
	IncludedURLs string `long:"r" short:"r"`

	// Roots contains a range of URLs that can be considered the root
	// This is useful for scraping sites where content is hosted on a CDN
	// Not a flag. This will be filled by the scraper upon setup
	Roots []string

	// Path where to save the downloads
	DownloadPath string `long:"path" short:"path"`

	// Use args on URLs
	UseQueries bool `long:"q" short:"q"`

	// Number of concurrent queries
	Simultaneous int `long:"s" short:"s"`
}

Config holds the scraper configuration

func ParseFlags

func ParseFlags() (*Config, error)

ParseFlags parses command line arguments and validates them

type Console

type Console interface {
	AddDomain(string)
	AddStatus(string)
	AddStarted()
	AddFinished()
	AddAttachments()
	AddDownloaded()
	AddDownloading()
	AddErrors(string)
}

Console interface

type HttpGet

type HttpGet interface {
	ParseURL(baseURLString, relativeURLString string) (final string, err error)
	Get(link string) (final string, status int, buff *bytes.Buffer, err error)
}

HttpGet interface

type Links struct {
	Href string
}

Links model

type Page

type Page struct {
	URL       string
	Canonical string
	Links     []Links
	HTML      string
}

Page model

type Scraper

type Scraper struct {
	// Original domain
	OldDomain string

	// New domain to rewrite the download HTML sites
	NewDomain string

	// Roots contains a range of URLs that can be considered the root
	// This is useful for scraping sites where content is hosted on a CDN
	Roots []string

	// Path where to save the downloads
	DownloadPath string

	// Use args on URLs
	UseQueries bool

	// Number of concurrent queries
	Simultaneous int

	// Scanning now
	Scanning chan int

	// New links found
	NewLinks chan []Links

	// Pages to save
	Pages chan Page

	// Attachments found
	Attachments chan []string

	// Started
	Started chan int

	// Finished
	Finished chan int

	// Indexed pages
	Indexed []string

	// Pages for sitemap
	ForSitemap []string

	// Files to download
	Files []string

	// Seen links
	Seen map[string]bool

	// Start time
	StartTime time.Time

	// GetInterface
	Get HttpGet

	// Console
	Con Console
}

func New

func New(conf *Config, getter HttpGet, con Console) (*Scraper, error)

New creates a new Scraper

func (*Scraper) Close

func (s *Scraper) Close()

Close closes the channels

func (*Scraper) DoesLinkExist

func (s *Scraper) DoesLinkExist(newLink Links, existingLinks []Links) (exists bool)

DoesLinkExist checks if a link exists in a given slice

func (*Scraper) DownloadAttachments

func (s *Scraper) DownloadAttachments()

DownloadAttachments downloads the attachments

func (*Scraper) GetInsideAttachments

func (s *Scraper) GetInsideAttachments(link string) (attachments []string, err error)

GetInsideAttachments gets inside CSS and JS Files

func (*Scraper) GetLastFolder

func (s *Scraper) GetLastFolder(path string) string

GetLastFolder returns the last folder of a path

func (*Scraper) GetPath

func (s *Scraper) GetPath(url string) (path string)

GetPath returns the path of a given URL

func (*Scraper) HasRenderedExtension

func (s *Scraper) HasRenderedExtension(link string) bool

HasRenderedExtension checks if the link has a rendered extension

func (s *Scraper) IsInternLink(link string) bool

IsInternLink checks if a link is internal

func (*Scraper) IsLinkScanned

func (s *Scraper) IsLinkScanned(link string, scanned []string) (exists bool)

IsLinkScanned checks if a link has already been scanned

func (*Scraper) IsStart

func (s *Scraper) IsStart(link string) bool

IsStart checks if the site is the start site

func (*Scraper) IsURLInSlice

func (s *Scraper) IsURLInSlice(search string, array []string) bool

IsURLInSlice checks if a URL is in a slice

func (*Scraper) IsValidAttachment

func (s *Scraper) IsValidAttachment(link string) bool

IsValidAttachment checks if the link is a valid extension, not a site

func (*Scraper) IsValidExtension

func (s *Scraper) IsValidExtension(link string) bool

IsValidExtension checks if an extension is valid

func (s *Scraper) IsValidLink(link string) (ok bool)

IsValidLink checks if the link is a valid url and from the domain

func (*Scraper) IsValidSite

func (s *Scraper) IsValidSite(link string) bool

IsValidSite checks if the link is a site and not an attachment

func (*Scraper) PreparePathsFile

func (s *Scraper) PreparePathsFile(url string) (folder, filename string)

PreparePathsFile prepares the folder and filename for a given URL, assuming it's a file

func (*Scraper) PreparePathsPage

func (s *Scraper) PreparePathsPage(url string) (folder, filename string)

PreparePathsPage prepares the folder and filename for a given URL, assuming it's a page

func (*Scraper) RemoveDomain

func (s *Scraper) RemoveDomain(link string) string

RemoveDomain returns only the path, without domain, from the given link

func (*Scraper) RemoveTrailingSlash

func (s *Scraper) RemoveTrailingSlash(link string) string

RemoveTrailingSlash removes the trailing slash from a link

func (*Scraper) Run

func (s *Scraper) Run()

Run runs the scraper

func (*Scraper) SanitizeURL

func (s *Scraper) SanitizeURL(link string) string

SanitizeURL sanitizes a URL

func (*Scraper) SaveAttachment

func (s *Scraper) SaveAttachment(url string) (err error)

SaveAttachment downloads and saves a single attachment

func (*Scraper) SaveHTML

func (s *Scraper) SaveHTML(url string, html string) (err error)

SaveHTML saves the HTML content of a single page

func (*Scraper) Scrape

func (s *Scraper) Scrape()

Scrape scrapes the site

func (s *Scraper) TakeLinks(link string)

TakeLinks takes links from the given site

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL