scraper

package
v0.0.0-...-03c5aec Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 13, 2024 License: MIT Imports: 12 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func IsFinal

func IsFinal(url string) bool

IsFinal checks if the URL is a folder-like path, like example.com/path/

func IsInSlice

func IsInSlice(search string, array []string) bool

IsInSlice checks if the given link is in a slice

func PrintUsage

func PrintUsage()

PrintUsage prints the usage message

func RemoveLastSlash

func RemoveLastSlash(url string) string

RemoveLastSlash removes the last slash

Types

type Config

type Config struct {
	// Original domain
	OldDomain string `long:"u" short:"u"`

	// New domain to rewrite the download HTML sites
	NewDomain string `long:"new" short:"new"`

	// URL prefixes/roots that should be included in the scraper
	IncludedURLs string `long:"r" short:"r"`

	// Roots contains a range of URLs that can be considered the root
	// This is useful for scraping sites where content is hosted on a CDN
	// Not a flag. This will be filled by the scraper upon setup
	Roots []string

	// Path where to save the downloads
	DownloadPath string `long:"path" short:"path"`

	// Use args on URLs
	UseQueries bool `long:"q" short:"q"`

	// Number of concurrent queries
	Simultaneous int `long:"s" short:"s"`
}

Config holds the scraper configuration

func ParseFlags

func ParseFlags() (*Config, error)

ParseFlags parses command line arguments and validates them

type Console

type Console interface {
	AddDomain(string)
	AddStatus(string)
	AddStarted()
	AddFinished()
	AddAttachments()
	AddDownloaded()
	AddDownloading()
	AddErrors(string)
}

Console interface

type HttpGet

type HttpGet interface {
	ParseURL(baseURLString, relativeURLString string) (final string, err error)
	Get(link string) (final string, status int, buff *bytes.Buffer, err error)
}

HttpGet interface

type Links struct {
	Href string
}

Links model

type Page

type Page struct {
	URL       string
	Canonical string
	Links     []Links
	HTML      string
}

Page model

type Scraper

type Scraper struct {
	// Original domain
	OldDomain string

	// New domain to rewrite the download HTML sites
	NewDomain string

	// Roots contains a range of URLs that can be considered the root
	// This is useful for scraping sites where content is hosted on a CDN
	Roots []string

	// Path where to save the downloads
	DownloadPath string

	// Use args on URLs
	UseQueries bool

	// Number of concurrent queries
	Simultaneous int

	// Scanning now
	Scanning chan int

	// New links found
	NewLinks chan []Links

	// Pages to save
	Pages chan Page

	// Attachments found
	Attachments chan []string

	// Started
	Started chan int

	// Finished
	Finished chan int

	// Indexed pages
	Indexed []string

	// Pages for sitemap
	ForSitemap []string

	// Files to download
	Files []string

	// Seen links
	Seen map[string]bool

	// Start time
	StartTime time.Time

	// GetInterface
	Get HttpGet

	// Console
	Con Console
}

func New

func New(conf *Config, getter HttpGet, con Console) (*Scraper, error)

New creates a new Scraper

func (*Scraper) Close

func (s *Scraper) Close()

Close closes the channels

func (*Scraper) DoesLinkExist

func (s *Scraper) DoesLinkExist(newLink Links, existingLinks []Links) (exists bool)

DoesLinkExist checks if a link exists in a given slice

func (*Scraper) DownloadAttachments

func (s *Scraper) DownloadAttachments()

DownloadAttachments downloads the attachments

func (*Scraper) GetInsideAttachments

func (s *Scraper) GetInsideAttachments(link string) (attachments []string, err error)

GetInsideAttachments gets inside CSS and JS Files

func (*Scraper) GetLastFolder

func (s *Scraper) GetLastFolder(path string) string

GetLastFolder returns the last folder of a path

func (*Scraper) GetPath

func (s *Scraper) GetPath(url string) (path string)

GetPath returns the path of a given URL

func (*Scraper) HasRenderedExtension

func (s *Scraper) HasRenderedExtension(link string) bool

HasRenderedExtension checks if the link has a rendered extension

func (s *Scraper) IsInternLink(link string) bool

IsInternLink checks if a link is internal

func (*Scraper) IsLinkScanned

func (s *Scraper) IsLinkScanned(link string, scanned []string) (exists bool)

IsLinkScanned checks if a link has already been scanned

func (*Scraper) IsStart

func (s *Scraper) IsStart(link string) bool

IsStart checks if the site is the start site

func (*Scraper) IsURLInSlice

func (s *Scraper) IsURLInSlice(search string, array []string) bool

IsURLInSlice checks if a URL is in a slice

func (*Scraper) IsValidAttachment

func (s *Scraper) IsValidAttachment(link string) bool

IsValidAttachment checks if the link is a valid extension, not a site

func (*Scraper) IsValidExtension

func (s *Scraper) IsValidExtension(link string) bool

IsValidExtension checks if an extension is valid

func (s *Scraper) IsValidLink(link string) (ok bool)

IsValidLink checks if the link is a valid url and from the domain

func (*Scraper) IsValidSite

func (s *Scraper) IsValidSite(link string) bool

IsValidSite checks if the link is a site and not an attachment

func (*Scraper) PreparePathsFile

func (s *Scraper) PreparePathsFile(url string) (folder, filename string)

PreparePathsFile prepares the folder and filename for a given URL, assuming it's a file

func (*Scraper) PreparePathsPage

func (s *Scraper) PreparePathsPage(url string) (folder, filename string)

PreparePathsPage prepares the folder and filename for a given URL, assuming it's a page

func (*Scraper) RemoveDomain

func (s *Scraper) RemoveDomain(link string) string

RemoveDomain returns only the path, without domain, from the given link

func (*Scraper) RemoveTrailingSlash

func (s *Scraper) RemoveTrailingSlash(link string) string

RemoveTrailingSlash removes the trailing slash from a link

func (*Scraper) Run

func (s *Scraper) Run()

Run runs the scraper

func (*Scraper) SanitizeURL

func (s *Scraper) SanitizeURL(link string) string

SanitizeURL sanitizes a URL

func (*Scraper) SaveAttachment

func (s *Scraper) SaveAttachment(url string) (err error)

SaveAttachment downloads and saves a single attachment

func (*Scraper) SaveHTML

func (s *Scraper) SaveHTML(url string, html string) (err error)

SaveHTML saves the HTML content of a single page

func (*Scraper) Scrape

func (s *Scraper) Scrape()

Scrape scrapes the site

func (s *Scraper) TakeLinks(link string)

TakeLinks takes links from the given site

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL