goscraper

package module
v0.0.0-...-5dd20d9 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 30, 2018 License: MIT Imports: 12 Imported by: 0

README

goscraper

Go package that quickly returns a preview of a web page: you can easily get its title, description, and images.

Usage

// main fetches a preview of the W3C home page and prints its title,
// description, first image (when one was found) and link.
func main() {
	// Scrape is documented as taking a *Options, so the URL and the value 5
	// are supplied through the Url and MaxRedirect fields.
	s, err := goscraper.Scrape(&goscraper.Options{
		Url:         "https://www.w3.org/",
		MaxRedirect: 5,
	})
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("Title : %s\n", s.Preview.Title)
	fmt.Printf("Description : %s\n", s.Preview.Description)
	// Images is a slice and can be empty; indexing it unguarded would panic.
	if len(s.Preview.Images) > 0 {
		fmt.Printf("Image: %s\n", s.Preview.Images[0])
	}
	fmt.Printf("Url : %s\n", s.Preview.Link)
}

output:

Title : World Wide Web Consortium (W3C)
Description : The World Wide Web Consortium (W3C) is an international community where Member organizations, a full-time staff, and the public work together to develop Web standards.
Image: https://www.w3.org/2008/site/images/logo-w3c-mobile-lg
Url : https://www.w3.org/

License

Goscraper is licensed under the MIT License.

Documentation

Overview

2017/12/29 21:15:58 Fri

2017/12/30 17:11:13 Sat

2017/12/29 23:05:54 Fri

Index

Constants

View Source
// Value-kind constants, numbered from 0 via iota.
// NOTE(review): presumably these are the values stored in TagNode.ValType
// to tell image attributes from script attributes — confirm.
const (
	// ValImage is the first value in the sequence (0).
	ValImage = iota
	// ValScript is the second value in the sequence (1).
	ValScript
)

Variables

View Source
var (
	// EscapedFragment holds the literal "_escaped_fragment_=".
	// NOTE(review): presumably the query-string marker used when building
	// the EscapedFragmentUrl fields of HtmlParser and Scraper — confirm.
	EscapedFragment string = "_escaped_fragment_="
)

Functions

func AddTagNode

func AddTagNode(tag string, node []TagNode) map[string][]TagNode

func HasSuffix

func HasSuffix(str string, exts []string) bool

func IsImageUrl

func IsImageUrl(s string) bool

func IsJsUrl

func IsJsUrl(s string) bool

Types

type DefaultHandler

// DefaultHandler is a field-less type whose OnImage and OnScript methods
// have the signatures required by the TagHandler interface.
type DefaultHandler struct {
}

func (DefaultHandler) OnImage

func (h DefaultHandler) OnImage(parnUrl *url.URL, tag, key, val string, sel *Selection) (string, bool)

func (DefaultHandler) OnScript

func (h DefaultHandler) OnScript(parnUrl *url.URL, tag, key, val string, sel *Selection) (string, bool)

func (*DefaultHandler) TagsFilter

func (h *DefaultHandler) TagsFilter(tag string) ([]TagNode, bool)

type Document

// Document is the result type returned by Scrape, ScrapeRedirect and the
// Scrape methods of HtmlParser and Scraper.
type Document struct {
	// Embedded goquery document; its methods are promoted onto Document.
	*goquery.Document
	// Body is a byte buffer.
	// NOTE(review): presumably the raw page body that was parsed — confirm.
	Body    *bytes.Buffer
	// Preview carries the Title, Description, Images, Scripts and Link
	// values that callers read (see the README usage example).
	Preview DocumentPreview
}

func Scrape

func Scrape(opts *Options) (*Document, error)

func ScrapeRedirect

func ScrapeRedirect(opts *Options) (*Document, error)

func Scrape(uri, body string, maxRedirect int, handler TagHandler) (*Document, error) {

type DocumentPreview

// DocumentPreview groups the preview data of a page: title, description,
// image and script lists, and a link.
type DocumentPreview struct {
	// Title and Description are the page's title and description text.
	Title       string
	Description string
	// Images/Scripts and RawImages/RawScripts are parallel string lists.
	// NOTE(review): presumably the non-raw lists hold resolved URLs and the
	// raw lists hold the attribute values as found in the HTML — confirm.
	Images      []string
	RawImages   []string
	Scripts     []string
	RawScripts  []string

	// Link is printed as the page URL in the README usage example.
	Link string
}

type HtmlParser

// HtmlParser is the type returned by NewScrape. Its Scrape method produces
// a Document and its AddTagNode method takes a tag name plus TagNode list.
type HtmlParser struct {
	// Url is a parsed URL.
	// NOTE(review): presumably the address of the page being scraped — confirm.
	Url                *url.URL
	// NOTE(review): presumably Url rewritten with the EscapedFragment
	// query marker — confirm.
	EscapedFragmentUrl *url.URL
	// Body is a byte slice.
	// NOTE(review): presumably the HTML to parse — confirm.
	Body               []byte
	// NOTE(review): presumably the maximum number of redirects to follow — confirm.
	MaxRedirect        int

	// Preview is the preview data held by the parser.
	Preview DocumentPreview
	// TagsMap maps a string key to a list of TagNode.
	// NOTE(review): presumably keyed by HTML tag name — confirm.
	TagsMap map[string][]TagNode
	// contains filtered or unexported fields
}

func NewScrape

func NewScrape(opts *Options) (*HtmlParser, error)

func (*HtmlParser) AddTagNode

func (scraper *HtmlParser) AddTagNode(tag string, node []TagNode)

func (*HtmlParser) Scrape

func (scraper *HtmlParser) Scrape() (*Document, error)

type Options

// Options is the settings value accepted by Scrape, ScrapeRedirect and
// NewScrape.
type Options struct {
	// Url is the page address, given as a string.
	Url         string
	// NOTE(review): presumably the maximum number of redirects to follow — confirm.
	MaxRedirect int
	// NOTE(review): presumably already-fetched HTML to parse instead of
	// downloading Url — confirm.
	Body        string
	// Handler is the TagHandler whose OnImage/OnScript callbacks are used.
	Handler     TagHandler
	// NOTE(review): presumably the path of a local HTML file to read — confirm.
	HtmlFile    string
}

type Scraper

// Scraper exposes a Scrape method that returns a Document. Its exported
// fields have the same names and types as the first four fields of HtmlParser.
type Scraper struct {
	// Url is a parsed URL.
	// NOTE(review): presumably the address of the page being scraped — confirm.
	Url                *url.URL
	// NOTE(review): presumably Url rewritten with the EscapedFragment
	// query marker — confirm.
	EscapedFragmentUrl *url.URL
	// Body is a byte slice.
	// NOTE(review): presumably the HTML to parse — confirm.
	Body               []byte
	// NOTE(review): presumably the maximum number of redirects to follow — confirm.
	MaxRedirect        int
	// contains filtered or unexported fields
}

func (*Scraper) Scrape

func (scraper *Scraper) Scrape() (*Document, error)

type Selection

// Selection pairs a goquery selection with a string-to-string map. It is
// the type handed to the TagHandler callbacks and has an Attr method.
type Selection struct {
	// Sel is the underlying goquery selection.
	Sel     *goquery.Selection
	// AttrMap maps strings to strings.
	// NOTE(review): presumably attribute name -> value for the selected
	// element, consulted by Attr — confirm.
	AttrMap map[string]string
}

func (Selection) Attr

func (s Selection) Attr(name string) (string, bool)

type TagHandler

// TagHandler is the callback interface held in Options.Handler.
//
// OnImage and OnScript share one shape: each takes a *url.URL, three
// strings and a *Selection, and returns a string and a bool. DefaultHandler
// names those parameters parnUrl, tag, key, val and sel.
// NOTE(review): presumably the returned string is the (possibly rewritten)
// value and the bool reports whether to keep it — confirm.
//
// The TagsFilter line inside the interface is commented out, so it is not
// part of the method set.
type TagHandler interface {
	// TagsFilter(string) ([]TagNode, bool)
	OnImage(*url.URL, string, string, string, *Selection) (string, bool)
	OnScript(*url.URL, string, string, string, *Selection) (string, bool)
}

type TagNode

// TagNode is the element type of the lists stored in HtmlParser.TagsMap
// and passed to AddTagNode; it has a HasAttr method.
type TagNode struct {
	// ValType is an int.
	// NOTE(review): presumably one of ValImage or ValScript — confirm.
	ValType int
	// Attrs is a list of strings.
	// NOTE(review): presumably attribute names, searched by HasAttr — confirm.
	Attrs   []string
}

func (TagNode) HasAttr

func (l TagNode) HasAttr(attr string) bool

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL