flexiscraper

package module
Published: Nov 24, 2017 License: MIT Imports: 11 Imported by: 0

README


Flexiscraper

A simple web scraper designed for extracting structured data from a small number of pages.

Installation

go get -u github.com/harrisbaird/flexiscraper
External Dependencies

Usage

package main

import (
	"log"

	"github.com/harrisbaird/flexiscraper"
	q "github.com/harrisbaird/flexiscraper/q"
)

type HackerNewsItem struct {
	Title  string
	URL    string
	User   string
	Points int
}

func main() {
	scraper := flexiscraper.New()
	hackerNews := scraper.NewDomain("https://news.ycombinator.com/")
	c, err := hackerNews.FetchRoot()
	if err != nil {
		log.Fatalln(err)
	}

	items := []HackerNewsItem{}

	// Iterate through nodes matching an XPath expression.
	c.Each(`//tr[@class="athing"]`, func(i int, c *flexiscraper.Context) {
		item := HackerNewsItem{
			// Find is a convenience function for looking up an XPath expression,
			// returning the first result as a string.
			Title: c.Find(`.//td[@class="title"]/a`),
			URL:   c.Find(`.//td[@class="title"]/a/@href`),
			User:  c.Find(`.//following-sibling::tr//a[@class="hnuser"]`),

			// Build a more complex value and return it as an int.
			Points: c.Build(
				q.XPath(`.//following-sibling::tr//span[@class="score"]`),
				q.Regexp(`\d+`),
			).Int(),
		}

		items = append(items, item)
	})
}
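
The query helpers above do not return errors directly; the Context type documented below exposes an Errors slice instead. A minimal sketch of surfacing collected errors after scraping, assuming failed queries append to c.Errors:

for _, qerr := range c.Errors {
	log.Println("query error:", qerr)
}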

Documentation

Index

Constants

const DefaultUserAgent = "Flexiscraper (https://github.com/harrisbaird/flexiscraper)"

DefaultUserAgent is the default user agent string. It is used in all HTTP requests and during robots.txt validation.
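
To send a different user agent, set the field on the Scraper before creating a domain; a short sketch (the user agent string here is illustrative):

scraper := flexiscraper.New()
scraper.UserAgent = "MyScraper/1.0 (+https://example.com/bot)"
hackerNews := scraper.NewDomain("https://news.ycombinator.com/")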

Variables

var ErrDisallowedByRobots = errors.New("HTTP request disallowed by robots.txt")

ErrDisallowedByRobots is returned when the requested URL is disallowed by robots.txt.
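
Because ErrDisallowedByRobots is a package-level sentinel value, callers can compare against it directly; a minimal sketch (the URL is illustrative):

if _, err := hackerNews.Fetch("https://news.ycombinator.com/newest"); err == flexiscraper.ErrDisallowedByRobots {
	log.Println("skipping: disallowed by robots.txt")
} else if err != nil {
	log.Fatalln(err)
}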

var ErrNoMatches = errors.New("No matching queries")

ErrNoMatches is returned when none of the given queries produce a match.

Functions

This section is empty.

Types

type Context

type Context struct {
	URL    string
	Node   *html.Node
	Errors []error
}

func (*Context) Attr

func (c *Context) Attr(expr string) string

func (*Context) Build

func (c *Context) Build(input q.InputFunc, processors ...q.ProcessorFunc) *QueryValue

func (*Context) Each

func (c *Context) Each(expr string, fn func(int, *Context))

Each finds nodes matching an xpath expression and calls the given function for each node.

func (*Context) Find

func (c *Context) Find(expr string) string

Find looks up a given xpath expression and returns the first match.

func (*Context) FindAll

func (c *Context) FindAll(expr string) []string

FindAll looks up a given xpath expression and returns all matches.
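
For example, FindAll can collect every story link from the listing page in one call (the XPath is illustrative):

links := c.FindAll(`//td[@class="title"]/a/@href`)
for _, link := range links {
	log.Println(link)
}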

func (*Context) Or

func (c *Context) Or(values ...*QueryValue) *QueryValue
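
Or is undocumented here; given ErrNoMatches above, a plausible reading is that it returns the first supplied query value that matched and reports an error when none did. A hedged sketch, reusing the q helpers from the usage example (the XPath expressions are illustrative):

title := c.Or(
	c.Build(q.XPath(`.//td[@class="title"]/a`)),
	c.Build(q.XPath(`.//a[@class="storylink"]`)),
)
if title.Error != nil {
	log.Println("no title matched:", title.Error)
}
log.Println(title.String())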

type Domain

type Domain struct {
	*Scraper
	Domain     *url.URL
	RobotsData *robotstxt.Group
}

Domain defines implementation for scraping a single domain.

func (*Domain) Fetch

func (d *Domain) Fetch(url string) (*Context, error)

Fetch fetches and parses HTML from the given URL, checking and obeying robots.txt if ObeyRobots is true on the scraper.
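
Fetch accepts any URL on the domain, for example a later listing page (the URL and XPath are illustrative):

c, err := hackerNews.Fetch("https://news.ycombinator.com/news?p=2")
if err != nil {
	log.Fatalln(err)
}
log.Println(c.Find(`//td[@class="title"]/a`))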

func (*Domain) FetchRoot

func (d *Domain) FetchRoot() (*Context, error)

FetchRoot is a convenience function for fetching the current domain's root URL.

func (*Domain) Parse

func (d *Domain) Parse(context *Context, r io.Reader) error

Parse parses HTML from the given reader.
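
Parse makes it possible to work from a saved HTML fixture rather than a live request; a sketch, assuming Parse fills in the Node of the Context it is given (the fixture path is hypothetical, and the os import is required):

f, err := os.Open("testdata/front_page.html") // hypothetical fixture file
if err != nil {
	log.Fatalln(err)
}
defer f.Close()

ctx := &flexiscraper.Context{URL: "https://news.ycombinator.com/"}
if err := hackerNews.Parse(ctx, f); err != nil {
	log.Fatalln(err)
}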

type QueryValue

type QueryValue struct {
	Value []string
	Error error
}

func (*QueryValue) Int

func (qv *QueryValue) Int() int

func (*QueryValue) IntSlice

func (qv *QueryValue) IntSlice() (s []int)

func (*QueryValue) String

func (qv *QueryValue) String() string

func (*QueryValue) StringSlice

func (qv *QueryValue) StringSlice() []string

type Scraper

type Scraper struct {
	// The user agent string sent during http requests and when checking
	// robots.txt.
	UserAgent string

	// The http client to use when fetching, defaults to http.DefaultClient.
	HTTPClient *http.Client

	// ObeyRobots enables robots.txt policy checking.
	// Default: true
	ObeyRobots bool
}

A Scraper defines the parameters for running a web scraper.
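
All three fields can be set after New and before any fetching; a short sketch (the timeout is illustrative, and the net/http and time imports are required):

scraper := flexiscraper.New()
scraper.HTTPClient = &http.Client{Timeout: 10 * time.Second}
scraper.ObeyRobots = true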

func New

func New() *Scraper

New initialises a new Scraper.

func (*Scraper) NewDomain

func (s *Scraper) NewDomain(baseDomain string) *Domain

NewDomain initialises a new domain. This is used for robots.txt and to ensure absolute URLs.

Directories

Path Synopsis
q
