webextractor

package
v0.0.0-...-b1083d3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 15, 2024 License: MIT Imports: 11 Imported by: 0

README

Colibri ~ WebExtractor

WebExtractor provides the default interfaces for Colibri, ready to start crawling or extracting data on the web.

Quick Starts

Do
// Quick-start example: perform a single HTTP request described by a set of
// JSON-encoded rules and print basic details of the response.
package main

import (
	"encoding/json"
	"fmt"

	"github.com/gonzxlez/colibri"
	"github.com/gonzxlez/colibri/webextractor"
)

// rawRules is the JSON document decoded into colibri.Rules below.
var rawRules = `{
	"Method": "GET",
	"URL": "https://example.com"
}`

func main() {
	// webextractor.New returns a Colibri structure with default values.
	extractor, err := webextractor.New()
	if err != nil {
		panic(err)
	}

	// Decode the raw JSON into the rules structure used for the request.
	var rules colibri.Rules
	if err := json.Unmarshal([]byte(rawRules), &rules); err != nil {
		panic(err)
	}

	// Perform the request described by the rules.
	response, err := extractor.Do(&rules)
	if err != nil {
		panic(err)
	}

	fmt.Println("URL:", response.URL())
	fmt.Println("Status code:", response.StatusCode())

	contentType := response.Header().Get("Content-Type")
	fmt.Println("Content-Type", contentType)
}
URL: https://example.com
Status code: 200
Content-Type text/html; charset=UTF-8
Extract
// Quick-start example: request a page and extract data from it using the
// selectors declared in a set of JSON-encoded rules.
package main

import (
	"encoding/json"
	"fmt"

	"github.com/gonzxlez/colibri"
	"github.com/gonzxlez/colibri/webextractor"
)

// rawRules is the JSON document decoded into colibri.Rules below. Its
// "Selectors" entry maps the output key "title" to the expression
// "//head/title".
var rawRules = `{
	"Method": "GET",
	"URL":    "https://example.com",
	"Selectors": {
		"title": "//head/title"
	}
}`

func main() {
	// webextractor.New returns a Colibri structure with default values.
	we, err := webextractor.New()
	if err != nil {
		panic(err)
	}

	// Decode the raw JSON into the rules structure; this is why the
	// encoding/json import is required.
	var rules colibri.Rules
	err = json.Unmarshal([]byte(rawRules), &rules)
	if err != nil {
		panic(err)
	}

	// Extract returns an output carrying both the response and the data
	// printed below.
	output, err := we.Extract(&rules)
	if err != nil {
		panic(err)
	}

	fmt.Println("URL:", output.Response.URL())
	fmt.Println("Status code:", output.Response.StatusCode())
	fmt.Println("Content-Type", output.Response.Header().Get("Content-Type"))
	fmt.Println("Data:", output.Data)
}

URL: https://example.com
Status code: 200
Content-Type text/html; charset=UTF-8
Data: map[title:Example Domain]

Documentation

Overview

webextractor provides the default interfaces for Colibri, ready to start crawling or extracting data on the web.

Index

Constants

View Source
const DefaultTimeout = 5 * time.Second

DefaultTimeout is the default timeout used for HTTP requests.

Variables

This section is empty.

Functions

func New

func New(cookieJar ...http.CookieJar) (*colibri.Colibri, error)

New returns a new Colibri structure with default values. It returns an error if initializing those values fails.

Types

type Client

type Client struct {
	// Jar specifies the cookie jar.
	Jar http.CookieJar
	// contains filtered or unexported fields
}

Client represents an HTTP client. See the colibri.HTTPClient interface.

func NewClient

func NewClient(cookieJar ...http.CookieJar) (*Client, error)

NewClient returns a new Client structure. Only the first cookieJar passed is used; if none is passed, a new cookiejar.Jar is initialized.

func (*Client) Clear

func (client *Client) Clear()

Clear assigns nil to Jar.

func (*Client) Do

func (client *Client) Do(c *colibri.Colibri, rules *colibri.Rules) (colibri.Response, error)

Do makes an HTTP request based on the rules.

type ReqDelay

type ReqDelay struct {
	// contains filtered or unexported fields
}

ReqDelay manages the delay between each HTTP request. See the colibri.Delay interface.

func NewReqDelay

func NewReqDelay() *ReqDelay

NewReqDelay returns a new ReqDelay structure.

func (*ReqDelay) Clear

func (rd *ReqDelay) Clear()

func (*ReqDelay) Done

func (rd *ReqDelay) Done(u *url.URL)

func (*ReqDelay) Stamp

func (rd *ReqDelay) Stamp(u *url.URL)

func (*ReqDelay) Wait

func (rd *ReqDelay) Wait(u *url.URL, duration time.Duration)

type Response

type Response struct {
	HTTP *http.Response
	// contains filtered or unexported fields
}

Response represents an HTTP response. See the colibri.Response interface.

func (*Response) Body

func (resp *Response) Body() io.ReadCloser

func (*Response) Do

func (resp *Response) Do(rules *colibri.Rules) (colibri.Response, error)

func (*Response) Extract

func (resp *Response) Extract(rules *colibri.Rules) (*colibri.Output, error)

func (*Response) Header

func (resp *Response) Header() http.Header

func (*Response) Redirects

func (resp *Response) Redirects() []*url.URL

func (*Response) Serializable

func (resp *Response) Serializable() map[string]any

func (*Response) StatusCode

func (resp *Response) StatusCode() int

func (*Response) URL

func (resp *Response) URL() *url.URL

type RobotsData

type RobotsData struct {
	// contains filtered or unexported fields
}

RobotsData gets, stores and parses robots.txt restrictions.

func NewRobotsData

func NewRobotsData() *RobotsData

NewRobotsData returns a new RobotsData structure.

func (*RobotsData) Clear

func (robots *RobotsData) Clear()

Clear removes stored robots.txt restrictions.

func (*RobotsData) IsAllowed

func (robots *RobotsData) IsAllowed(c *colibri.Colibri, rules *colibri.Rules) error

IsAllowed verifies that the User-Agent can access the URL. It gets and stores the robots.txt restrictions of the URL's host for reuse with other URLs on the same host.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL