pagser

package module
v0.1.6 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 15, 2023 License: MIT Imports: 12 Imported by: 7

README

Pagser

go-doc-img travis-img go-report-card-img Coverage Status

Pagser is inspired by "page parser".

Pagser is a simple, extensible, configurable library that parses and deserializes an HTML page into a struct, based on goquery and struct tags, for Golang crawlers.

Contents

Install

go get -u github.com/foolin/pagser

Or get the specified version:

go get github.com/foolin/pagser@{version}

The {version} release list: https://github.com/foolin/pagser/releases

Features

  • Simple - Use golang struct tag syntax.
  • Easy - Easy use for your spider/crawler/colly application.
  • Extensible - Support for extension functions.
  • Struct tag grammar - Grammar is simple, like `pagser:"a->attr(href)"`.
  • Nested Structure - Support Nested Structure for node.
  • Configurable - Support configuration.
  • Implicit type conversion - The output string is automatically converted to the field type: int, int64, float64...
  • GoQuery/Colly - Support all goquery project, such as go-colly.

Docs

See Pagser

Usage


package main

import (
	"encoding/json"
	"github.com/foolin/pagser"
	"log"
)

const rawPageHtml = `
<!doctype html>
<html>
<head>
    <meta charset="utf-8">
    <title>Pagser Title</title>
	<meta name="keywords" content="golang,pagser,goquery,html,page,parser,colly">
</head>

<body>
	<h1>H1 Pagser Example</h1>
	<div class="navlink">
		<div class="container">
			<ul class="clearfix">
				<li id=''><a href="/">Index</a></li>
				<li id='2'><a href="/list/web" title="web site">Web page</a></li>
				<li id='3'><a href="/list/pc" title="pc page">Pc Page</a></li>
				<li id='4'><a href="/list/mobile" title="mobile page">Mobile Page</a></li>
			</ul>
		</div>
	</div>
</body>
</html>
`

// PageData is the model the example page is parsed into. Each field's
// `pagser` tag holds a goquery selector and an optional `->function`.
type PageData struct {
	// Title is the text of the <title> element (no function given in the tag).
	Title    string   `pagser:"title"`
	// Keywords is the content attribute of the keywords meta tag, split into a slice.
	Keywords []string `pagser:"meta[name='keywords']->attrSplit(content)"`
	// H1 is the text of the <h1> element.
	H1       string   `pagser:"h1"`
	// Navs has one entry per element matched by ".navlink li".
	Navs     []struct {
		// ID is the li's own id attribute; -1 is used when the attribute is empty.
		ID   int    `pagser:"->attrEmpty(id, -1)"`
		// Name is the text of the link inside the li.
		Name string `pagser:"a->text()"`
		// Url is the href attribute of the link inside the li.
		Url  string `pagser:"a->attr(href)"`
	} `pagser:".navlink li"`
}

func main() {
	//New default config
	p := pagser.New()

	//data parser model
	var data PageData
	//parse html data
	err := p.Parse(&data, rawPageHtml)
	//check error
	if err != nil {
		log.Fatal(err)
	}

	//print data
	log.Printf("Page data json: \n-------------\n%v\n-------------\n", toJson(data))
}

// toJson renders v as tab-indented JSON for display.
//
// If v cannot be marshalled (for example it contains a channel or a
// function), the returned string describes the error instead of being
// silently empty, so the failure is visible in the printed output.
func toJson(v interface{}) string {
	data, err := json.MarshalIndent(v, "", "\t")
	if err != nil {
		return "toJson: " + err.Error()
	}
	return string(data)
}

Run output:


Page data json: 
-------------
{
	"Title": "Pagser Title",
	"Keywords": [
		"golang",
		"pagser",
		"goquery",
		"html",
		"page",
		"parser",
		"colly"
	],
	"H1": "H1 Pagser Example",
	"Navs": [
		{
			"ID": -1,
			"Name": "Index",
			"Url": "/"
		},
		{
			"ID": 2,
			"Name": "Web page",
			"Url": "/list/web"
		},
		{
			"ID": 3,
			"Name": "Pc Page",
			"Url": "/list/pc"
		},
		{
			"ID": 4,
			"Name": "Mobile Page",
			"Url": "/list/mobile"
		}
	]
}
-------------

Configuration


// Config holds the parser settings.
type Config struct {
	TagName    string //struct tag name, default is `pagser`
	FuncSymbol string //Function symbol, default is `->`
	CastError  bool   //Returns an error when the type cannot be converted, default is `false`
	Debug      bool   //Debug mode, debug will print some log, default is `false`
}

Struct Tag Grammar

[goquery selector]->[function]

Example:


// ExamData illustrates the struct tag grammar: a goquery selector, the
// function symbol `->`, then a function name with its arguments.
type ExamData struct {
	// Href is the href attribute of the element matched by ".navLink li a".
	Href string `pagser:".navLink li a->attr(href)"`
}

1.Struct tag name: pagser
2.goquery selector: .navLink li a
3.Function symbol: ->
4.Function name: attr
5.Function arguments: href

grammar

Functions

Builtin functions
  • text() get element text, return string, this is default function, if not define function in struct tag.
  • eachText() get each element text, return []string.
  • html() get element inner html, return string.
  • eachHtml() get each element inner html, return []string.
  • outerHtml() get element outer html, return string.
  • eachOutHtml() get each element outer html, return []string.
  • attr(name) get element attribute value, return string.
  • eachAttr(name) get each element attribute value, return []string.
  • attrSplit(name, sep) get attribute value and split by separator to array string.
  • attr('value') get element attribute value by name is value, return string, eg: <input value='xxx' /> will return "xxx".
  • textSplit(sep) get element text and split by separator to array string, return []string.
  • eachTextJoin(sep) get each element text and join to string, return string.
  • eq(index) reduces the set of matched elements to the one at the specified index, return Selection for nested struct.
  • ...

More builtin functions see docs: https://pkg.go.dev/github.com/foolin/pagser?tab=doc#BuiltinFunctions

Extension functions
  • Markdown() //convert html to markdown format.
  • UgcHtml() //sanitize html

Extension functions need to be registered before use, like:

import "github.com/foolin/pagser/extensions/markdown"

p := pagser.New()

//Register Markdown
markdown.Register(p)

Custom function
Function interface

type CallFunc func(node *goquery.Selection, args ...string) (out interface{}, err error)

Define global function

// MyGlobalFunc is a custom global function that returns the node text prefixed with "Global-".
// It must be registered on the parser before use, e.g. p.RegisterFunc("MyGlob", MyGlobalFunc).
func MyGlobalFunc(node *goquery.Selection, args ...string) (out interface{}, err error) {
	return "Global-" + node.Text(), nil
}

// PageData is the model for the global-function example.
type PageData struct{
  // MyGlobalValue is filled by the function registered as `MyGlob`; no selector precedes `->`.
  MyGlobalValue string    `pagser:"->MyGlob()"`
}

// main registers MyGlobalFunc under the name `MyGlob` and then parses
// rawPageHtml into a PageData value.
func main() {

	p := pagser.New()

	//Register global function `MyGlob`
	p.RegisterFunc("MyGlob", MyGlobalFunc)

	//Todo

	//data parser model
	var data PageData
	//parse html data; the error is checked because an unused `err` does not
	//compile and would hide a failed parse
	if err := p.Parse(&data, rawPageHtml); err != nil {
		panic(err)
	}

	//...
}

Define struct function

// PageData is the model for the struct-function example.
type PageData struct {
	// MyFuncValue receives the result of the MyFunc method; it is a string
	// because MyFunc returns the text "Struct-" followed by the node text,
	// which cannot be converted to a number.
	MyFuncValue string `pagser:"->MyFunc()"`
}

// MyFunc is a struct function: it is called automatically for tags that
// name it and does not need to be registered. It returns the node text
// prefixed with "Struct-".
func (d PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) {
	out = "Struct-" + node.Text()
	return out, nil
}


// main parses rawPageHtml into a PageData value; the MyFunc struct method
// needs no registration.
func main() {

	p := pagser.New()

	//Todo

	//data parser model
	var data PageData
	//parse html data; the error is checked because an unused `err` does not
	//compile and would hide a failed parse
	if err := p.Parse(&data, rawPageHtml); err != nil {
		panic(err)
	}

	//...
}

Call Syntax

Note: all function arguments are string, single quotes are optional.

  1. Function call with no arguments

->fn()

  2. Function calls with one argument, and single quotes are optional

->fn(one)

->fn('one')

  3. Function calls with many arguments

->fn(one, two, three, ...)

->fn('one', 'two', 'three', ...)

  4. Function calls with single quotes and escape character

->fn('it\'s ok', 'two,xxx', 'three', ...)

Priority Order

Lookup function priority order:

struct method -> parent method -> ... -> global

More Examples

See advance example: https://github.com/foolin/pagser/tree/master/_examples/advance

Implicit type conversion

Automatic implicit type conversion: the output string is converted to the field type, such as int, int64, float64...

Support type:

  • bool
  • float32
  • float64
  • int
  • int32
  • int64
  • string
  • []bool
  • []float32
  • []float64
  • []int
  • []int32
  • []int64
  • []string

Examples

Crawl page example

package main

import (
	"encoding/json"
	"github.com/foolin/pagser"
	"log"
	"net/http"
)

// PageData is the model the GitHub trending page is parsed into.
type PageData struct {
	// Title is the text of the <title> element.
	Title    string `pagser:"title"`
	// RepoList has one entry per element matched by "article.Box-row".
	RepoList []struct {
		// Names is the h1 text split on '/' (second argument true: trim).
		Names       []string `pagser:"h1->textSplit('/', true)"`
		// Description is the text of the element matched by "h1 + p".
		Description string   `pagser:"h1 + p"`
		// Stars is the text of the a.muted-link element at index 0.
		Stars       string   `pagser:"a.muted-link->eqAndText(0)"`
		// Repo is the href of "h1 a" with 'https://github.com' placed before it
		// and '?from=pagser' after it; $value marks where the attribute value goes.
		Repo        string   `pagser:"h1 a->attrConcat('href', 'https://github.com', $value, '?from=pagser')"`
	} `pagser:"article.Box-row"`
}

// main downloads the GitHub trending page, parses it into a PageData value
// and logs the result as indented JSON.
func main() {
	resp, err := http.Get("https://github.com/trending")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// A non-200 response carries an error page, not the trending list, so
	// stop here instead of parsing it into empty data.
	if resp.StatusCode != http.StatusOK {
		log.Fatalf("unexpected status fetching trending page: %s", resp.Status)
	}

	//New default config
	p := pagser.New()

	//data parser model
	var data PageData
	//parse html data
	err = p.ParseReader(&data, resp.Body)
	//check error
	if err != nil {
		log.Fatal(err)
	}

	//print data
	log.Printf("Page data json: \n-------------\n%v\n-------------\n", toJson(data))
}

// toJson renders v as tab-indented JSON for display.
//
// If v cannot be marshalled (for example it contains a channel or a
// function), the returned string describes the error instead of being
// silently empty, so the failure is visible in the printed output.
func toJson(v interface{}) string {
	data, err := json.MarshalIndent(v, "", "\t")
	if err != nil {
		return "toJson: " + err.Error()
	}
	return string(data)
}


Run output:


2020/04/25 12:26:04 Page data json: 
-------------
{
	"Title": "Trending  repositories on GitHub today · GitHub",
	"RepoList": [
		{
			"Names": [
				"pcottle",
				"learnGitBranching"
			],
			"Description": "An interactive git visualization to challenge and educate!",
			"Stars": "16,010",
			"Repo": "https://github.com/pcottle/learnGitBranching?from=pagser"
		},
		{
			"Names": [
				"jackfrued",
				"Python-100-Days"
			],
			"Description": "Python - 100天从新手到大师",
			"Stars": "83,484",
			"Repo": "https://github.com/jackfrued/Python-100-Days?from=pagser"
		},
		{
			"Names": [
				"brave",
				"brave-browser"
			],
			"Description": "Next generation Brave browser for macOS, Windows, Linux, Android.",
			"Stars": "5,963",
			"Repo": "https://github.com/brave/brave-browser?from=pagser"
		},
		{
			"Names": [
				"MicrosoftDocs",
				"azure-docs"
			],
			"Description": "Open source documentation of Microsoft Azure",
			"Stars": "3,798",
			"Repo": "https://github.com/MicrosoftDocs/azure-docs?from=pagser"
		},
		{
			"Names": [
				"ahmetb",
				"kubectx"
			],
			"Description": "Faster way to switch between clusters and namespaces in kubectl",
			"Stars": "6,979",
			"Repo": "https://github.com/ahmetb/kubectx?from=pagser"
		},

        //...        

		{
			"Names": [
				"serverless",
				"serverless"
			],
			"Description": "Serverless Framework – Build web, mobile and IoT applications with serverless architectures using AWS Lambda, Azure Functions, Google CloudFunctions \u0026 more! –",
			"Stars": "35,502",
			"Repo": "https://github.com/serverless/serverless?from=pagser"
		},
		{
			"Names": [
				"vuejs",
				"vite"
			],
			"Description": "Experimental no-bundle dev server for Vue SFCs",
			"Stars": "1,573",
			"Repo": "https://github.com/vuejs/vite?from=pagser"
		}
	]
}
-------------
Colly Example

Work with colly:


p := pagser.New()


// Call the callback on every element matched by the "body" selector
collector.OnHTML("body", func(e *colly.HTMLElement) {
	//data parser model
	var data PageData
	//parse the element's goquery selection (e.Dom) into the model
	err := p.ParseSelection(&data, e.Dom)

})

Dependencies

  • github.com/PuerkitoBio/goquery

  • github.com/spf13/cast

Extensions:

  • github.com/mattn/godown

  • github.com/microcosm-cc/bluemonday

Documentation

Overview

Package pagser is a simple, easy, extensible, configurable HTML parser to struct based on goquery and struct tags. It is the parser library from scrago.

The project source code: https://github.com/foolin/pagser

Features

* Simple - Use golang struct tag syntax.

* Easy - Easy use for your spider/crawler/colly application.

* Extensible - Support for extension functions.

* Struct tag grammar - Grammar is simple, like `pagser:"a->attr(href)"`.

* Nested Structure - Support Nested Structure for node.

* Configurable - Support configuration.

* GoQuery/Colly - Support all goquery project, such as go-colly.

More info: https://github.com/foolin/pagser

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type BuiltinFunctions added in v0.0.7

type BuiltinFunctions struct {
}

BuiltinFunctions builtin functions are registered with a lowercase initial, eg: Text -> text()

func (BuiltinFunctions) AbsHref added in v0.1.2

func (builtin BuiltinFunctions) AbsHref(selection *goquery.Selection, args ...string) (out interface{}, err error)

AbsHref absHref(baseUrl) get element attribute name `href`, and convert to absolute url, return *URL. `baseUrl` is the base url like `https://example.com/`.

//<a href="/foolin/pagser">Pagser</a>
struct {
	Example string `pagser:".selector->absHref('https://github.com/')"`
}

func (BuiltinFunctions) Attr added in v0.0.7

func (builtin BuiltinFunctions) Attr(node *goquery.Selection, args ...string) (out interface{}, err error)

Attr attr(name, defaultValue='') get element attribute value, return string.

//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
	Example string `pagser:".selector->attr(href)"`
}

func (BuiltinFunctions) AttrConcat added in v0.1.1

func (builtin BuiltinFunctions) AttrConcat(node *goquery.Selection, args ...string) (out interface{}, err error)

AttrConcat attrConcat(name, text1, $value, [ text2, ... text_n ]) `name` get element attribute value by name, `text1, text2, ... text_n` The strings that you wish to join together, `$value` is placeholder for the attribute value, return string.

struct {
	Example string `pagser:".selector->attrConcat('Result:', '<', $value, '>')"`
}

func (BuiltinFunctions) AttrEmpty added in v0.1.0

func (builtin BuiltinFunctions) AttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)

AttrEmpty attrEmpty(name, defaultValue) get element attribute value, return string.

//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
	Example string `pagser:".selector->attrEmpty(href, '#')"`
}

func (BuiltinFunctions) AttrSplit added in v0.0.7

func (builtin BuiltinFunctions) AttrSplit(node *goquery.Selection, args ...string) (out interface{}, err error)

AttrSplit attrSplit(name, sep=',', trim='true') get attribute value and split by separator to array string, return []string.

struct {
	Examples []string `pagser:".selector->attrSplit('keywords', ',')"`
}

func (BuiltinFunctions) EachAttr added in v0.0.7

func (builtin BuiltinFunctions) EachAttr(node *goquery.Selection, args ...string) (out interface{}, err error)

EachAttr eachAttr(name) get each element attribute value, return []string.

//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
	Examples []string `pagser:".selector->eachAttr(href)"`
}

func (BuiltinFunctions) EachAttrEmpty added in v0.1.0

func (builtin BuiltinFunctions) EachAttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)

EachAttrEmpty eachAttrEmpty(name, defaultValue) get each element attribute value, return []string.

//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
	Examples []string `pagser:".selector->eachAttrEmpty(href, '#')"`
}

func (BuiltinFunctions) EachHtml added in v0.0.7

func (builtin BuiltinFunctions) EachHtml(node *goquery.Selection, args ...string) (out interface{}, err error)

EachHtml eachHtml() get each element inner html, return []string.

struct {
	Examples []string `pagser:".selector->eachHtml()"`
}

func (BuiltinFunctions) EachOutHtml added in v0.0.7

func (builtin BuiltinFunctions) EachOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)

EachOutHtml eachOutHtml() get each element outer html, return []string.

struct {
	Examples []string `pagser:".selector->eachOutHtml()"`
}

func (BuiltinFunctions) EachText added in v0.0.7

func (builtin BuiltinFunctions) EachText(node *goquery.Selection, args ...string) (out interface{}, err error)

EachText eachText() get each element text, return []string.

struct {
	Examples []string `pagser:".selector->eachText('')"`
}

func (BuiltinFunctions) EachTextEmpty added in v0.1.0

func (builtin BuiltinFunctions) EachTextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)

EachTextEmpty eachTextEmpty(defaultValue) get each element text, return []string.

struct {
	Examples []string `pagser:".selector->eachTextEmpty('')"`
}

func (BuiltinFunctions) EachTextJoin added in v0.1.1

func (builtin BuiltinFunctions) EachTextJoin(node *goquery.Selection, args ...string) (out interface{}, err error)

EachTextJoin eachTextJoin(sep) get each element text and join to string, return string.

struct {
	Example string `pagser:".selector->eachTextJoin(',')"`
}

func (BuiltinFunctions) EqAndAttr added in v0.0.7

func (builtin BuiltinFunctions) EqAndAttr(node *goquery.Selection, args ...string) (out interface{}, err error)

EqAndAttr eqAndAttr(index, name) reduces the set of matched elements to the one at the specified index, and attr() return string.

struct {
	Example string `pagser:".selector->eqAndAttr(0, href)"`
}

func (BuiltinFunctions) EqAndHtml added in v0.0.7

func (builtin BuiltinFunctions) EqAndHtml(node *goquery.Selection, args ...string) (out interface{}, err error)

EqAndHtml eqAndHtml(index) reduces the set of matched elements to the one at the specified index, and html() return string.

struct {
	Example string `pagser:".selector->eqAndHtml(0)"`
}

func (BuiltinFunctions) EqAndOutHtml added in v0.0.7

func (builtin BuiltinFunctions) EqAndOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)

EqAndOutHtml eqAndOutHtml(index) reduces the set of matched elements to the one at the specified index, and outHtml() return string.

struct {
	Example string `pagser:".selector->eqAndOutHtml(0)"`
}

func (BuiltinFunctions) EqAndText added in v0.1.1

func (builtin BuiltinFunctions) EqAndText(node *goquery.Selection, args ...string) (out interface{}, err error)

EqAndText eqAndText(index) reduces the set of matched elements to the one at the specified index, return string.

struct {
	Example string `pagser:".selector->eqAndText(0)"`
}

func (BuiltinFunctions) Html added in v0.0.7

func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (out interface{}, err error)

Html html() get element inner html, return string.

struct {
	Example string `pagser:".selector->html()"`
}

func (BuiltinFunctions) OutHtml added in v0.0.7

func (builtin BuiltinFunctions) OutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)

OutHtml outerHtml() get element outer html, return string.

struct {
	Example string `pagser:".selector->outerHtml()"`
}

func (BuiltinFunctions) Size added in v0.1.3

func (builtin BuiltinFunctions) Size(node *goquery.Selection, args ...string) (out interface{}, err error)

Size size() returns the number of elements in the Selection object, return int.

struct {
	Size int `pagser:".selector->size()"`
}

func (BuiltinFunctions) Text added in v0.0.7

func (builtin BuiltinFunctions) Text(node *goquery.Selection, args ...string) (out interface{}, err error)

Text text() get element text, return string, this is default function, if not define function in struct tag.

struct {
	Example string `pagser:".selector->text()"`
}

func (BuiltinFunctions) TextConcat added in v0.1.1

func (builtin BuiltinFunctions) TextConcat(node *goquery.Selection, args ...string) (out interface{}, err error)

TextConcat textConcat(text1, $value, [ text2, ... text_n ]) The `text1, text2, ... text_n` strings that you wish to join together, `$value` is placeholder for get element text, return string.

struct {
	Example string `pagser:".selector->textConcat('Result:', '<', $value, '>')"`
}

func (BuiltinFunctions) TextEmpty added in v0.1.0

func (builtin BuiltinFunctions) TextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)

TextEmpty textEmpty(defaultValue) get element text, if empty will return defaultValue, return string.

struct {
	Example string `pagser:".selector->textEmpty('')"`
}

func (BuiltinFunctions) TextSplit added in v0.1.1

func (builtin BuiltinFunctions) TextSplit(node *goquery.Selection, args ...string) (out interface{}, err error)

TextSplit textSplit(sep=',', trim='true') get element text and split by separator to array string, return []string.

struct {
	Examples []string `pagser:".selector->textSplit('|')"`
}

type BuiltinSelections added in v0.1.2

type BuiltinSelections struct {
}

BuiltinSelections builtin selection functions are registered with a lowercase initial, eg: Text -> text()

func (BuiltinSelections) Child added in v0.1.2

func (builtin BuiltinSelections) Child(node *goquery.Selection, args ...string) (out interface{}, err error)

Child child(selector='') gets the child elements of each element in the Selection, Filtered by the specified selector if selector not empty, It returns Selection object containing these elements for nested struct.

struct {
	SubStruct struct {
		Example string `pagser:".selector->text()"`
	}	`pagser:".selector->child()"`
}

func (BuiltinSelections) Eq added in v0.1.2

func (builtin BuiltinSelections) Eq(node *goquery.Selection, args ...string) (out interface{}, err error)

Eq eq(index) reduces the set of matched elements to the one at the specified index. If a negative index is given, it counts backwards starting at the end of the set. It returns a Selection object for nested struct, and an empty Selection object if the index is invalid.

struct {
	SubStruct struct {
		Example string `pagser:".selector->text()"`
	}	`pagser:".selector->eq(0)"`
}

func (BuiltinSelections) First added in v0.1.2

func (builtin BuiltinSelections) First(node *goquery.Selection, args ...string) (out interface{}, err error)

First first() First reduces the set of matched elements to the first in the set. It returns a new Selection object, and an empty Selection object if the selection is empty. It returns Selection object containing these elements for nested struct.

struct {
	SubStruct struct {
		Example string `pagser:".selector->text()"`
	}	`pagser:".selector->first()"`
}

func (BuiltinSelections) Last added in v0.1.2

func (builtin BuiltinSelections) Last(node *goquery.Selection, args ...string) (out interface{}, err error)

Last last(selector='') reduces the set of matched elements to the last in the set. It returns a new Selection object, and an empty Selection object if the selection is empty.

struct {
	SubStruct struct {
		Example string `pagser:".selector->text()"`
	}	`pagser:".selector->last()"`
}

func (BuiltinSelections) Next added in v0.1.2

func (builtin BuiltinSelections) Next(node *goquery.Selection, args ...string) (out interface{}, err error)

Next next(selector='') gets the immediately following sibling of each element in the Selection. Filtered by the specified selector if selector not empty, It returns Selection object containing these elements for nested struct.

struct {
	SubStruct struct {
		Example string `pagser:".selector->text()"`
	}	`pagser:".selector->next()"`
}

func (BuiltinSelections) Parent added in v0.1.2

func (builtin BuiltinSelections) Parent(node *goquery.Selection, args ...string) (out interface{}, err error)

Parent parent(selector='') gets the parent elements of each element in the Selection. Filtered by the specified selector if selector not empty, It returns Selection object containing these elements for nested struct.

struct {
	SubStruct struct {
		Example string `pagser:".selector->text()"`
	}	`pagser:".selector->parent()"`
}

func (BuiltinSelections) Parents added in v0.1.2

func (builtin BuiltinSelections) Parents(node *goquery.Selection, args ...string) (out interface{}, err error)

Parents parents(selector='') gets the ancestors of each element in the Selection. Filtered by the specified selector if selector not empty, It returns Selection object containing these elements for nested struct.

struct {
	SubStruct struct {
		Example string `pagser:".selector->text()"`
	}	`pagser:".selector->parents()"`
}

func (BuiltinSelections) ParentsUntil added in v0.1.2

func (builtin BuiltinSelections) ParentsUntil(node *goquery.Selection, args ...string) (out interface{}, err error)

ParentsUntil parentsUntil(selector) gets the ancestors of each element in the Selection, up to but not including the element matched by the selector. It returns a new Selection object containing the matched elements. It returns Selection object containing these elements for nested struct.

struct {
	SubStruct struct {
		Example string `pagser:".selector->text()"`
	}	`pagser:".selector->parentsUntil('.wrap')"`
}

func (BuiltinSelections) Prev added in v0.1.2

func (builtin BuiltinSelections) Prev(node *goquery.Selection, args ...string) (out interface{}, err error)

Prev prev() gets the immediately preceding sibling of each element in the Selection. Filtered by the specified selector if selector not empty, It returns Selection object containing these elements for nested struct.

struct {
	SubStruct struct {
		Example string `pagser:".selector->text()"`
	}	`pagser:".selector->prev()"`
}

func (BuiltinSelections) Siblings added in v0.1.2

func (builtin BuiltinSelections) Siblings(node *goquery.Selection, args ...string) (out interface{}, err error)

Siblings siblings() gets the siblings of each element in the Selection. Filtered by the specified selector if selector not empty, It returns Selection object containing these elements for nested struct.

struct {
	SubStruct struct {
		Example string `pagser:".selector->text()"`
	}	`pagser:".selector->siblings()"`
}

type CallFunc

type CallFunc func(node *goquery.Selection, args ...string) (out interface{}, err error)

CallFunc write function interface

Define Global Function

func MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) {
	//Todo
	return "Hello", nil
}

//Register function
pagser.RegisterFunc("MyFunc", MyFunc)

//Use function
type PageData struct{
     Text string `pagser:"h1->MyFunc()"`
}

Define Struct Function

//Use function
type PageData struct{
     Text string `pagser:"h1->MyFunc()"`
}

func (pd PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) {
	//Todo
	return "Hello", nil
}

Lookup function priority order

struct method -> parent method -> ... -> global

Implicit convert type

Automatic type conversion, Output result string convert to int, int64, float64...

CallFunc is a define function interface

type Config

type Config struct {
	TagName    string //struct tag name, default is `pagser`
	FuncSymbol string //Function symbol, default is `->`
	CastError  bool   //Returns an error when the type cannot be converted, default is `false`
	Debug      bool   //Debug mode, debug will print some log, default is `false`
}

Config configuration

func DefaultConfig

func DefaultConfig() Config

DefaultConfig the default Config

Config{
	TagName:    "pagser",
	FuncSymbol: "->",
	CastError:  false,
	Debug:      false,
}

type Pagser

type Pagser struct {
	Config Config
	// contains filtered or unexported fields
}

Pagser the page parser

func New

func New() *Pagser

New create pagser client

func NewWithConfig

func NewWithConfig(cfg Config) (*Pagser, error)

NewWithConfig create pagser client with Config and error

Example
cfg := Config{
	TagName:    "pagser",
	FuncSymbol: "->",
	CastError:  false,
	Debug:      false,
}
p, err := NewWithConfig(cfg)
if err != nil {
	log.Fatal(err)
}

//data parser model
var page ExamplePage
//parse html data
err = p.Parse(&page, rawExampleHtml)
//check error
if err != nil {
	log.Fatal(err)
}
Output:

func (*Pagser) Parse

func (p *Pagser) Parse(v interface{}, document string) (err error)

Parse parse html to struct

Example
//New default Config
p := New()

//data parser model
var page ExamplePage
//parse html data
err := p.Parse(&page, rawExampleHtml)
//check error
if err != nil {
	log.Fatal(err)
}

//print result
log.Printf("%v", page)
Output:

func (*Pagser) ParseDocument

func (p *Pagser) ParseDocument(v interface{}, document *goquery.Document) (err error)

ParseDocument parse document to struct

Example
//New default Config
p := New()

//data parser model
var data ExamplePage
doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawExampleHtml))
if err != nil {
	log.Fatal(err)
}

//parse document
err = p.ParseDocument(&data, doc)
//check error
if err != nil {
	log.Fatal(err)
}

//print result
log.Printf("%v", data)
Output:

func (*Pagser) ParseReader added in v0.0.3

func (p *Pagser) ParseReader(v interface{}, reader io.Reader) (err error)

ParseReader parse html to struct

Example
resp, err := http.Get("https://raw.githubusercontent.com/foolin/pagser/master/_examples/pages/demo.html")
if err != nil {
	log.Fatal(err)
}
defer resp.Body.Close()

//New default Config
p := New()
//data parser model
var page ExamplePage
//parse html data
err = p.ParseReader(&page, resp.Body)
//check error
if err != nil {
	panic(err)
}

log.Printf("%v", page)
Output:

func (*Pagser) ParseSelection

func (p *Pagser) ParseSelection(v interface{}, selection *goquery.Selection) (err error)

ParseSelection parse selection to struct

Example
//New default Config
p := New()

//data parser model
var data ExamplePage
doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawExampleHtml))
if err != nil {
	log.Fatal(err)
}

//parse document
err = p.ParseSelection(&data, doc.Selection)
//check error
if err != nil {
	log.Fatal(err)
}

//print result
log.Printf("%v", data)
Output:

func (*Pagser) RegisterFunc

func (p *Pagser) RegisterFunc(name string, fn CallFunc)

RegisterFunc register function for parse result

pagser.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) {
	//Todo
	return "Hello", nil
})
Example
p := New()

p.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) {
	//Todo
	return "Hello", nil
})
Output:

Directories

Path Synopsis
_examples
extensions

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL