unhtml

package module
v1.1.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 1, 2018 License: MIT Imports: 5 Imported by: 1

README

Coverage Status Go Report Card Build Status Documentation

Table of Contents

Example & Performance

A HTML file

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <ul>
            <li>0</li>
            <li>1</li>
            <li>2</li>
            <li>3</li>
        </ul>
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
        <p>Hello World!</p>
        <p>10</p>
        <p>3.14</p>
        <p>true</p>
    </div>
</body>
</html>

Read it

AllTypeHTML, _ := ioutil.ReadFile("testHTML/all-type.html")

If we want to parse it into the following structs, how should we do it?

// TestUser is the nested record held in AllTypeTest.Struct.
type TestUser struct {
	Name      string
	Age       uint
	LikeLemon bool
}

// AllTypeTest is the target shape for the sample HTML: one slice, one nested
// struct, and one field for each scalar kind.
type AllTypeTest struct {
	Slice   []int
	Struct  TestUser
	String  string
	Int     int
	Int8    int8
	Int16   int16
	Int32   int32
	Int64   int64
	Uint    uint
	Uint8   uint8
	Uint16  uint16
	Uint32  uint32
	Uint64  uint64
	Float32 float32
	Float64 float64
	Bool    bool
}

In the traditional way, we would do it like this:

package example

import (
	"bytes"
	"github.com/PuerkitoBio/goquery"
	"strconv"
)

// parseAllTypesLogically fills an AllTypeTest from AllTypeHTML by hand, using
// goquery selectors and strconv. It is the hand-written baseline that
// Unmarshal is benchmarked against.
//
// It returns the first error encountered. On error the returned value may be
// partially populated and should not be relied upon.
func parseAllTypesLogically() (AllTypeTest, error) {
	allTypes := AllTypeTest{}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(AllTypeHTML))
	if err != nil {
		return allTypes, err
	}

	selection := doc.Find(allTypes.Root())
	// text runs one selector query per call on purpose: every field performs
	// its own lookup, exactly as the original hand-written code did.
	text := func(selector string) string {
		return selection.Find(selector).Text()
	}

	allTypes.Slice = make([]int, 0)
	selection.Find(`ul > li`).Each(func(_ int, item *goquery.Selection) {
		if err != nil {
			// Keep the first failure and do not append bogus zero values.
			return
		}
		value, parseErr := strconv.Atoi(item.Text())
		if parseErr != nil {
			err = parseErr
			return
		}
		allTypes.Slice = append(allTypes.Slice, value)
	})
	if err != nil {
		return allTypes, err
	}

	// Nested struct.
	allTypes.Struct.Name = text(`#test > div > p:nth-child(1)`)
	// ParseUint rejects negative input instead of letting uint() wrap it.
	age, err := strconv.ParseUint(text(`#test > div > p:nth-child(2)`), 10, 0)
	if err != nil {
		return allTypes, err
	}
	allTypes.Struct.Age = uint(age)
	if allTypes.Struct.LikeLemon, err = strconv.ParseBool(text(`#test > div > p:nth-child(3)`)); err != nil {
		return allTypes, err
	}

	// The selector previously carried a stray trailing quote, which made it
	// invalid and silently produced an empty string.
	allTypes.String = text(`#test > p:nth-child(3)`)

	// Signed integers: parse with the destination bit size so out-of-range
	// input is reported instead of being truncated by the conversion.
	const intSelector = `#test > p:nth-child(4)`
	var signed int64
	if signed, err = strconv.ParseInt(text(intSelector), 10, 0); err != nil {
		return allTypes, err
	}
	allTypes.Int = int(signed)
	if signed, err = strconv.ParseInt(text(intSelector), 10, 8); err != nil {
		return allTypes, err
	}
	allTypes.Int8 = int8(signed)
	if signed, err = strconv.ParseInt(text(intSelector), 10, 16); err != nil {
		return allTypes, err
	}
	allTypes.Int16 = int16(signed)
	if signed, err = strconv.ParseInt(text(intSelector), 10, 32); err != nil {
		return allTypes, err
	}
	allTypes.Int32 = int32(signed)
	if signed, err = strconv.ParseInt(text(intSelector), 10, 64); err != nil {
		return allTypes, err
	}
	allTypes.Int64 = signed

	// Unsigned integers: base 0 is kept from the original code.
	var unsigned uint64
	if unsigned, err = strconv.ParseUint(text(intSelector), 0, 0); err != nil {
		return allTypes, err
	}
	allTypes.Uint = uint(unsigned)
	if unsigned, err = strconv.ParseUint(text(intSelector), 0, 8); err != nil {
		return allTypes, err
	}
	allTypes.Uint8 = uint8(unsigned)
	if unsigned, err = strconv.ParseUint(text(intSelector), 0, 16); err != nil {
		return allTypes, err
	}
	allTypes.Uint16 = uint16(unsigned)
	if unsigned, err = strconv.ParseUint(text(intSelector), 0, 32); err != nil {
		return allTypes, err
	}
	allTypes.Uint32 = uint32(unsigned)
	if unsigned, err = strconv.ParseUint(text(intSelector), 0, 64); err != nil {
		return allTypes, err
	}
	allTypes.Uint64 = unsigned

	// Floats: pass the real bit size so the float32 value is rounded once.
	const floatSelector = `#test > p:nth-child(5)`
	var float float64
	if float, err = strconv.ParseFloat(text(floatSelector), 32); err != nil {
		return allTypes, err
	}
	allTypes.Float32 = float32(float)
	if allTypes.Float64, err = strconv.ParseFloat(text(floatSelector), 64); err != nil {
		return allTypes, err
	}

	if allTypes.Bool, err = strconv.ParseBool(text(`#test > p:nth-child(6)`)); err != nil {
		return allTypes, err
	}
	return allTypes, nil
}

It works pretty well, but it is tedious. Now you can do it like this:

package main

import (
	"encoding/json"
	"fmt"
	"github.com/Hexilee/unhtml"
	"io/ioutil"
)

// TestUser is the nested record selected by AllTypeTest.Struct; its selectors
// are relative to the element matched for that field.
type TestUser struct {
	Name      string `html:"p:nth-child(1)"`
	Age       uint   `html:"p:nth-child(2)"`
	LikeLemon bool   `html:"p:nth-child(3)"`
}

// AllTypeTest maps the sample HTML onto Go values. Each `html` tag holds the
// CSS selector for its field.
type AllTypeTest struct {
	Slice   []int    `html:"ul > li"`
	Struct  TestUser `html:"#test > div"`
	String  string   `html:"#test > p:nth-child(3)"`
	Int     int      `html:"#test > p:nth-child(4)"`
	Int8    int8     `html:"#test > p:nth-child(4)"`
	Int16   int16    `html:"#test > p:nth-child(4)"`
	Int32   int32    `html:"#test > p:nth-child(4)"`
	Int64   int64    `html:"#test > p:nth-child(4)"`
	Uint    uint     `html:"#test > p:nth-child(4)"`
	Uint8   uint8    `html:"#test > p:nth-child(4)"`
	Uint16  uint16   `html:"#test > p:nth-child(4)"`
	Uint32  uint32   `html:"#test > p:nth-child(4)"`
	Uint64  uint64   `html:"#test > p:nth-child(4)"`
	Float32 float32  `html:"#test > p:nth-child(5)"`
	Float64 float64  `html:"#test > p:nth-child(5)"`
	Bool    bool     `html:"#test > p:nth-child(6)"`
}

// Root returns the root selector for AllTypeTest, "#test".
func (AllTypeTest) Root() string {
	return "#test"
}

// main unmarshals AllTypeHTML into an AllTypeTest and prints the result as
// JSON. It prints the error and stops if either step fails.
func main() {
	allTypes := AllTypeTest{}
	// `_ := ...` is not valid Go (no new variable on the left side of :=), so
	// the error is checked rather than discarded.
	if err := unhtml.Unmarshal(AllTypeHTML, &allTypes); err != nil {
		fmt.Println("unmarshal:", err)
		return
	}
	result, err := json.Marshal(&allTypes)
	if err != nil {
		fmt.Println("marshal:", err)
		return
	}
	fmt.Println(string(result))
}

Result:

{
  "Slice": [
    0,
    1,
    2,
    3
  ],
  "Struct": {
    "Name": "Hexilee",
    "Age": 20,
    "LikeLemon": true
  },
  "String": "Hello World!",
  "Int": 10,
  "Int8": 10,
  "Int16": 10,
  "Int32": 10,
  "Int64": 10,
  "Uint": 10,
  "Uint8": 10,
  "Uint16": 10,
  "Uint32": 10,
  "Uint64": 10,
  "Float32": 3.14,
  "Float64": 3.14,
  "Bool": true
}

I think it can greatly improve my development efficiency. However, what about its performance?

There are two benchmarks

// BenchmarkUnmarshalAllTypes measures Unmarshal filling a fresh AllTypeTest
// from AllTypeHTML on every iteration.
func BenchmarkUnmarshalAllTypes(b *testing.B) {
	assert.NotNil(b, AllTypeHTML)
	for i := 0; i < b.N; i++ {
		allTypes := AllTypeTest{}
		// Fail fast with b.Fatal: assert.Nil would keep iterating after a
		// failure and adds assertion overhead to the timed loop.
		if err := Unmarshal(AllTypeHTML, &allTypes); err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkParseAllTypesLogically measures the hand-written goquery/strconv
// parser, parseAllTypesLogically, as the baseline for Unmarshal.
func BenchmarkParseAllTypesLogically(b *testing.B) {
	assert.NotNil(b, AllTypeHTML)
	for i := 0; i < b.N; i++ {
		// Fail fast with b.Fatal: assert.Nil would keep iterating after a
		// failure and adds assertion overhead to the timed loop.
		if _, err := parseAllTypesLogically(); err != nil {
			b.Fatal(err)
		}
	}
}

Test it:

> go test -bench=.
goos: darwin
goarch: amd64
pkg: github.com/Hexilee/unhtml
BenchmarkUnmarshalAllTypes-4        	   20000	     85108 ns/op
BenchmarkParseAllTypesLogically-4   	   20000	     64781 ns/op
PASS
ok  	github.com/Hexilee/unhtml	4.621s

Not too bad, considering the small size of the demo HTML. In real development with more complicated HTML, their efficiency is almost the same.

Tips & Features

The only API this package exposes is the function

func Unmarshal(data []byte, v interface{}) error

whose signature is compatible with Unmarshal in the standard library packages json and xml. However, you can customize its behavior through the data types in your code.

Types

This package supports only some kinds of types: all kinds in the reflect package except Ptr/Uintptr/Interface/Chan/Func.

The following fields are invalid and will cause an UnmarshalerItemKindError.

// WrongFieldsStruct illustrates the field kinds this package does not
// support: Ptr, Uintptr, Interface, Chan and Func. According to the
// accompanying documentation, such fields cause an UnmarshalerItemKindError.
type WrongFieldsStruct struct {
    Ptr *int
    Uintptr uintptr
    Interface io.Reader
    Chan chan int
    Func func()
}

However, when you call the function Unmarshal, you MUST pass a pointer; otherwise you will get an UnmarshaledKindMustBePtrError.

a := 1

// Wrong
Unmarshal([]byte(""), a)

// Right
Unmarshal([]byte(""), &a)
Root

Return the root selector.

Defining a Root() string method is only supported for the root type, like

func (AllTypeTest) Root() string {
	return "#test"
}

If you define it for a field type, such as TestUser

func (TestUser) Root() string {
	return "#test"
}

In this case, in AllTypeTest, the field selector will be overridden.

type (
	AllTypeTest struct {
		...
		Struct  TestUser `html:"#test > div"`
		...
	}
)

// real
type (
	AllTypeTest struct {
		...
		Struct  TestUser `html:"#test"`
		...
	}
)
Selector

This package is based on github.com/PuerkitoBio/goquery and supports standard CSS selectors.

You can define the selector of a field in its tag, like this

type (
	AllTypeTest struct {
	   ...
		Int     int      `html:"#test > p:nth-child(4)"`
		...
	}
)

In most cases, this package will find the #test > p:nth-child(4) element and try to parse its innerText as int.

However, when the field type is Struct or Slice, things are more complex.

Struct
type (
	AllTypeTest struct {
		...
		Struct  TestUser `html:"#test > div"`
		...
	}

	TestUser struct {
		Name      string `html:"p:nth-child(1)"`
		Age       uint   `html:"p:nth-child(2)"`
		LikeLemon bool   `html:"p:nth-child(3)"`
	}
)

func (AllTypeTest) Root() string {
	return "#test"
}

First, it will call *goquery.Selection.Find("#test"), we get:

    <div id="test">
        <ul>
            <li>0</li>
            <li>1</li>
            <li>2</li>
            <li>3</li>
        </ul>
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
        <p>Hello World!</p>
        <p>10</p>
        <p>3.14</p>
        <p>true</p>
    </div>

Then, it will call *goquery.Selection.Find("#test > div"), we get

<div>
    <p>Hexilee</p>
    <p>20</p>
    <p>true</p>
</div>

Then, in TestUser, it will call

*goquery.Selection.Find("p:nth-child(1)") // as Name
*goquery.Selection.Find("p:nth-child(2)") // as Age
*goquery.Selection.Find("p:nth-child(3)") // as LikeLemon
Slice
type (
	AllTypeTest struct {
		Slice   []int    `html:"ul > li"`		...
	}
)

func (AllTypeTest) Root() string {
	return "#test"
}

As above, we get

    <div id="test">
        <ul>
            <li>0</li>
            <li>1</li>
            <li>2</li>
            <li>3</li>
        </ul>
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
        <p>Hello World!</p>
        <p>10</p>
        <p>3.14</p>
        <p>true</p>
    </div>

Then it will call *goquery.Selection.Find("ul > li"), we get

  <li>0</li>
  <li>1</li>
  <li>2</li>
  <li>3</li>

Then, it will call *goquery.Selection.Each(func(int, *goquery.Selection)), iterate the list and parse values for slice.

Tags

This package supports three tags: html, attr and converter.

html

Provides the CSS selector of this field.

attr

By default, this package regards the innerText of an element as its value.

<a href="https://google.com">Google</a>
type Link struct {
    Text string `html:"a"`
}

You will get Text = Google. However, what should we do if we want to get href?

type Link struct {
    Href string `html:"a" attr:"href"`
    Text string `html:"a"`
}

You will get link.Href == "https://google.com"

converter

Sometimes, you want to process the original data

<p>2018-10-01 00:00:01</p>

You may unmarshal it like this

type Birthday struct {
	Time time.Time `html:"p"`
}

func TestConverter(t *testing.T) {
	birthday := Birthday{}
	assert.Nil(t, Unmarshal([]byte(BirthdayHTML), &birthday))
	assert.Equal(t, 2018, birthday.Time.Year())
	assert.Equal(t, time.October, birthday.Time.Month())
	assert.Equal(t, 1, birthday.Time.Day())
}

You will certainly fail, because you have not defined how to convert a string to time.Time. unhtml will regard it as a struct.

However, you can use a converter

type Birthday struct {
    Time time.Time `html:"p" converter:"StringToTime"`
}

const TimeStandard = `2006-01-02 15:04:05`

// StringToTime parses str using the TimeStandard layout. It is the method
// named by the converter tag on Birthday.Time.
func (Birthday) StringToTime(str string) (time.Time, error) {
	return time.Parse(TimeStandard, str)
}

func TestConverter(t *testing.T) {
	birthday := Birthday{}
	assert.Nil(t, Unmarshal([]byte(BirthdayHTML), &birthday))
	assert.Equal(t, 2018, birthday.Time.Year())
	assert.Equal(t, time.October, birthday.Time.Month())
	assert.Equal(t, 1, birthday.Time.Day())
}

Make it.

The type of converter MUST be

func (inputType) (resultType, error)

resultType MUST be the same as the field type, and it can be any type.

inputType MUST NOT violate the requirements in Types.

Documentation

Index

Examples

Constants

View Source
// Message texts whose names mirror this package's error types and conditions.
// NOTE(review): where each one is used is not visible here — confirm against
// the error implementations.
const (
	UnmarshaledKindMustBePtr = "unmarshaled kind must be Ptr"
	// The dangling trailing "/" after "Func" has been removed.
	UnmarshalerItemKind = "unmarshaled elem cannot be Ptr/Uintptr/Interface/Chan/Func"
	DtoZero             = "dto cannot be zero"
	SelectionNil        = "selection cannot be nil"
	ConverterNotExist   = "converter not exist"
	ConverterTypeWrong  = "type of converter is wrong"
)
View Source
// Struct tag keys matching the three tags described in the README (html, attr
// and converter), plus the empty string.
const (
	SelectorKey  = "html"      // tag holding the CSS selector of a field
	AttrKey      = "attr"      // tag naming the attribute to read instead of the innerText
	ConverterKey = "converter" // tag naming the converter method for a field
	ZeroStr      = ""          // the empty string
)
View Source
// AttrHref is the attribute name "href".
// NOTE(review): its use inside the package is not visible here — confirm.
const (
	AttrHref = "href"
)
View Source
// ErrorMethodName is the literal method name "Error".
// NOTE(review): its use inside the package is not visible here — confirm.
const (
	ErrorMethodName = "Error"
)

Variables

This section is empty.

Functions

func Unmarshal

func Unmarshal(data []byte, v interface{}) error
Example
package main

import (
	"encoding/json"
	"fmt"
)

// AllTypesHTML is the sample document unmarshaled by the example.
const (
	AllTypesHTML = `
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <ul>
            <li>0</li>
            <li>1</li>
            <li>2</li>
            <li>3</li>
        </ul>
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
        <p>Hello World!</p>
        <p>10</p>
        <p>3.14</p>
        <p>true</p>
    </div>
</body>
</html>
`
)

// main unmarshals AllTypesHTML into an AllTypeTest and prints it as JSON.
func main() {
	allTypes := AllTypeTest{}
	// Use the constant declared above, converted to []byte as Unmarshal
	// expects, and report errors instead of discarding them.
	if err := Unmarshal([]byte(AllTypesHTML), &allTypes); err != nil {
		fmt.Println("unmarshal:", err)
		return
	}
	result, err := json.Marshal(&allTypes)
	if err != nil {
		fmt.Println("marshal:", err)
		return
	}
	fmt.Println(string(result))
}
Output:

{"Slice":[0,1,2,3],"Struct":{"Name":"Hexilee","Age":20,"LikeLemon":true},"String":"Hello World!","Int":10,"Int8":10,"Int16":10,"Int32":10,"Int64":10,"Uint":10,"Uint8":10,"Uint16":10,"Uint32":10,"Uint64":10,"Float32":3.14,"Float64":3.14,"Bool":true}

Types

type ConverterNotExistError

type ConverterNotExistError struct {
	// contains filtered or unexported fields
}

func NewConverterNotExistError

func NewConverterNotExistError(name string) *ConverterNotExistError

func (ConverterNotExistError) Error

func (err ConverterNotExistError) Error() string

type ConverterTypeWrongError

type ConverterTypeWrongError struct {
	// contains filtered or unexported fields
}

func NewConverterTypeWrongError

func NewConverterTypeWrongError(name string, methodType reflect.Type) *ConverterTypeWrongError

func (ConverterTypeWrongError) Error

func (err ConverterTypeWrongError) Error() string

type HTMLModel

// HTMLModel is implemented by types that provide a root selector.
type HTMLModel interface {
	// Root returns the root selector.
	Root() string
}

HTMLModel: HTML model with root selector

type HTMLUnmarshaler

type HTMLUnmarshaler struct {
	// contains filtered or unexported fields
}

HTMLUnmarshaler: inner hidden

type HTMLUnmarshalerBuilder

type HTMLUnmarshalerBuilder struct {
	// contains filtered or unexported fields
}

HTMLUnmarshalerBuilder: inner hidden

type UnmarshaledKindMustBePtrError

type UnmarshaledKindMustBePtrError struct {
	// contains filtered or unexported fields
}

func NewUnmarshaledKindMustBePtrError

func NewUnmarshaledKindMustBePtrError(dtoType reflect.Type) *UnmarshaledKindMustBePtrError

func (UnmarshaledKindMustBePtrError) Error

type UnmarshalerItemKindError

type UnmarshalerItemKindError struct {
	// contains filtered or unexported fields
}

func NewUnmarshalerItemKindError

func NewUnmarshalerItemKindError(dtoType reflect.Type) *UnmarshalerItemKindError

func (UnmarshalerItemKindError) Error

func (err UnmarshalerItemKindError) Error() string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL