htmltojson

package module
v1.0.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 12, 2020 License: MIT Imports: 6 Imported by: 2

README

HTMLtoJSON

Go Reference

HTMLtoJSON is an HTML parser based on the net/html package. This package exists only to simplify HTML parsing. If you need more complex HTML processing, please use net/html, as it offers more features. The package name does not really fit this package's purpose, but I use this package for my scraper engines, so I don't really want to bother with changing the package name...

Installation

HTMLtoJSON requires Go 1.14 or higher.

$ GO111MODULE=on go get github.com/tamboto2000/htmltojson

Examples

Example 1

Parse from file

package main

import "github.com/tamboto2000/htmltojson"

// main parses test.html from disk and stores the resulting node tree as JSON.
func main() {
	// ParseFromFile opens and parses the file at the given path in one step.
	root, err := htmltojson.ParseFromFile("test.html")
	if err != nil {
		panic(err.Error())
	}

	// Save writes the node to ./html_node.json.
	err = htmltojson.Save(root)
	if err != nil {
		panic(err.Error())
	}
}
Example 2

Parse from reader

package main

import (
	"os"

	"github.com/tamboto2000/htmltojson"
)

// main opens test.html, parses it through the io.Reader entry point, and
// stores the resulting node tree as JSON.
func main() {
	file, err := os.Open("test.html")
	if err != nil {
		panic(err.Error())
	}
	// Release the file handle when main exits.
	defer file.Close()

	// ParseFromReader accepts any io.Reader; the opened file is one such reader.
	root, err := htmltojson.ParseFromReader(file)
	if err != nil {
		panic(err.Error())
	}

	// Save writes the node to ./html_node.json.
	err = htmltojson.Save(root)
	if err != nil {
		panic(err.Error())
	}
}
Example 3

Parse from string

package main

import (
	"io/ioutil"
	"os"

	"github.com/tamboto2000/htmltojson"
)

// main reads test.html into memory, parses the markup as a string, and stores
// the resulting node tree as JSON.
func main() {
	file, err := os.Open("test.html")
	if err != nil {
		panic(err.Error())
	}
	// Release the file handle when main exits.
	defer file.Close()

	// Load the whole document into memory.
	raw, err := ioutil.ReadAll(file)
	if err != nil {
		panic(err.Error())
	}

	// ParseString takes the markup as a string, so convert the bytes inline.
	root, err := htmltojson.ParseString(string(raw))
	if err != nil {
		panic(err.Error())
	}

	// Save writes the node to ./html_node.json.
	err = htmltojson.Save(root)
	if err != nil {
		panic(err.Error())
	}
}
Example 4

Parse from bytes

package main

import (
	"io/ioutil"
	"os"

	"github.com/tamboto2000/htmltojson"
)

// main reads test.html into memory, parses the raw bytes, and stores the
// resulting node tree as JSON.
func main() {
	file, err := os.Open("test.html")
	if err != nil {
		panic(err.Error())
	}
	// Release the file handle when main exits.
	defer file.Close()

	// Load the whole document into memory.
	raw, err := ioutil.ReadAll(file)
	if err != nil {
		panic(err.Error())
	}

	// ParseBytes consumes the byte slice directly; no string conversion needed.
	root, err := htmltojson.ParseBytes(raw)
	if err != nil {
		panic(err.Error())
	}

	// Save writes the node to ./html_node.json.
	err = htmltojson.Save(root)
	if err != nil {
		panic(err.Error())
	}
}

License

MIT

Documentation

Overview

Package htmltojson is an HTML parser based on the net/html package. This package exists only to simplify HTML parsing. If you need more complex HTML processing, please use net/html, as it offers more features. The package name does not really fit this package's purpose, but I use this package for my scraper engines, so I don't really want to bother with changing the package name...

Index

Constants

View Source
// Node types, expressed as strings.
// NOTE(review): presumably these are the values stored in Node.Type and
// correspond to the node kinds of net/html — confirm against Parse.
const (
	Text     = "text"
	Document = "document"
	Element  = "element"
	Comment  = "comment"
	Doctype  = "doctype"
)

Node Types

Variables

This section is empty.

Functions

func Save

func Save(node *Node) error

Save saves a node to ./html_node.json.

func SaveNodes

func SaveNodes(nodes []Node) error

SaveNodes saves an array of nodes to ./html_nodes.json.

func SaveNodesToPath

func SaveNodesToPath(nodes []Node, path string) error

SaveNodesToPath saves an array of nodes to path.

func SaveToPath

func SaveToPath(node *Node, path string) error

SaveToPath saves a node to path.

Types

type Attr

// Attr is an HTML attribute, such as class, style, or id.
// Each field is serialized to JSON under the lowercase name in its tag.
type Attr struct {
	// Namespace is the attribute's namespace.
	// NOTE(review): presumably empty for ordinary HTML attributes — confirm.
	Namespace string `json:"namespace"`
	// Key is the attribute key (for example "class").
	Key       string `json:"key"`
	// Val is the attribute value associated with Key.
	Val       string `json:"val"`
}

Attr is an HTML attribute, such as class, style, or id.

type Node

// Node is a parsed HTML object in a JSON-marshalable form.
// Nodes nest through Child, so one Node can carry a whole subtree.
type Node struct {
	// Type is the HTML object type.
	// NOTE(review): presumably one of the Text, Document, Element, Comment,
	// or Doctype constants — confirm against Parse.
	Type      string `json:"type"`
	// Data is the HTML tag name for elements.
	// NOTE(review): presumably the text/comment content for other node
	// types, as in net/html — confirm.
	Data      string `json:"data"`
	// Namespace is the node's namespace.
	Namespace string `json:"namespace"`
	// Attr lists the node's attributes.
	Attr      []Attr `json:"attr"`
	// Child lists the node's child nodes.
	Child     []Node `json:"child"`
}

Node is a parsed HTML object.

func Parse

func Parse(root *html.Node) *Node

Parse parses an HTML node into a marshalable node.

func ParseBytes

func ParseBytes(byts []byte) (*Node, error)

ParseBytes parses HTML bytes into a marshalable node.

func ParseFromFile

func ParseFromFile(path string) (*Node, error)

ParseFromFile parses HTML from the file at path.

func ParseFromReader

func ParseFromReader(reader io.Reader) (*Node, error)

ParseFromReader parses HTML from a reader into a marshalable node.

func ParseString

func ParseString(str string) (*Node, error)

ParseString parses an HTML string into a marshalable node.

func SearchAllNode

func SearchAllNode(ty, data, namespace, key, val string, node *Node) []Node

SearchAllNode searches for all nodes that match the given options: ty is the HTML object type, data is the HTML tag name, key is the attribute key, and val is the attribute value for that key.

func SearchNode

func SearchNode(ty, data, namespace, key, val string, node *Node) *Node

SearchNode searches for a node that matches the given params: ty is the HTML object type, data is the HTML tag name, key is the attribute key, and val is the attribute value for that key.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL