node

package module
v1.0.4 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 20, 2023 License: MIT Imports: 8 Imported by: 0

README

node

GoDev Go CoverageStatus GoReportCard

Node is a Go package for parsing HTML and XML documents, inspired by the popular Python library Beautiful Soup. Node provides APIs for extracting data from HTML and XML documents by traversing the parse tree and accessing elements and attributes.

Installation

To install Node, use the go get command:

go get -u github.com/sunshineplan/node

Usage

package main

import (
	"fmt"
	"strings"

	"github.com/sunshineplan/node"
)

func main() {
	// Parse an HTML document from a string
	doc, err := node.ParseHTML("<html><head><title>Page Title</title></head><body><p>Hello, World!</p></body></html>")
	if err != nil {
		fmt.Println(err)
		return
	}

	// Find the page title
	title := doc.Find(node.Descendant, node.Title)
	fmt.Println(title.GetText())

	// Find the first paragraph
	p := doc.Find(node.Descendant, node.P)
	fmt.Println(p.GetText())

	// Find all paragraphs
	paragraphs := doc.FindAll(node.Descendant, node.P)
	for _, p := range paragraphs {
		fmt.Println(p.GetText())
	}
}

API

// Node is an interface representing an HTML node.
type Node interface {
	HtmlNode

	// String returns a TextNode if the node has only one child whose type is text, otherwise returns nil.
	String() TextNode

	// Strings returns all of the text nodes inside this node.
	Strings() []TextNode

	// StrippedStrings returns a list of strings generated by Strings, where strings consisting entirely of
	// whitespace are ignored, and whitespace at the beginning and end of strings is removed.
	StrippedStrings() []string

	// GetText concatenates the content of all of the text nodes.
	GetText() string
}

// TextNode is an interface representing a text node.
type TextNode interface {
	HtmlNode

	// String returns the content of the text node.
	String() string
}

// HtmlNode is an interface representing an HTML node.
type HtmlNode interface {
	// Raw returns the original *html.Node.
	Raw() *html.Node
	// ToNode converts HtmlNode to Node.
	ToNode() Node
	// ToTextNode converts HtmlNode to TextNode.
	// It will panic if the node type is not text node.
	ToTextNode() TextNode

	// Type returns a NodeType.
	Type() html.NodeType
	// Data returns the tag name for an element node or the content for a text node.
	Data() string
	// Attrs returns an Attributes interface for an element node.
	Attrs() Attributes
	// HasAttr returns whether the node has an attribute.
	HasAttr(string) bool
	// HTML renders the node's parse tree as HTML code.
	HTML() string
	// Readable renders unescaped HTML code.
	Readable() string

	// Parent returns the parent of this node.
	Parent() Node
	// FirstChild returns the first child of this node.
	FirstChild() Node
	// LastChild returns the last child of this node.
	LastChild() Node
	// PrevSibling returns the previous node that is on the same level of the parse tree.
	PrevSibling() Node
	// NextSibling returns the next node that is on the same level of the parse tree.
	NextSibling() Node
	// PrevNode returns the node that was parsed immediately before this node.
	PrevNode() Node
	// NextNode returns the node that was parsed immediately after this node.
	NextNode() Node

	// Parents returns all of this node's parents, recursively.
	Parents() []Node
	// Children returns all of this node's direct children.
	Children() []Node
	// Descendants returns all of this node's children, recursively.
	Descendants() []Node
	// PrevSiblings returns all of this node's previous nodes that are on the same level of the parse tree.
	PrevSiblings() []Node
	// NextSiblings returns all of this node's next nodes that are on the same level of the parse tree.
	NextSiblings() []Node
	// PrevNodes returns all of the nodes that were parsed before this node.
	PrevNodes() []Node
	// NextNodes returns all of the nodes that were parsed after this node.
	NextNodes() []Node

	// Finder includes a set of find methods.
	Finder
}

// Attributes is an interface that describes a node's attributes with
// methods for getting and iterating over key-value pairs.
type Attributes interface {
	// Range calls the provided function for each key-value pair in the Attributes;
	// iteration stops if the function returns false for any pair.
	Range(func(key, value string) bool)

	// Get returns the value associated with the specified key and
	// a boolean indicating whether the key exists in the Attributes.
	Get(key string) (value string, exists bool)
}

// Finder represents a set of methods for finding nodes.
type Finder interface {
	// Find searches for the first matched node in the parse tree based on the specified find method and filters.
	Find(FindMethod, TagFilter, ...Filter) Node

	// FindN searches for up to n nodes in the parse tree based on the specified find method and filters.
	FindN(FindMethod, int, TagFilter, ...Filter) []Node

	// FindAll searches for all nodes in the parse tree based on the specified find method and filters.
	FindAll(FindMethod, TagFilter, ...Filter) []Node

	// FindString searches for the first matched text node in the parse tree based on the specified find method and filter.
	FindString(FindMethod, StringFilter) TextNode

	// FindStringN searches for up to n text nodes in the parse tree based on the specified find method and filter.
	FindStringN(FindMethod, int, StringFilter) []TextNode

	// FindAllString searches for all text nodes in the parse tree based on the specified find method and filter.
	FindAllString(FindMethod, StringFilter) []TextNode

	// CSS selector support

	// Select searches for the first matched node in the parse tree based on the CSS selector.
	// It panics if the selector cannot be parsed.
	Select(string) Node

	// SelectAll searches for all nodes in the parse tree based on the CSS selector.
	// It panics if the selector cannot be parsed.
	SelectAll(string) []Node

	// XPath support

	// XPath searches for all nodes that match the specified XPath expression.
	// It panics if the expression cannot be parsed.
	XPath(string) []Node

	// Evaluate returns the result of the XPath expression.
	// The result type of the expression is one of the following: bool, float64, string, *xpath.NodeIterator.
	Evaluate(string) (any, error)
}

// FindMethod represents the method used to search for nodes in the parse tree.
type FindMethod int

// The available find methods. Descendant is declared first with iota, so its
// value is 0.
const (
	// Descendant represents a search for nodes that are descendants of the current node.
	Descendant FindMethod = iota

	// NoRecursive represents a search for nodes that are direct children of the current node.
	NoRecursive

	// Parent represents a search for the parent node of the current node.
	Parent

	// PrevSibling represents a search for the previous sibling node of the current node.
	PrevSibling

	// NextSibling represents a search for the next sibling node of the current node.
	NextSibling

	// Previous represents a search for the previous node in the parse tree.
	Previous

	// Next represents a search for the next node in the parse tree.
	Next
)

// TagFilter represents an interface that can be used to filter nodes based on an element's tag.
type TagFilter interface {
	// Ignore presumably reports whether the tag filter should be skipped
	// when matching — TODO confirm against the implementation.
	Ignore() bool
	// IsMatch reports whether the filter matches the given node.
	IsMatch(node Node) bool
}

// Filter is an interface that describes a filter that can be used to select nodes.
type Filter interface {
	// IsAttribute returns true if the filter represents an attribute filter.
	IsAttribute() bool

	// IsMatch returns true if the filter matches the given node.
	IsMatch(node Node) bool
}

// StringFilter interface extends the Filter interface and defines
// a method for checking if the filter represents a string filter.
type StringFilter interface {
	Filter
	// IsString reports whether the filter represents a string filter.
	IsString() bool
}

Credits

This repo relies on the following third-party projects:

License

The MIT License (MIT)

Documentation

Index

Examples

Constants

This section is empty.

Variables

View Source
// Common tags, each built with Tag from its lower-case tag name.
// NOTE(review): A is spelled "a" to agree with every other entry; this
// assumes tag names are compared against the parser's lower-cased element
// names — confirm against the Tag implementation.
var (
	A      = Tag("a")
	B      = Tag("b")
	Body   = Tag("body")
	Div    = Tag("div")
	Em     = Tag("em")
	Form   = Tag("form")
	H1     = Tag("h1")
	H2     = Tag("h2")
	Head   = Tag("head")
	I      = Tag("i")
	Img    = Tag("img")
	Input  = Tag("input")
	Label  = Tag("label")
	Li     = Tag("li")
	Option = Tag("option")
	P      = Tag("p")
	Select = Tag("select")
	Span   = Tag("span")
	Svg    = Tag("svg")
	Table  = Tag("table")
	Td     = Tag("td")
	Th     = Tag("th")
	Title  = Tag("title")
	Tr     = Tag("tr")
	Ul     = Tag("ul")
)

These variables are used to represent common tags.

View Source
var True everything

True is a special value that matches any node.

Functions

This section is empty.

Types

type Attributes

type Attributes interface {
	// Range calls the provided function for each key-value pair in the Attributes;
	// iteration stops if the function returns false for any pair.
	Range(func(key, value string) bool)

	// Get returns the value associated with the specified key and
	// a boolean indicating whether the key exists in the Attributes.
	Get(key string) (value string, exists bool)
}

Attributes is an interface that describes a node's attributes with methods for getting and iterating over key-value pairs.

type Filter

type Filter interface {
	// IsAttribute returns true if the filter represents an attribute filter.
	IsAttribute() bool

	// IsMatch returns true if the filter matches the given node.
	IsMatch(node Node) bool
}

Filter is an interface that describes a filter that can be used to select nodes.

func Attr

func Attr[T Value](name string, value T) Filter

Attr returns a new attribute filter with the specified name and value.

Example
// Match by attribute with FindAll. The find method 0 is the value of
// Descendant; the nil TagFilter presumably places no restriction on the tag
// name (TODO confirm).
node, err := ParseHTML(`<div data-foo="value">foo!</div>`)
if err != nil {
	log.Fatal(err)
}
if nodes := node.FindAll(0, nil, Attr("data-foo", "value")); len(nodes) != 1 {
	log.Fatalf("expected nodes %d; got %d", 1, len(nodes))
} else {
	fmt.Println(nodes[0].Readable())
}
// Parse a second document, reusing node and err.
node, err = ParseHTML(`<input name="email"/>`)
if err != nil {
	log.Fatal(err)
}
// Look up the element by attribute using a CSS selector.
if nodes := node.SelectAll(`[name="email"]`); len(nodes) != 1 {
	log.Fatalf("expected nodes %d; got %d", 1, len(nodes))
} else {
	fmt.Println(nodes[0].Readable())
}
// Look up the same element by attribute using an XPath expression.
if nodes := node.XPath(`//*[@name="email"]`); len(nodes) != 1 {
	log.Fatalf("expected nodes %d; got %d", 1, len(nodes))
} else {
	fmt.Println(nodes[0].Readable())
}
Output:

<div data-foo="value">foo!</div>
<input name="email"/>
<input name="email"/>

func Class

func Class[T Value](v T) Filter

Class returns a new class filter with the specified value. This filter is an attribute filter.

Example
// This example contrasts Class with ClassStrict on an element whose class
// attribute is "body strikeout".
node, err := ParseHTML(`<p class="body strikeout"></p>`)
if err != nil {
	log.Fatal(err)
}
// Class is expected to match when the class names are given in the same order...
if nodes := node.FindAll(0, nil, Class("body strikeout")); len(nodes) != 1 {
	log.Fatalf("expected nodes %d; got %d", 1, len(nodes))
} else {
	fmt.Println(nodes[0].Readable())
}
// ...and also when they are given in a different order.
if nodes := node.FindAll(0, nil, Class("strikeout body")); len(nodes) != 1 {
	log.Fatalf("expected nodes %d; got %d", 1, len(nodes))
} else {
	fmt.Println(nodes[0].Readable())
}
// ClassStrict is expected to match the string exactly as written in the attribute...
if nodes := node.FindAll(0, nil, ClassStrict("body strikeout")); len(nodes) != 1 {
	log.Fatalf("expected nodes %d; got %d", 1, len(nodes))
} else {
	fmt.Println(nodes[0].Readable())
}
// ...and to find nothing when the class names are reordered.
if nodes := node.FindAll(0, nil, ClassStrict("strikeout body")); len(nodes) != 0 {
	log.Fatalf("expected nodes %d; got %d", 0, len(nodes))
} else {
	fmt.Println(nodes)
}
Output:

<p class="body strikeout"></p>
<p class="body strikeout"></p>
<p class="body strikeout"></p>
[]

func ClassStrict

func ClassStrict(cls string) Filter

ClassStrict returns a new strict class filter with the specified string. This filter is an attribute filter.

func Id

func Id[T Value](id T) Filter

Id returns a new attribute filter for the "id" attribute with the specified value.

type FindMethod

type FindMethod int

FindMethod represents the method used to search for nodes in the parse tree.

// The available find methods. Descendant is declared first with iota, so its
// value is 0.
const (
	// Descendant represents a search for nodes that are descendants of the current node.
	Descendant FindMethod = iota

	// NoRecursive represents a search for nodes that are direct children of the current node.
	NoRecursive

	// Parent represents a search for the parent node of the current node.
	Parent

	// PrevSibling represents a search for the previous sibling node of the current node.
	PrevSibling

	// NextSibling represents a search for the next sibling node of the current node.
	NextSibling

	// Previous represents a search for the previous node in the parse tree.
	Previous

	// Next represents a search for the next node in the parse tree.
	Next
)

type Finder

type Finder interface {
	// Find searches for the first matched node in the parse tree based on the specified find method and filters.
	Find(FindMethod, TagFilter, ...Filter) Node

	// FindN searches for up to n nodes in the parse tree based on the specified find method and filters.
	FindN(FindMethod, int, TagFilter, ...Filter) []Node

	// FindAll searches for all nodes in the parse tree based on the specified find method and filters.
	FindAll(FindMethod, TagFilter, ...Filter) []Node

	// FindString searches for the first matched text node in the parse tree based on the specified find method and filter.
	FindString(FindMethod, StringFilter) TextNode

	// FindStringN searches for up to n text nodes in the parse tree based on the specified find method and filter.
	FindStringN(FindMethod, int, StringFilter) []TextNode

	// FindAllString searches for all text nodes in the parse tree based on the specified find method and filter.
	FindAllString(FindMethod, StringFilter) []TextNode

	// Select searches for the first matched node in the parse tree based on the CSS selector.
	// It panics if the selector cannot be parsed.
	Select(string) Node

	// SelectAll searches for all nodes in the parse tree based on the CSS selector.
	// It panics if the selector cannot be parsed.
	SelectAll(string) []Node

	// XPath searches for all nodes that match the specified XPath expression.
	// It panics if the expression cannot be parsed.
	XPath(string) []Node

	// Evaluate returns the result of the XPath expression.
	// The result type of the expression is one of the following: bool, float64, string, *xpath.NodeIterator.
	Evaluate(string) (any, error)
}

Finder represents a set of methods for finding nodes.

type HtmlNode

type HtmlNode interface {
	// Raw returns the original *html.Node.
	Raw() *html.Node
	// ToNode converts HtmlNode to Node.
	ToNode() Node
	// ToTextNode converts HtmlNode to TextNode.
	// It will panic if the node type is not text node.
	ToTextNode() TextNode

	// Type returns a NodeType.
	Type() html.NodeType
	// Data returns the tag name for an element node or the content for a text node.
	Data() string
	// Attrs returns an Attributes interface for an element node.
	Attrs() Attributes
	// HasAttr returns whether the node has an attribute.
	HasAttr(string) bool
	// HTML renders the node's parse tree as HTML code.
	HTML() string
	// Readable renders unescaped HTML code.
	Readable() string

	// Parent returns the parent of this node.
	Parent() Node
	// FirstChild returns the first child of this node.
	FirstChild() Node
	// LastChild returns the last child of this node.
	LastChild() Node
	// PrevSibling returns the previous node that is on the same level of the parse tree.
	PrevSibling() Node
	// NextSibling returns the next node that is on the same level of the parse tree.
	NextSibling() Node
	// PrevNode returns the node that was parsed immediately before this node.
	PrevNode() Node
	// NextNode returns the node that was parsed immediately after this node.
	NextNode() Node

	// Parents returns all of this node's parents, recursively.
	Parents() []Node
	// Children returns all of this node's direct children.
	Children() []Node
	// Descendants returns all of this node's children, recursively.
	Descendants() []Node
	// PrevSiblings returns all of this node's previous nodes that are on the same level of the parse tree.
	PrevSiblings() []Node
	// NextSiblings returns all of this node's next nodes that are on the same level of the parse tree.
	NextSiblings() []Node
	// PrevNodes returns all of the nodes that were parsed before this node.
	PrevNodes() []Node
	// NextNodes returns all of the nodes that were parsed after this node.
	NextNodes() []Node

	// Finder includes a set of find methods.
	Finder
}

HtmlNode is an interface representing an HTML node.

Example
// <b> and <c> are the two children of <a>.
node, err := ParseHTML("<a><b>text1</b><c>text2</c></a>")
if err != nil {
	log.Fatal(err)
}
// Each of the two siblings is the other's neighbour.
fmt.Println(node.Find(0, B).NextSibling().Readable())
fmt.Println(node.Find(0, Tag("c")).PrevSibling().Readable())
// Nothing precedes <b> and nothing follows <c>, so both lines print <nil>.
fmt.Println(node.Find(0, B).PrevSibling())
fmt.Println(node.Find(0, Tag("c")).NextSibling())
// Node.String returns the single text child of <b>; TextNode.String returns its content.
fmt.Println(node.Find(0, B).String().String())
// That text node is the only child of <b>, so it has no next sibling.
fmt.Println(node.Find(0, B).String().NextSibling())
Output:

<c>text2</c>
<b>text1</b>
<nil>
<nil>
text1
<nil>

type Node

type Node interface {
	HtmlNode

	// String returns a TextNode if the node has only one child whose type is text, otherwise returns nil.
	String() TextNode

	// Strings returns all of the text nodes inside this node.
	Strings() []TextNode

	// StrippedStrings returns a list of strings generated by Strings, where strings consisting entirely of
	// whitespace are ignored, and whitespace at the beginning and end of strings is removed.
	StrippedStrings() []string

	// GetText concatenates the content of all of the text nodes.
	GetText() string
}

Node is an interface representing an HTML node.

func NewNode

func NewNode(n *html.Node) Node

NewNode returns a Node with the specified *html.Node.

func Parse

func Parse(r io.Reader) (Node, error)

Parse returns the parse tree for the HTML from the given Reader.

func ParseHTML

func ParseHTML(s string) (Node, error)

ParseHTML returns the parse tree for the HTML from string.

func ParseWithOptions

func ParseWithOptions(r io.Reader, opts ...html.ParseOption) (Node, error)

ParseWithOptions is like Parse, with options.

type StringFilter

type StringFilter interface {
	Filter
	// IsString reports whether the filter represents a string filter.
	IsString() bool
}

StringFilter interface extends the Filter interface and defines a method for checking if the filter represents a string filter.

func String

func String[T Value](t T) StringFilter

String returns a StringFilter with the specified value.

func Text

func Text[T Value](t T) StringFilter

Text is an alias of String.

type TagFilter

type TagFilter interface {
	// Ignore presumably reports whether the tag filter should be skipped
	// when matching — TODO confirm against the implementation.
	Ignore() bool
	// IsMatch reports whether the filter matches the given node.
	IsMatch(node Node) bool
}

TagFilter represents an interface that can be used to filter nodes based on an element's tag.

func Tag

func Tag[T Value](t T) TagFilter

Tag creates a new TagFilter based on a given tag value.

func Tags

func Tags(tag ...string) TagFilter

Tags creates a new TagFilter based on a list of tag values.

type TextNode

type TextNode interface {
	HtmlNode

	// String returns the content of the text node.
	String() string
}

TextNode is an interface representing a text node.

type Value

type Value interface {
	// Value can be one of the following types:
	// - string: a simple string value
	// - []string: a slice of strings
	// - *regexp.Regexp: a regular expression
	// - everything: a special value that matches any node
	// - func(string, Node) bool: a function that takes a string and a node and returns true or false
	string | []string | *regexp.Regexp | everything | func(string, Node) bool
}

Value is an interface that represents a value that can be used as a filter.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL