htmlparser

package module
v0.0.0-...-b19b5ed Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 2, 2020 License: MIT Imports: 7 Imported by: 0

README

go-htmlparser

Event-based, HTML 5.0-compliant parser in Go (SAX-style parsing)

Typical Scenarios

  • Use it to scrape pieces of HTML
  • Detect META / LINK tags (e.g. Open Graph tags)
  • Optimize the output HTML (remove whitespace, clear empty tags)
  • Detect HTML syntax errors and notify developers
  • Extract text from the HTML

Sample

Get the RSS Feed of a website
	rssFeed := ""
	parser := NewParser(htmlContent)

	parser.Parse(nil, func(e *HtmlElement, isEmpty bool) {
		if e.TagName == "link" {

			if ty,_ := e.GetAttributeValue("type"); ty == "application/rss+xml" {
				t.Logf("rss-e: %v %v\n", e.TagName, e.Attributes)
				rssFeed,_ = e.GetAttributeValue("href")
				parser.Stop()
			}
		}
	}, nil)
	
	fmt.Println(rssFeed)
Remove whitespace
	parser := NewParser(origHtml)

	parser.PreserveCRLFTab = false

	n := bytes.NewBufferString("")

	parser.Parse(func(text string, parent *HtmlElement) {
		escaped := html.EscapeString(text)
		n.WriteString(escaped)
	}, func(parent *HtmlElement, isEmptyTag bool) {
		n.WriteString(parent.GetOpenTag(false, false))
	}, func(closeTag string) {
		n.WriteString("</" + closeTag + ">")
	})

	newHtml := n.String()

Questions

Contributors

Documentation

Index

Constants

View Source
const (
	HETPhrasing   HtmlElementType = 0x1  // former "inline element"
	HETFlow                       = 0x2  // former "block element"
	HETMeta                       = 0x4  // control elements
	HETText                       = 0x8  // text block
	HETNRCharData                 = 0x10 // Non-Replaceable Char Data

	HETAnyContent  = HETPhrasing | HETFlow | HETText
	HETTransparent = HETPhrasing | HETFlow
	HETNone        = 0
)

Variables

This section is empty.

Functions

func BuildOpenTag

func BuildOpenTag(tagName string, attributes []attributeInfo, noEvents, noUnknownAttributes bool) string

func BuildOpenTagHEI

func BuildOpenTagHEI(ei *HtmlElementInfo, attributes []attributeInfo, noEvents, noUnknownAttributes bool) string

func HtmlAttributeEncode

func HtmlAttributeEncode(attributeValue string) string

Types

type AttrStatus

type AttrStatus uint8

AttrStatus indicates the status of an attribute.

const (
	ASValid AttrStatus = iota
	ASDeprecated
	ASUnknown
)

type ElementCallback

type ElementCallback func(*HtmlElement, bool)

type EndElementCallback

type EndElementCallback func(string)

type HtmlElement

type HtmlElement struct {
	TagName                 string
	TagNameNS               string
	Id                      string
	Attributes              []attributeInfo
	ElementInfo             *HtmlElementInfo
	Namespace               string
	HasNamespace            bool
	XmlEmptyTag             bool
	Parent                  *HtmlElement
	HasDeprecatedAttributes bool
	HasOnlyKnownAttributes  bool
	SyntaxError             bool
	FatalSyntaxError        bool
	OriginalOpenTag         string
	// contains filtered or unexported fields
}

func NewHtmlElement

func NewHtmlElement(openElement string, parent *HtmlElement, errors, warnings *[]string) *HtmlElement

func (*HtmlElement) AddAttribute

func (he *HtmlElement) AddAttribute(attrName, attrVal string)

func (*HtmlElement) FindAttributeIndex

func (he *HtmlElement) FindAttributeIndex(attrName string) int

func (*HtmlElement) GetAttributeValue

func (he *HtmlElement) GetAttributeValue(attrName string) (string, bool)

func (*HtmlElement) GetCloseTag

func (he *HtmlElement) GetCloseTag() string

func (*HtmlElement) GetOpenTag

func (he *HtmlElement) GetOpenTag(noEvents, noUnknownAttributes bool) string

func (*HtmlElement) HasAttribute

func (he *HtmlElement) HasAttribute(attrName string) bool

func (*HtmlElement) RemoveAttribute

func (he *HtmlElement) RemoveAttribute(attrName string)

func (*HtmlElement) SetAttribute

func (he *HtmlElement) SetAttribute(attrName, attrValue string) bool

type HtmlElementInfo

type HtmlElementInfo struct {
	TagName                string
	HtmlVersion            int  // HTML version that introduced this tag
	Obsolete               bool // Indicates if this element is obsolete
	TagFormatting          HtmlTagFormatting
	ElementType            HtmlElementType
	PermittedChildrenTypes HtmlElementType // Valid types of elements that can be nested inside this tag
	PermittedChildrenTags  []string        // Valid children for this tag
	Attributes             []string

	ObsoleteAttributes []string
	ParentContentTypes HtmlElementType
	ParentTags         []string
	ExcludeParentTags  []string
	// contains filtered or unexported fields
}

func GetElementInfo

func GetElementInfo(tagName string) *HtmlElementInfo

GetElementInfo returns the HtmlElementInfo for the given tag name.

func (*HtmlElementInfo) GetAttributeStatus

func (hei *HtmlElementInfo) GetAttributeStatus(attrName string) AttrStatus

func (*HtmlElementInfo) IsValidParent

func (hei *HtmlElementInfo) IsValidParent(parentTagName string) bool

type HtmlElementType

type HtmlElementType uint8

HtmlElementType is the type of an HTML element according to the HTML 5.0 spec.

type HtmlParser

type HtmlParser struct {
	OrigHtml string

	Errors   []string
	Warnings []string

	Ids map[string]bool

	InnerText string

	HasValidSyntax          bool
	HasOnlyValidTags        bool
	HasOnlyValidAttributes  bool
	HasOnlyKnownTags        bool
	HasDeprecatedAttributes bool
	HasDeprecatedTags       bool

	SkipComments    bool
	PreserveCRLFTab bool
	// contains filtered or unexported fields
}

func NewParser

func NewParser(html string) HtmlParser

func (*HtmlParser) IsValidHTML401

func (parser *HtmlParser) IsValidHTML401() bool

func (*HtmlParser) IsValidStrictHTML401

func (parser *HtmlParser) IsValidStrictHTML401() bool

func (*HtmlParser) IsValidStrictHTMLNoDeprecated

func (parser *HtmlParser) IsValidStrictHTMLNoDeprecated() bool

func (*HtmlParser) Parse

func (parser *HtmlParser) Parse(textCallback TextCallback, elementCallback ElementCallback, endElementCallback EndElementCallback) bool

func (*HtmlParser) Stop

func (parser *HtmlParser) Stop()

type HtmlTagFormatting

type HtmlTagFormatting uint8
const (
	HTFSingle          HtmlTagFormatting = iota // Has no closing tag, e.g. <br>
	HTFOptionalClosing                          // has an optional closing tag, e.g. <li>
	HTFComplete                                 // must have a closing tag
)

type QuoteType

type QuoteType uint8
const (
	QTNone QuoteType = iota
	QTSingle
	QTDouble
)

func NeedQuotesForAttr

func NeedQuotesForAttr(val string) QuoteType

type TextCallback

type TextCallback func(string, *HtmlElement)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL