html

package
v2.3.4+incompatible Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 7, 2018 License: MIT Imports: 4 Imported by: 37

README

HTML GoDoc GoCover

This package is an HTML5 lexer written in Go. It follows the specification at The HTML syntax. The lexer takes an io.Reader and converts it into tokens until the EOF.

Installation

Run the following command

go get github.com/tdewolff/parse/html

or add the following import and run the project with go get

import "github.com/tdewolff/parse/html"

Lexer

Usage

The following initializes a new Lexer with io.Reader r:

l := html.NewLexer(r)

To tokenize until EOF or an error, use:

for {
	tt, data := l.Next()
	switch tt {
	case html.ErrorToken:
		// error or EOF set in l.Err()
		return
	case html.StartTagToken:
		// ...
		for {
			ttAttr, dataAttr := l.Next()
			if ttAttr != html.AttributeToken {
				break
			}
			// ...
		}
	// ...
	}
}

All tokens:

ErrorToken TokenType = iota // extra token when errors occur
CommentToken
DoctypeToken
StartTagToken
StartTagCloseToken
StartTagVoidToken
EndTagToken
AttributeToken
TextToken
SvgToken
MathToken
Examples
package main

import (
	"os"

	"github.com/tdewolff/parse/html"
)

// Tokenize HTML from stdin.
func main() {
	l := html.NewLexer(os.Stdin)
	for {
		tt, data := l.Next()
		switch tt {
		case html.ErrorToken:
			if l.Err() != io.EOF {
				fmt.Println("Error on line", l.Line(), ":", l.Err())
			}
			return
		case html.StartTagToken:
			fmt.Println("Tag", string(data))
			for {
				ttAttr, dataAttr := l.Next()
				if ttAttr != html.AttributeToken {
					break
				}

				key := dataAttr
				val := l.AttrVal()
				fmt.Println("Attribute", string(key), "=", string(val))
			}
		// ...
		}
	}
}

License

Released under the MIT license.

Documentation

Overview

Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func EscapeAttrVal

func EscapeAttrVal(buf *[]byte, orig, b []byte) []byte

EscapeAttrVal returns the escaped attribute value bytes without quotes.

Types

type Hash

type Hash uint32

Hash defines perfect hashes for a predefined list of strings

const (
	A                Hash = 0x1     // a
	Abbr             Hash = 0x4     // abbr
	Accept           Hash = 0x3206  // accept
	Accept_Charset   Hash = 0x320e  // accept-charset
	Accesskey        Hash = 0x4409  // accesskey
	Acronym          Hash = 0xbb07  // acronym
	Action           Hash = 0x2ba06 // action
	Address          Hash = 0x67e07 // address
	Align            Hash = 0x1605  // align
	Alink            Hash = 0xd205  // alink
	Allowfullscreen  Hash = 0x23d0f // allowfullscreen
	Alt              Hash = 0xee03  // alt
	Annotation       Hash = 0x2070a // annotation
	AnnotationXml    Hash = 0x2070d // annotationXml
	Applet           Hash = 0x14506 // applet
	Area             Hash = 0x38d04 // area
	Article          Hash = 0x40e07 // article
	Aside            Hash = 0x8305  // aside
	Async            Hash = 0xfa05  // async
	Audio            Hash = 0x11605 // audio
	Autocomplete     Hash = 0x12e0c // autocomplete
	Autofocus        Hash = 0x13a09 // autofocus
	Autoplay         Hash = 0x14f08 // autoplay
	Axis             Hash = 0x15704 // axis
	B                Hash = 0x101   // b
	Background       Hash = 0x1e0a  // background
	Base             Hash = 0x45404 // base
	Basefont         Hash = 0x45408 // basefont
	Bdi              Hash = 0xcb03  // bdi
	Bdo              Hash = 0x18403 // bdo
	Bgcolor          Hash = 0x19707 // bgcolor
	Bgsound          Hash = 0x19e07 // bgsound
	Big              Hash = 0x1a603 // big
	Blink            Hash = 0x1a905 // blink
	Blockquote       Hash = 0x1ae0a // blockquote
	Body             Hash = 0x4004  // body
	Border           Hash = 0x33806 // border
	Br               Hash = 0x202   // br
	Button           Hash = 0x1b806 // button
	Canvas           Hash = 0x7f06  // canvas
	Caption          Hash = 0x27f07 // caption
	Center           Hash = 0x62a06 // center
	Challenge        Hash = 0x1e509 // challenge
	Charset          Hash = 0x3907  // charset
	Checked          Hash = 0x3b407 // checked
	Cite             Hash = 0xfe04  // cite
	Class            Hash = 0x1c305 // class
	Classid          Hash = 0x1c307 // classid
	Clear            Hash = 0x41205 // clear
	Code             Hash = 0x1d604 // code
	Codebase         Hash = 0x45008 // codebase
	Codetype         Hash = 0x1d608 // codetype
	Col              Hash = 0x19903 // col
	Colgroup         Hash = 0x1ee08 // colgroup
	Color            Hash = 0x19905 // color
	Cols             Hash = 0x20204 // cols
	Colspan          Hash = 0x20207 // colspan
	Command          Hash = 0x21407 // command
	Compact          Hash = 0x21b07 // compact
	Content          Hash = 0x4a907 // content
	Contenteditable  Hash = 0x4a90f // contenteditable
	Contextmenu      Hash = 0x3bd0b // contextmenu
	Controls         Hash = 0x22a08 // controls
	Coords           Hash = 0x23606 // coords
	Crossorigin      Hash = 0x25b0b // crossorigin
	Data             Hash = 0x4c004 // data
	Datalist         Hash = 0x4c008 // datalist
	Datetime         Hash = 0x2ea08 // datetime
	Dd               Hash = 0x31602 // dd
	Declare          Hash = 0x8607  // declare
	Default          Hash = 0x5407  // default
	DefaultChecked   Hash = 0x5040e // defaultChecked
	DefaultMuted     Hash = 0x5650c // defaultMuted
	DefaultSelected  Hash = 0x540f  // defaultSelected
	Defer            Hash = 0x6205  // defer
	Del              Hash = 0x7203  // del
	Desc             Hash = 0x7c04  // desc
	Details          Hash = 0x9207  // details
	Dfn              Hash = 0xab03  // dfn
	Dialog           Hash = 0xcc06  // dialog
	Dir              Hash = 0xd903  // dir
	Dirname          Hash = 0xd907  // dirname
	Disabled         Hash = 0x10408 // disabled
	Div              Hash = 0x10b03 // div
	Dl               Hash = 0x1a402 // dl
	Download         Hash = 0x48608 // download
	Draggable        Hash = 0x1c909 // draggable
	Dropzone         Hash = 0x41908 // dropzone
	Dt               Hash = 0x60602 // dt
	Em               Hash = 0x6e02  // em
	Embed            Hash = 0x6e05  // embed
	Enabled          Hash = 0x4e07  // enabled
	Enctype          Hash = 0x2cf07 // enctype
	Face             Hash = 0x62804 // face
	Fieldset         Hash = 0x26c08 // fieldset
	Figcaption       Hash = 0x27c0a // figcaption
	Figure           Hash = 0x29006 // figure
	Font             Hash = 0x45804 // font
	Footer           Hash = 0xf106  // footer
	For              Hash = 0x29c03 // for
	ForeignObject    Hash = 0x29c0d // foreignObject
	Foreignobject    Hash = 0x2a90d // foreignobject
	Form             Hash = 0x2b604 // form
	Formaction       Hash = 0x2b60a // formaction
	Formenctype      Hash = 0x2cb0b // formenctype
	Formmethod       Hash = 0x2d60a // formmethod
	Formnovalidate   Hash = 0x2e00e // formnovalidate
	Formtarget       Hash = 0x2f50a // formtarget
	Frame            Hash = 0xa305  // frame
	Frameborder      Hash = 0x3330b // frameborder
	Frameset         Hash = 0xa308  // frameset
	H1               Hash = 0x19502 // h1
	H2               Hash = 0x32402 // h2
	H3               Hash = 0x34902 // h3
	H4               Hash = 0x38602 // h4
	H5               Hash = 0x60802 // h5
	H6               Hash = 0x2ff02 // h6
	Head             Hash = 0x37204 // head
	Header           Hash = 0x37206 // header
	Headers          Hash = 0x37207 // headers
	Height           Hash = 0x30106 // height
	Hgroup           Hash = 0x30906 // hgroup
	Hidden           Hash = 0x31406 // hidden
	High             Hash = 0x32104 // high
	Hr               Hash = 0xaf02  // hr
	Href             Hash = 0xaf04  // href
	Hreflang         Hash = 0xaf08  // hreflang
	Html             Hash = 0x30504 // html
	Http_Equiv       Hash = 0x3260a // http-equiv
	I                Hash = 0x601   // i
	Icon             Hash = 0x4a804 // icon
	Id               Hash = 0x8502  // id
	Iframe           Hash = 0x33206 // iframe
	Image            Hash = 0x33e05 // image
	Img              Hash = 0x34303 // img
	Inert            Hash = 0x55005 // inert
	Input            Hash = 0x47305 // input
	Ins              Hash = 0x26403 // ins
	Isindex          Hash = 0x15907 // isindex
	Ismap            Hash = 0x34b05 // ismap
	Itemid           Hash = 0xff06  // itemid
	Itemprop         Hash = 0x58808 // itemprop
	Itemref          Hash = 0x62207 // itemref
	Itemscope        Hash = 0x35609 // itemscope
	Itemtype         Hash = 0x36008 // itemtype
	Kbd              Hash = 0xca03  // kbd
	Keygen           Hash = 0x4a06  // keygen
	Keytype          Hash = 0x68807 // keytype
	Kind             Hash = 0xd604  // kind
	Label            Hash = 0x7405  // label
	Lang             Hash = 0xb304  // lang
	Language         Hash = 0xb308  // language
	Legend           Hash = 0x1d006 // legend
	Li               Hash = 0x1702  // li
	Link             Hash = 0xd304  // link
	List             Hash = 0x4c404 // list
	Listing          Hash = 0x4c407 // listing
	Longdesc         Hash = 0x7808  // longdesc
	Loop             Hash = 0x12104 // loop
	Low              Hash = 0x23f03 // low
	Main             Hash = 0x1004  // main
	Malignmark       Hash = 0xc10a  // malignmark
	Manifest         Hash = 0x65e08 // manifest
	Map              Hash = 0x14403 // map
	Mark             Hash = 0xc704  // mark
	Marquee          Hash = 0x36807 // marquee
	Math             Hash = 0x36f04 // math
	Max              Hash = 0x37e03 // max
	Maxlength        Hash = 0x37e09 // maxlength
	Media            Hash = 0xde05  // media
	Mediagroup       Hash = 0xde0a  // mediagroup
	Menu             Hash = 0x3c404 // menu
	Meta             Hash = 0x4d304 // meta
	Meter            Hash = 0x2f005 // meter
	Method           Hash = 0x2da06 // method
	Mglyph           Hash = 0x34406 // mglyph
	Mi               Hash = 0x2c02  // mi
	Min              Hash = 0x2c03  // min
	Mn               Hash = 0x2e302 // mn
	Mo               Hash = 0x4f702 // mo
	Ms               Hash = 0x35902 // ms
	Mtext            Hash = 0x38805 // mtext
	Multiple         Hash = 0x39608 // multiple
	Muted            Hash = 0x39e05 // muted
	Name             Hash = 0xdc04  // name
	Nav              Hash = 0x1303  // nav
	Nobr             Hash = 0x1a04  // nobr
	Noembed          Hash = 0x6c07  // noembed
	Noframes         Hash = 0xa108  // noframes
	Nohref           Hash = 0xad06  // nohref
	Noresize         Hash = 0x24b08 // noresize
	Noscript         Hash = 0x31908 // noscript
	Noshade          Hash = 0x4ff07 // noshade
	Novalidate       Hash = 0x2e40a // novalidate
	Nowrap           Hash = 0x59106 // nowrap
	Object           Hash = 0x2b006 // object
	Ol               Hash = 0x17102 // ol
	Onabort          Hash = 0x1bc07 // onabort
	Onafterprint     Hash = 0x2840c // onafterprint
	Onbeforeprint    Hash = 0x2be0d // onbeforeprint
	Onbeforeunload   Hash = 0x6720e // onbeforeunload
	Onblur           Hash = 0x17e06 // onblur
	Oncancel         Hash = 0x11a08 // oncancel
	Oncanplay        Hash = 0x18609 // oncanplay
	Oncanplaythrough Hash = 0x18610 // oncanplaythrough
	Onchange         Hash = 0x42f08 // onchange
	Onclick          Hash = 0x6b607 // onclick
	Onclose          Hash = 0x3a307 // onclose
	Oncontextmenu    Hash = 0x3bb0d // oncontextmenu
	Oncuechange      Hash = 0x3c80b // oncuechange
	Ondblclick       Hash = 0x3d30a // ondblclick
	Ondrag           Hash = 0x3dd06 // ondrag
	Ondragend        Hash = 0x3dd09 // ondragend
	Ondragenter      Hash = 0x3e60b // ondragenter
	Ondragleave      Hash = 0x3f10b // ondragleave
	Ondragover       Hash = 0x3fc0a // ondragover
	Ondragstart      Hash = 0x4060b // ondragstart
	Ondrop           Hash = 0x41706 // ondrop
	Ondurationchange Hash = 0x42710 // ondurationchange
	Onemptied        Hash = 0x41e09 // onemptied
	Onended          Hash = 0x43707 // onended
	Onerror          Hash = 0x43e07 // onerror
	Onfocus          Hash = 0x44507 // onfocus
	Onhashchange     Hash = 0x4650c // onhashchange
	Oninput          Hash = 0x47107 // oninput
	Oninvalid        Hash = 0x47809 // oninvalid
	Onkeydown        Hash = 0x48109 // onkeydown
	Onkeypress       Hash = 0x48e0a // onkeypress
	Onkeyup          Hash = 0x49e07 // onkeyup
	Onload           Hash = 0x4b806 // onload
	Onloadeddata     Hash = 0x4b80c // onloadeddata
	Onloadedmetadata Hash = 0x4cb10 // onloadedmetadata
	Onloadstart      Hash = 0x4e10b // onloadstart
	Onmessage        Hash = 0x4ec09 // onmessage
	Onmousedown      Hash = 0x4f50b // onmousedown
	Onmousemove      Hash = 0x5120b // onmousemove
	Onmouseout       Hash = 0x51d0a // onmouseout
	Onmouseover      Hash = 0x52a0b // onmouseover
	Onmouseup        Hash = 0x53509 // onmouseup
	Onmousewheel     Hash = 0x53e0c // onmousewheel
	Onoffline        Hash = 0x54a09 // onoffline
	Ononline         Hash = 0x55508 // ononline
	Onpagehide       Hash = 0x55d0a // onpagehide
	Onpageshow       Hash = 0x5710a // onpageshow
	Onpause          Hash = 0x57d07 // onpause
	Onplay           Hash = 0x59c06 // onplay
	Onplaying        Hash = 0x59c09 // onplaying
	Onpopstate       Hash = 0x5a50a // onpopstate
	Onprogress       Hash = 0x5af0a // onprogress
	Onratechange     Hash = 0x5be0c // onratechange
	Onreset          Hash = 0x5ca07 // onreset
	Onresize         Hash = 0x5d108 // onresize
	Onscroll         Hash = 0x5d908 // onscroll
	Onseeked         Hash = 0x5e408 // onseeked
	Onseeking        Hash = 0x5ec09 // onseeking
	Onselect         Hash = 0x5f508 // onselect
	Onshow           Hash = 0x5ff06 // onshow
	Onstalled        Hash = 0x60a09 // onstalled
	Onstorage        Hash = 0x61309 // onstorage
	Onsubmit         Hash = 0x61c08 // onsubmit
	Onsuspend        Hash = 0x63009 // onsuspend
	Ontimeupdate     Hash = 0x4590c // ontimeupdate
	Onunload         Hash = 0x63908 // onunload
	Onvolumechange   Hash = 0x6410e // onvolumechange
	Onwaiting        Hash = 0x64f09 // onwaiting
	Open             Hash = 0x58e04 // open
	Optgroup         Hash = 0x12308 // optgroup
	Optimum          Hash = 0x65807 // optimum
	Option           Hash = 0x66e06 // option
	Output           Hash = 0x52406 // output
	P                Hash = 0xc01   // p
	Param            Hash = 0xc05   // param
	Pattern          Hash = 0x9b07  // pattern
	Pauseonexit      Hash = 0x57f0b // pauseonexit
	Picture          Hash = 0xe707  // picture
	Ping             Hash = 0x12a04 // ping
	Placeholder      Hash = 0x16b0b // placeholder
	Plaintext        Hash = 0x1f509 // plaintext
	Poster           Hash = 0x30e06 // poster
	Pre              Hash = 0x34f03 // pre
	Preload          Hash = 0x34f07 // preload
	Profile          Hash = 0x66707 // profile
	Progress         Hash = 0x5b108 // progress
	Prompt           Hash = 0x59606 // prompt
	Public           Hash = 0x4a406 // public
	Q                Hash = 0x8d01  // q
	Radiogroup       Hash = 0x30a   // radiogroup
	Rb               Hash = 0x1d02  // rb
	Readonly         Hash = 0x38e08 // readonly
	Rel              Hash = 0x35003 // rel
	Required         Hash = 0x8b08  // required
	Rev              Hash = 0x29403 // rev
	Reversed         Hash = 0x29408 // reversed
	Rows             Hash = 0x6604  // rows
	Rowspan          Hash = 0x6607  // rowspan
	Rp               Hash = 0x28a02 // rp
	Rt               Hash = 0x1c102 // rt
	Rtc              Hash = 0x1c103 // rtc
	Ruby             Hash = 0xf604  // ruby
	Rules            Hash = 0x17505 // rules
	S                Hash = 0x3d01  // s
	Samp             Hash = 0x9804  // samp
	Sandbox          Hash = 0x16307 // sandbox
	Scope            Hash = 0x35a05 // scope
	Scoped           Hash = 0x35a06 // scoped
	Script           Hash = 0x31b06 // script
	Scrolling        Hash = 0x5db09 // scrolling
	Seamless         Hash = 0x3a808 // seamless
	Section          Hash = 0x17907 // section
	Select           Hash = 0x5f706 // select
	Selected         Hash = 0x5f708 // selected
	Shape            Hash = 0x23105 // shape
	Size             Hash = 0x24f04 // size
	Sizes            Hash = 0x24f05 // sizes
	Small            Hash = 0x23b05 // small
	Sortable         Hash = 0x25308 // sortable
	Source           Hash = 0x26606 // source
	Spacer           Hash = 0x37806 // spacer
	Span             Hash = 0x6904  // span
	Spellcheck       Hash = 0x3af0a // spellcheck
	Src              Hash = 0x44b03 // src
	Srcdoc           Hash = 0x44b06 // srcdoc
	Srclang          Hash = 0x49707 // srclang
	Srcset           Hash = 0x5b806 // srcset
	Start            Hash = 0x40c05 // start
	Step             Hash = 0x66404 // step
	Strike           Hash = 0x68406 // strike
	Strong           Hash = 0x68f06 // strong
	Style            Hash = 0x69505 // style
	Sub              Hash = 0x61e03 // sub
	Summary          Hash = 0x69a07 // summary
	Sup              Hash = 0x6a103 // sup
	Svg              Hash = 0x6a403 // svg
	System           Hash = 0x6a706 // system
	Tabindex         Hash = 0x4d908 // tabindex
	Table            Hash = 0x25605 // table
	Target           Hash = 0x2f906 // target
	Tbody            Hash = 0x3f05  // tbody
	Td               Hash = 0xaa02  // td
	Template         Hash = 0x6aa08 // template
	Text             Hash = 0x1fa04 // text
	Textarea         Hash = 0x38908 // textarea
	Tfoot            Hash = 0xf005  // tfoot
	Th               Hash = 0x18f02 // th
	Thead            Hash = 0x37105 // thead
	Time             Hash = 0x2ee04 // time
	Title            Hash = 0x14a05 // title
	Tr               Hash = 0x1fd02 // tr
	Track            Hash = 0x1fd05 // track
	Translate        Hash = 0x22109 // translate
	Truespeed        Hash = 0x27309 // truespeed
	Tt               Hash = 0x9d02  // tt
	Type             Hash = 0x11204 // type
	Typemustmatch    Hash = 0x1da0d // typemustmatch
	U                Hash = 0xb01   // u
	Ul               Hash = 0x5802  // ul
	Undeterminate    Hash = 0x250d  // undeterminate
	Usemap           Hash = 0x14106 // usemap
	Valign           Hash = 0x1506  // valign
	Value            Hash = 0x10d05 // value
	Valuetype        Hash = 0x10d09 // valuetype
	Var              Hash = 0x32f03 // var
	Video            Hash = 0x6b205 // video
	Visible          Hash = 0x6bd07 // visible
	Vlink            Hash = 0x6c405 // vlink
	Wbr              Hash = 0x57a03 // wbr
	Width            Hash = 0x60405 // width
	Wrap             Hash = 0x59304 // wrap
	Xmlns            Hash = 0x15f05 // xmlns
	Xmp              Hash = 0x16903 // xmp
)

Unique hash definitions to be used instead of strings

func ToHash

func ToHash(s []byte) Hash

ToHash returns the hash whose name is s. It returns zero if there is no such hash. It is case sensitive.

func (Hash) String

func (i Hash) String() string

String returns the hash's name.

type Lexer

type Lexer struct {
	// contains filtered or unexported fields
}

Lexer is the state for the lexer.

func NewLexer

func NewLexer(r io.Reader) *Lexer

NewLexer returns a new Lexer for a given io.Reader.

Example
l := NewLexer(bytes.NewBufferString("<span class='user'>John Doe</span>"))
out := ""
for {
	tt, data := l.Next()
	if tt == ErrorToken {
		break
	}
	out += string(data)
}
fmt.Println(out)
Output:

<span class='user'>John Doe</span>

func (*Lexer) AttrVal

func (l *Lexer) AttrVal() []byte

AttrVal returns the attribute value when an AttributeToken was returned from Next.

func (*Lexer) Err

func (l *Lexer) Err() error

Err returns the error encountered during lexing. This is often io.EOF, but other errors can be returned as well.

func (*Lexer) Next

func (l *Lexer) Next() (TokenType, []byte)

Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.

func (*Lexer) Restore

func (l *Lexer) Restore()

Restore restores the NULL byte at the end of the buffer.

func (*Lexer) Text

func (l *Lexer) Text() []byte

Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.

type TokenType

type TokenType uint32

TokenType determines the type of token, e.g. a start tag, an attribute or text.

const (
	ErrorToken TokenType = iota // extra token when errors occur
	CommentToken
	DoctypeToken
	StartTagToken
	StartTagCloseToken
	StartTagVoidToken
	EndTagToken
	AttributeToken
	TextToken
	SvgToken
	MathToken
)

TokenType values.

func (TokenType) String

func (tt TokenType) String() string

String returns the string representation of a TokenType.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL