fastxml

package module
v0.0.0-...-e1ff3be Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 8, 2021 License: MIT Imports: 10 Imported by: 0

README

fastxml Go Reference

A "fast" implementation of Golang's xml.TokenReader for well-formed XML input.

Security

Some of fastxml's performance gains come from assuming that the input XML is well-formed. It should never be used in a security-sensitive context (e.g., parsing SAML data), as it can almost certainly be tricked into parsing data incorrectly or even panicking.

Benchmark

Testing against the SwissProt (109 MB) XML file shows a 2x performance improvement over stdlib and a 26x improvement when using just Scanner (somewhat unfair):

$ go test -bench=. -benchmem
goos: darwin
goarch: amd64
pkg: github.com/bored-engineer/fastxml
BenchmarkScanner-12               	       8	 126334701 ns/op	       0 B/op	       0 allocs/op
BenchmarkEncodingXMLDecoder-12    	       1	3336588490 ns/op	715211208 B/op	23563878 allocs/op
BenchmarkXMLTokenReader-12        	       1	1526152566 ns/op	702095696 B/op	15335500 allocs/op
PASS
ok  	github.com/bored-engineer/fastxml	8.168s

Also note, fastxml has an unfair advantage in these benchmarks over stdlib as it only operates on a complete []byte slice instead of a streaming io.Reader.

Usage

import (
  "io"
  "log"

  "github.com/bored-engineer/fastxml"
)

// main scans a small sample document token by token and logs each token.
// The calls follow the function signatures listed in the Documentation
// section of this page.
func main() {
  tr := fastxml.NewScanner([]byte(`<!directive>some <xml key="value">data`))
  for {
    token, chardata, err := tr.Next()
    atEOF := err == io.EOF
    if err != nil && !atEOF {
      log.Fatal(err)
    }
    // Next returns io.EOF together with the trailing token (if any), so that
    // token must still be handled before the loop ends.
    // NOTE(review): assumes the token is empty when nothing trails — confirm.
    if !atEOF || len(token) > 0 {
      logToken(token, chardata)
    }
    if atEOF {
      return
    }
  }
}

// logToken classifies a single token produced by Scanner.Next and logs its
// decoded contents. chardata is the flag Next returned alongside token.
func logToken(token []byte, chardata bool) {
  switch {
  case chardata:
    // A nil scratch is passed so no existing buffer is reused.
    // NOTE(review): nil is documented as allowed for DecodeEntities; presumably
    // CharData behaves the same — confirm.
    decoded, err := fastxml.CharData(token, nil)
    if err != nil {
      log.Fatalf("failed to decode %q: %s", string(token), err)
    }
    log.Printf("CharData: %q", string(decoded))
  case fastxml.IsDirective(token):
    dir := fastxml.Directive(token)
    log.Printf("Directive: %q", string(dir))
  case fastxml.IsProcInst(token):
    target, inst := fastxml.ProcInst(token)
    log.Printf("ProcInst: (%q, %q)", string(target), string(inst))
  case fastxml.IsComment(token):
    comment := fastxml.Comment(token)
    log.Printf("Comment: %q", string(comment))
  default:
    name, attrs := fastxml.Element(token)
    space, local := fastxml.Name(name)
    // %t is the fmt verb for booleans; %b is for base-2 numbers.
    log.Printf("Element: (%q, %q) %t", string(space), string(local), fastxml.IsSelfClosing(token))
    if fastxml.IsStartElement(token) {
      // The callback returns true to keep iterating; Attrs stops when it
      // returns false.
      if err := fastxml.Attrs(attrs, func(key, val []byte) bool {
        decoded, err := fastxml.DecodeEntities(val, nil)
        if err != nil {
          log.Fatalf("failed to decode %q: %s", string(val), err)
        }
        log.Printf("%q: %q", string(key), string(decoded))
        return true
      }); err != nil {
        log.Fatalf("failed to read attribute: %s", err)
      }
    }
  }
}

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func Attr

func Attr(attrsToken []byte, attrKey []byte) (attrValue []byte, err error)

Attr reads a specific attribute and returns the (non-decoded) value

func Attrs

func Attrs(attrsToken []byte, f func(key []byte, value []byte) bool) error

Attrs calls f for each key="value" in token, stopping if f returns false. The value will _not_ be decoded yet.

func CharData

func CharData(charToken []byte, scratch []byte) ([]byte, error)

CharData will output the decoded CharData

func CharDataAppend

func CharDataAppend(out []byte, charToken []byte) ([]byte, error)

CharDataAppend will efficiently append the decoded CharData to the output slice

func Comment

func Comment(token []byte) []byte

Comment extracts the contents of a comment

func DecodeEntities

func DecodeEntities(in []byte, scratch []byte) ([]byte, error)

DecodeEntities will resolve any (known) XML entities in the input. scratch is an optional existing byte slice to append the decoded values to. If scratch is nil, a new slice will be allocated.

func DecodeEntitiesAppend

func DecodeEntitiesAppend(out []byte, in []byte) ([]byte, error)

DecodeEntitiesAppend will efficiently append the decoded form of in to out. It behaves the same as DecodeEntities.

func Directive

func Directive(b []byte) []byte

Directive returns the contents of a directive (ex: `<!text>` -> `text`)

func Element

func Element(token []byte) (name []byte, attrs []byte)

Element extracts the name of the element (ex: `<foo:bar key="val"/>` -> `foo:bar`) and attribute sections

func IsComment

func IsComment(token []byte) bool

IsComment determines if a Directive is a comment (<!--)

func IsDirective

func IsDirective(b []byte) bool

IsDirective determines if a []byte is a directive (ex: <!text>)

func IsElement

func IsElement(token []byte) bool

IsElement checks if a []byte is an element (is not a ProcInst or Directive)

func IsEndElement

func IsEndElement(token []byte) bool

IsEndElement checks if a []byte is a </element>

func IsProcInst

func IsProcInst(b []byte) bool

IsProcInst determines if a []byte is a proc inst (ex: <?target inst>)

func IsSelfClosing

func IsSelfClosing(token []byte) bool

IsSelfClosing checks if a []byte is a self-closing element (<element/>)

func IsStartElement

func IsStartElement(token []byte) bool

IsStartElement is the inverse of IsEndElement

func Name

func Name(token []byte) (space []byte, local []byte)

Name produces the space and local values given a name (ex: `foo:bar` -> (`foo`, `bar`))

func NewXMLTokenReader

func NewXMLTokenReader(s *Scanner) xml.TokenReader

NewXMLTokenReader creates a xml.TokenReader given a scanner

func ProcInst

func ProcInst(b []byte) (target []byte, inst []byte)

ProcInst extracts the target and inst from a ProcInst (ex: `<?target inst>` -> (`target`, `inst`))

func RawAttr

func RawAttr(attrsToken []byte, attrKey []byte) (start int, stop int, err error)

RawAttr reads a specific attribute value (or -1 if not found)

func RawAttrs

func RawAttrs(attrsToken []byte, f func(keyStart, keyEnd, valueStart, valueEnd int) bool) error

RawAttrs calls f for each key="value" in token, stopping if f returns false

func String

func String(buf []byte) string

String performs an _unsafe_ no-copy string allocation from buf. https://github.com/golang/go/issues/25484 has more info on this. The implementation is roughly taken from strings.Builder's String method.

This function is used internally to build encoding/xml elements without copying the underlying values on the assumption the original bytes slice given to NewScanner was immutable.

func XMLAttr

func XMLAttr(key []byte, value []byte) (attr xml.Attr, err error)

XMLAttr produces a xml.Attr given a key, value

func XMLAttrs

func XMLAttrs(token []byte) ([]xml.Attr, error)

XMLAttrs produces a []xml.Attr given attributes slice

func XMLCharData

func XMLCharData(token []byte, scratch []byte) (xml.CharData, error)

XMLCharData produces a xml.CharData given a token

func XMLComment

func XMLComment(token []byte) xml.Comment

XMLComment produces a xml.Comment given a token

func XMLDirective

func XMLDirective(token []byte) xml.Directive

XMLDirective produces a xml.Directive given a token

func XMLElement

func XMLElement(token []byte) (xml.Token, error)

XMLElement produces a xml.EndElement or xml.StartElement depending on IsEndElement

func XMLEndElement

func XMLEndElement(token []byte) xml.EndElement

XMLEndElement produces a xml.EndElement given a token

func XMLName

func XMLName(token []byte) xml.Name

XMLName produces a xml.Name given a token

func XMLProcInst

func XMLProcInst(token []byte) xml.ProcInst

XMLProcInst produces a xml.ProcInst given a token

func XMLStartElement

func XMLStartElement(token []byte) (xml.StartElement, error)

XMLStartElement produces a xml.StartElement given a token

func XMLToken

func XMLToken(token []byte, chardata bool) (xml.Token, error)

XMLToken produces a xml.Token given a piece of data

Types

type Scanner

type Scanner struct {
	// contains filtered or unexported fields
}

Scanner reads a []byte emitting each "token" as a slice

func NewScanner

func NewScanner(buf []byte) *Scanner

NewScanner creates a *Scanner for a given byte slice

func (*Scanner) Next

func (s *Scanner) Next() (token []byte, chardata bool, err error)

Next produces the next token from the scanner. When no more tokens are available, io.EOF is returned along with the trailing token (if any).

func (*Scanner) NextElement

func (s *Scanner) NextElement() (elemToken []byte, err error)

NextElement calls Next until an Element is reached

func (*Scanner) Offset

func (s *Scanner) Offset() int

Offset outputs the internal position the Scanner is at

func (*Scanner) Reset

func (s *Scanner) Reset(buf []byte)

Reset replaces the buf in scanner to a new slice

func (*Scanner) Seek

func (s *Scanner) Seek(offset int64, whence int) (int64, error)

Seek implements the io.Seeker interface

func (*Scanner) Skip

func (s *Scanner) Skip() error

Skip will skip until the end of the most recently processed element

func (*Scanner) SkipElement

func (s *Scanner) SkipElement(elemToken []byte) error

SkipElement extends Skip with a helper for self-closed elements. It is faster than SkipToken as it assumes the token is an element.

func (*Scanner) SkipToken

func (s *Scanner) SkipToken(token []byte) error

SkipToken extends Skip with a helper for self-closed elements. token is an _optional_ parameter; if present, it will check if the element was a self-closed element, in which case it will exit immediately.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL