sleekhtml

package module
v0.0.0-...-3702361 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 10, 2015 License: MIT Imports: 5 Imported by: 0

README

sleekhtml

Cleans & grooms an HTML document by removing unnecessary white-space, HTML tags, comments & other elements.

Build Status GoDoc MITLicense

Benchmark (default options)

Website Raw wget (bytes) Sanitized (bytes)
example.com 12876 531
google.com 116772 2781
economist.com 194588 62278

Tags

Use tags to get desired output

Defaults:

// DefaultIgnoredHTMLTags - HTML tags that are ignored/removed by default.
// Besides script, style, iframe & hr, the list covers form controls
// (form, input, select, ...) and plugin-style elements (embed, object, ...).
DefaultIgnoredHTMLTags = []atom.Atom{
	atom.Script, atom.Style, atom.Iframe, atom.Hr,

	// Form, input, selects
	atom.Form, atom.Input, atom.Select, atom.Label,
	atom.Fieldset, atom.Button, atom.Textarea,

	atom.Noembed, atom.Embed, atom.Object, atom.Base,
	atom.Canvas, atom.Svg,
}

// DefaultAllowedHTMLAttributes - HTML attributes that are allowed by default.
// The http-equiv, content & charset attributes should always be present
// since they handle HTML encoding.
DefaultAllowedHTMLAttributes = []string{
	"src", "href", "title", "alt",
	"rel", "http-equiv", "content", "name",
	"description", "charset", "lang",
	"itemprop", "itemscope", "itemref", "itemtype", // Microdata
}

AllowIEComments: false

Usage

package main

import (
	"fmt"
	"net/http"

	"github.com/sleekhtml/sleekhtml"
)

// main fetches http://www.example.com/, sanitizes the response body using the
// default Tags and prints the sanitized HTML to standard output.
// Any error from the request or from Sanitize aborts the program via panic.
func main() {
	response, err := http.Get("http://www.example.com/")
	if err != nil {
		panic(err)
	}
	// Deferred only after the error check: when the request fails, response
	// is nil and response.Body.Close() would be a nil pointer dereference.
	defer response.Body.Close()

	tags := sleekhtml.NewTags() // Default Options

	output, err := sleekhtml.Sanitize(response.Body, tags)
	if err != nil {
		panic(err)
	}

	// Sanitize returns []byte; convert it so the HTML is printed as text
	// instead of as a list of byte values.
	fmt.Println(string(output))
}

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	// DefaultIgnoredHTMLTags - HTML tags that are ignored/removed by default.
	// Besides script, style, iframe & hr, the list covers:
	// Form, input, selects
	// Plugin
	DefaultIgnoredHTMLTags = []atom.Atom{
		atom.Script, atom.Style, atom.Iframe, atom.Hr,

		atom.Form, atom.Input, atom.Select, atom.Label,
		atom.Fieldset, atom.Button, atom.Textarea,

		atom.Noembed, atom.Embed, atom.Object, atom.Base,
		atom.Canvas, atom.Svg,
	}

	// DefaultAllowedHTMLAttributes - HTML attributes that are allowed by default.
	// The http-equiv, content & charset attributes should always be present
	// since they handle HTML encoding.
	DefaultAllowedHTMLAttributes = []string{
		"src", "href", "title", "alt",
		"rel", "http-equiv", "content", "name",
		"description", "charset", "lang",
		"itemprop", "itemscope", "itemref", "itemtype",
	}
)

Functions

func Process

func Process(parser *html.Tokenizer, tags *Tags, tokenFilter FilterTokenFunc) ([]byte, error)

Process -

func Sanitize

func Sanitize(r io.Reader, tags *Tags) ([]byte, error)

Sanitize - sanitizes & grooms HTML from unnecessary space & tags

Types

type FilterTokenFunc

type FilterTokenFunc func(t *html.Token)

FilterTokenFunc - callback function used to process a token

type Tags

// Tags - options that control which HTML tags, attributes and comments
// are handled.
type Tags struct {

	// IgnoredHTMLTags - HTML tags which will be ignored/removed
	IgnoredHTMLTags []atom.Atom

	// AllowedHTMLAttributes - HTML attributes which are allowed
	AllowedHTMLAttributes []string

	// AllowIEComments - whether IE comments are saved rather than ignored
	// NOTE(review): presumably true keeps them; confirm against the implementation
	AllowIEComments bool
}

Tags - HTML tags structure

func NewTags

func NewTags() *Tags

NewTags - initializes Tags with default values; the defaults can be overridden on Tags initialization

func (*Tags) IsAllowedAttribute

func (t *Tags) IsAllowedAttribute(attr string) bool

IsAllowedAttribute - checks whether HTML attribute is allowed

func (*Tags) IsIgnoredHTMLTag

func (t *Tags) IsIgnoredHTMLTag(a atom.Atom) bool

IsIgnoredHTMLTag - checks whether the HTML tag is ignored

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL