html2data

package module
v1.2.3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 27, 2021 License: MIT Imports: 11 Imported by: 2

README

html2data

Go Reference Go Coverage Status Sourcegraph Report Card

Library and cli-utility for extracting data from HTML via CSS selectors

Install

Install package and command line utility:

go get -u github.com/msoap/html2data/cmd/html2data

Install package only:

go get -u github.com/msoap/html2data

Methods

  • FromReader(io.Reader) - create a document for parsing
  • FromURL(URL, [config URLCfg]) - create document from http(s) URL
  • FromFile(file) - create document from local file
  • doc.GetData(css map[string]string) - get texts by CSS selectors
  • doc.GetDataFirst(css map[string]string) - get texts by CSS selectors, get first entry for each selector or ""
  • doc.GetDataNested(outerCss string, css map[string]string) - extract nested data by CSS-selectors from another CSS-selector
  • doc.GetDataNestedFirst(outerCss string, css map[string]string) - extract nested data by CSS-selectors from another CSS-selector, get first entry for each selector or ""
  • doc.GetDataSingle(css string) - get one result by one CSS selector

or with config:

  • doc.GetData(css map[string]string, html2data.Cfg{DontTrimSpaces: true})
  • doc.GetDataNested(outerCss string, css map[string]string, html2data.Cfg{DontTrimSpaces: true})
  • doc.GetDataSingle(css string, html2data.Cfg{DontTrimSpaces: true})

Pseudo-selectors

  • :attr(attr_name) - getting attribute instead of text, for example getting urls from links: a:attr(href)
  • :html - getting HTML instead of text
  • :get(N) - getting n-th element from list

Example

package main

import (
    "fmt"
    "log"

    "github.com/msoap/html2data"
)

func main() {
    // Fetch and parse the page. A html2data.URLCfg may be passed as a second
    // argument to customize the request, e.g.:
    // html2data.FromURL("http://example.com", html2data.URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: false})
    page := html2data.FromURL("http://example.com")
    if page.Err != nil {
        log.Fatal(page.Err)
    }

    // Page title with the default config.
    trimmed, _ := page.GetDataSingle("title")
    fmt.Println("Title is:", trimmed)

    // Same selector, but with DontTrimSpaces set.
    untrimmed, _ := page.GetDataSingle("title", html2data.Cfg{DontTrimSpaces: true})
    fmt.Println("Title as is, with spaces:", untrimmed)

    selectors := map[string]string{
        "h1":    "h1",
        "links": "a:attr(href)",
    }
    found, _ := page.GetData(selectors)

    // Print every H1 header, then every link URL. Ranging over the value of
    // a missing key iterates zero times, so no presence check is needed.
    for _, name := range []string{"h1", "links"} {
        for _, value := range found[name] {
            fmt.Println(value)
        }
    }
}

Command line utility

Homebrew formula exists

Usage
html2data [options] URL "css selector"
html2data [options] URL :name1 "css1" :name2 "css2"...
html2data [options] file.html "css selector"
cat file.html | html2data "css selector"
Options
  • -user-agent="Custom UA" -- set custom user-agent
  • -find-in="outer.css.selector" -- search within the specified elements instead of the whole document
  • -json -- get result as JSON
  • -dont-trim-spaces -- get text as is
  • -dont-detect-charset -- don't detect charset and convert text
  • -timeout=10 -- setting timeout when loading the URL
Install

Download binaries from: releases (OS X/Linux/Windows/RaspberryPi)

Or install from homebrew (MacOS):

brew tap msoap/tools
brew install html2data
# update:
brew upgrade html2data

Using snap (Ubuntu or any Linux distribution with snap):

# install stable version:
sudo snap install html2data

# install the latest version:
sudo snap install --edge html2data

# update
sudo snap refresh html2data

From source:

go get -u github.com/msoap/html2data/cmd/html2data
examples

Get title of page:

html2data https://golang.org/ title

Last blog posts:

html2data https://blog.golang.org/ h3

Getting RSS URL:

html2data https://blog.golang.org/ 'link[type="application/atom+xml"]:attr(href)'

More examples from wiki.

See also

Documentation

Overview

Package html2data - extract data from HTML via CSS selectors

Install package and command line utility:

go get -u github.com/msoap/html2data/cmd/html2data

Install package only:

go get -u github.com/msoap/html2data

Allowed pseudo-selectors:

:attr(attr_name) - for getting attributes instead of text

:html - for getting HTML instead of text

:get(N) - get n-th element from list

Command line utility:

html2data URL "css selector"
html2data file.html "css selector"
cat file.html | html2data "css selector"
Example
package main

import (
	"fmt"
	"log"

	"github.com/msoap/html2data"
)

func main() {
	// Load and parse the page; failures are reported through doc.Err.
	doc := html2data.FromURL("http://example.com")
	// or with config (names must be package-qualified outside html2data):
	// doc := html2data.FromURL("http://example.com", html2data.URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: true})
	if doc.Err != nil {
		log.Fatal(doc.Err)
	}

	// get title
	title, _ := doc.GetDataSingle("title")
	fmt.Println("Title is:", title)

	// get title again, this time with DontTrimSpaces set
	title, _ = doc.GetDataSingle("title", html2data.Cfg{DontTrimSpaces: true})
	fmt.Println("Title as is, with spaces:", title)

	texts, _ := doc.GetData(map[string]string{"h1": "h1", "links": "a:attr(href)"})
	// get all H1 headers (ranging over a missing key is a no-op, so no
	// explicit presence check is required):
	for _, text := range texts["h1"] {
		fmt.Println(text)
	}
	// get all urls from links
	for _, text := range texts["links"] {
		fmt.Println(text)
	}
}
Output:

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type CSSSelector

// CSSSelector is a selector with settings.
type CSSSelector struct {
	// contains filtered or unexported fields
}

CSSSelector - selector with settings

type Cfg

// Cfg is the configuration accepted by the GetData* methods.
type Cfg struct {
	DontTrimSpaces bool // get text as is; by default surrounding spaces are trimmed
}

Cfg - config for GetData* methods

type Doc

// Doc is an HTML document to extract data from.
type Doc struct {
	// Err is the value the package examples check after FromFile, FromReader
	// and FromURL to find out whether creating the document failed.
	Err error
	// contains filtered or unexported fields
}

Doc - HTML document for parsing

func FromFile

func FromFile(fileName string) Doc

FromFile - get doc from file

Example
package main

import (
	"log"

	"github.com/msoap/html2data"
)

func main() {
	// Creating the document reports failure through the Err field of the
	// returned Doc, so it is checked right where the document is made.
	if doc := html2data.FromFile("file_name.html"); doc.Err != nil {
		log.Fatal(doc.Err)
	}
}
Output:

func FromReader

func FromReader(reader io.Reader) Doc

FromReader - get doc from io.Reader

Example
package main

import (
	"bufio"
	"log"
	"os"

	"github.com/msoap/html2data"
)

func main() {
	// Parse whatever HTML arrives on standard input.
	stdin := bufio.NewReader(os.Stdin)
	if doc := html2data.FromReader(stdin); doc.Err != nil {
		log.Fatal(doc.Err)
	}
}
Output:

func FromURL

func FromURL(URL string, config ...URLCfg) Doc

FromURL - get doc from URL

FromURL("https://url")
FromURL("https://url", URLCfg{UA: "Custom UA 1.0", TimeOut: 10})
Example
package main

import (
	"log"

	"github.com/msoap/html2data"
)

func main() {
	const target = "http://example.com"

	// Plain call without any config.
	if doc := html2data.FromURL(target); doc.Err != nil {
		log.Fatal(doc.Err)
	}

	// Same URL with an explicit config: custom user-agent and a timeout in
	// seconds; DontDetectCharset is left false.
	cfg := html2data.URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: false}
	if doc := html2data.FromURL(target, cfg); doc.Err != nil {
		log.Fatal(doc.Err)
	}
}
Output:

func (Doc) GetData

func (doc Doc) GetData(selectors map[string]string, configs ...Cfg) (result map[string][]string, err error)

GetData - extract data by CSS-selectors

texts, err := doc.GetData(map[string]string{"h1": "h1"})
Example
package main

import (
	"fmt"

	"github.com/msoap/html2data"
)

func main() {
	doc := html2data.FromURL("http://example.com")
	selectors := map[string]string{
		"headers": "h1",
		"links":   "a:attr(href)",
	}
	found, _ := doc.GetData(selectors)

	// Print all H1 headers first, then all link URLs. A key that is absent
	// from the result yields an empty range, so no presence check is needed.
	for _, name := range []string{"headers", "links"} {
		for _, value := range found[name] {
			fmt.Println(value)
		}
	}
}
Output:

func (Doc) GetDataFirst

func (doc Doc) GetDataFirst(selectors map[string]string, configs ...Cfg) (result map[string]string, err error)

GetDataFirst - extract data by CSS-selectors, get first entry for each selector or ""

texts, err := doc.GetDataFirst(map[string]string{"h1": "h1"})
Example
package main

import (
	"fmt"
	"log"

	"github.com/msoap/html2data"
)

func main() {
	doc := html2data.FromURL("http://example.com")
	selectors := map[string]string{
		"header":     "h1",
		"first_link": "a:attr(href)",
	}

	// GetDataFirst returns the first entry for each selector, or "".
	firsts, err := doc.GetDataFirst(selectors)
	if err != nil {
		log.Fatal(err)
	}

	// H1 header, then the URL of the first link.
	fmt.Println("header: ", firsts["header"])
	fmt.Println("first link: ", firsts["first_link"])
}
Output:

func (Doc) GetDataNested

func (doc Doc) GetDataNested(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string][]string, err error)

GetDataNested - extract nested data by CSS-selectors from another CSS-selector

texts, err := doc.GetDataNested("CSS.selector", map[string]string{"h1": "h1"}) - get h1 from CSS.selector
Example
package main

import (
	"fmt"

	"github.com/msoap/html2data"
)

func main() {
	doc := html2data.FromFile("test.html")
	inner := map[string]string{
		"headers": "h1",
		"links":   "a:attr(href)",
	}
	articles, _ := doc.GetDataNested("div.article", inner)

	// For each <div class="article">, print its H1 headers and then the URLs
	// of its links. Missing keys range over nothing, so no presence check is
	// required.
	for _, article := range articles {
		for _, name := range []string{"headers", "links"} {
			for _, value := range article[name] {
				fmt.Println(value)
			}
		}
	}
}
Output:

func (Doc) GetDataNestedFirst

func (doc Doc) GetDataNestedFirst(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string]string, err error)

GetDataNestedFirst - extract nested data by CSS-selectors from another CSS-selector, get first entry for each selector or ""

texts, err := doc.GetDataNestedFirst("CSS.selector", map[string]string{"h1": "h1"}) - get h1 from CSS.selector
Example
package main

import (
	"fmt"
	"log"

	"github.com/msoap/html2data"
)

func main() {
	// Inside every div.block, take the first match of each nested selector.
	texts, err := html2data.FromFile("cmd/html2data/test.html").GetDataNestedFirst("div.block", map[string]string{"header": "h1", "link": "a:attr(href)", "sp": "span"})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("")
	for _, block := range texts {
		// get first H1 header
		fmt.Printf("header - %s\n", block["header"])

		// get first link
		fmt.Printf("first URL - %s\n", block["link"])

		// The span selector is registered under the key "sp", so it must be
		// read back under that key; reading "span" would always print the
		// map's zero value regardless of the document. Judging by the
		// expected output, the blocks contain no <span>, so this prints ''
		// (NOTE(review): confirm against test.html).
		fmt.Printf("span - '%s'\n", block["sp"])
	}

}
Output:

header - Head1.1
first URL - http://url1
span - ''
header - Head2.1
first URL - http://url2
span - ''

func (Doc) GetDataSingle

func (doc Doc) GetDataSingle(selector string, configs ...Cfg) (result string, err error)

GetDataSingle - extract data by one CSS-selector

title, err := doc.GetDataSingle("title")
Example
package main

import (
	"fmt"
	"log"

	"github.com/msoap/html2data"
)

func main() {
	doc := html2data.FromFile("cmd/html2data/test.html")

	// Extract the text matched by the single "title" selector.
	pageTitle, err := doc.GetDataSingle("title")
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("Title is:", pageTitle)
}
Output:

Title is: Title

type URLCfg

// URLCfg is the configuration for FromURL.
type URLCfg struct {
	UA                string // custom user-agent
	TimeOut           int    // timeout in seconds
	DontDetectCharset bool   // don't auto-convert the text to UTF-8
}

URLCfg - config for FromURL()

Directories

Path Synopsis
cmd

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL