html2data: github.com/msoap/html2data Index | Examples | Files | Directories

package html2data

import "github.com/msoap/html2data"

Package html2data - extract data from HTML via CSS selectors

Install package and command line utility:

go get -u github.com/msoap/html2data/cmd/html2data

Install package only:

go get -u github.com/msoap/html2data

Allowed pseudo-selectors:

:attr(attr_name) - for getting an attribute's value instead of text

:html - for getting HTML instead of text

:get(N) - get the N-th element from the list

Command line utility:

html2data URL "css selector"
html2data file.html "css selector"
cat file.html | html2data "css selector"

Code:

doc := FromURL("http://example.com")
// or with config
// doc := FromURL("http://example.com", URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: true})
if doc.Err != nil {
    log.Fatal(doc.Err)
}

// get title
title, _ := doc.GetDataSingle("title")
fmt.Println("Title is:", title)

title, _ = doc.GetDataSingle("title", Cfg{DontTrimSpaces: true})
fmt.Println("Title as is, with spaces:", title)

texts, _ := doc.GetData(map[string]string{"h1": "h1", "links": "a:attr(href)"})
// get all H1 headers:
if textOne, ok := texts["h1"]; ok {
    for _, text := range textOne {
        fmt.Println(text)
    }
}
// get all urls from links
if links, ok := texts["links"]; ok {
    for _, text := range links {
        fmt.Println(text)
    }
}

Index

Examples

Package Files

html2data.go

type CSSSelector Uses

type CSSSelector struct {
    // contains filtered or unexported fields
}

CSSSelector - selector with settings

type Cfg Uses

type Cfg struct {
    DontTrimSpaces bool // get text as is, by default trim spaces
}

Cfg - config for GetData* methods

type Doc Uses

type Doc struct {
    Err error
    // contains filtered or unexported fields
}

Doc - HTML document to parse

func FromFile Uses

func FromFile(fileName string) Doc

FromFile - get doc from file

Code:

doc := FromFile("file_name.html")
if doc.Err != nil {
    log.Fatal(doc.Err)
}

func FromReader Uses

func FromReader(reader io.Reader) Doc

FromReader - get doc from io.Reader

Code:

doc := FromReader(bufio.NewReader(os.Stdin))
if doc.Err != nil {
    log.Fatal(doc.Err)
}

func FromURL Uses

func FromURL(URL string, config ...URLCfg) Doc

FromURL - get doc from URL

FromURL("https://url")
FromURL("https://url", URLCfg{UA: "Custom UA 1.0", TimeOut: 10})

Code:

doc := FromURL("http://example.com")
if doc.Err != nil {
    log.Fatal(doc.Err)
}

// or with config
doc = FromURL("http://example.com", URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: false})
if doc.Err != nil {
    log.Fatal(doc.Err)
}

func (Doc) GetData Uses

func (doc Doc) GetData(selectors map[string]string, configs ...Cfg) (result map[string][]string, err error)

GetData - extract data by CSS-selectors

texts, err := doc.GetData(map[string]string{"h1": "h1"})

Code:

texts, _ := FromURL("http://example.com").GetData(map[string]string{"headers": "h1", "links": "a:attr(href)"})
// get all H1 headers:
if textOne, ok := texts["headers"]; ok {
    for _, text := range textOne {
        fmt.Println(text)
    }
}
// get all urls from links
if links, ok := texts["links"]; ok {
    for _, text := range links {
        fmt.Println(text)
    }
}

func (Doc) GetDataFirst Uses

func (doc Doc) GetDataFirst(selectors map[string]string, configs ...Cfg) (result map[string]string, err error)

GetDataFirst - extract data by CSS-selectors, get first entry for each selector or ""

texts, err := doc.GetDataFirst(map[string]string{"h1": "h1"})

Code:

texts, err := FromURL("http://example.com").GetDataFirst(map[string]string{"header": "h1", "first_link": "a:attr(href)"})
if err != nil {
    log.Fatal(err)
}

// get H1 header:
fmt.Println("header: ", texts["header"])
// get URL in first link:
fmt.Println("first link: ", texts["first_link"])

func (Doc) GetDataNested Uses

func (doc Doc) GetDataNested(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string][]string, err error)

GetDataNested - extract nested data by CSS-selectors from another CSS-selector

texts, err := doc.GetDataNested("CSS.selector", map[string]string{"h1": "h1"}) - get h1 from CSS.selector

Code:

texts, _ := FromFile("test.html").GetDataNested("div.article", map[string]string{"headers": "h1", "links": "a:attr(href)"})
for _, article := range texts {
    // get all H1 headers inside each <div class="article">:
    if textOne, ok := article["headers"]; ok {
        for _, text := range textOne {
            fmt.Println(text)
        }
    }
    // get all urls from links inside each <div class="article">
    if links, ok := article["links"]; ok {
        for _, text := range links {
            fmt.Println(text)
        }
    }
}

func (Doc) GetDataNestedFirst Uses

func (doc Doc) GetDataNestedFirst(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string]string, err error)

GetDataNestedFirst - extract nested data by CSS-selectors from another CSS-selector, get first entry for each selector or ""

texts, err := doc.GetDataNestedFirst("CSS.selector", map[string]string{"h1": "h1"}) - get h1 from CSS.selector

Code:

texts, err := FromFile("cmd/html2data/test.html").GetDataNestedFirst("div.block", map[string]string{"header": "h1", "link": "a:attr(href)", "sp": "span"})
if err != nil {
    log.Fatal(err)
}

fmt.Println("")
for _, block := range texts {
    // get first H1 header
    fmt.Printf("header - %s\n", block["header"])

    // get first link
    fmt.Printf("first URL - %s\n", block["link"])

    // get a span that does not exist
    fmt.Printf("span - '%s'\n", block["span"])
}

Output:

header - Head1.1
first URL - http://url1
span - ''
header - Head2.1
first URL - http://url2
span - ''

func (Doc) GetDataSingle Uses

func (doc Doc) GetDataSingle(selector string, configs ...Cfg) (result string, err error)

GetDataSingle - extract data by one CSS-selector

title, err := doc.GetDataSingle("title")

Code:

// get title
title, err := FromFile("cmd/html2data/test.html").GetDataSingle("title")
if err != nil {
    log.Fatal(err)
}

fmt.Println("Title is:", title)

Output:

Title is: Title

type URLCfg Uses

type URLCfg struct {
    UA                string // custom user-agent
    TimeOut           int    // timeout in seconds
    DontDetectCharset bool   // don't autoconvert to UTF8
}

URLCfg - config for FromURL()

Directories

Path | Synopsis
cmd/html2data

Package html2data imports 11 packages (graph) and is imported by 3 packages. Updated 2019-02-11. Refresh now. Tools for package owners.