document

package
v0.0.0-...-9a2d8fb Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 12, 2013 License: Apache-2.0 Imports: 22 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func DownloadFile

func DownloadFile(uri string) (string, string, string, error)

Downloads uri to a temporary path; the saved file is already un-gzipped. Returns filepath, media-type, charset, error.

func DownloadHtml

func DownloadHtml(uri string) (string, string, error)

The downloaded HTML has already been converted to UTF-8. Returns local_filepath, media_type, error.

func FlattenHtmlDocument

func FlattenHtmlDocument(body *html.Node) (doc *html.Node, article *html.Node)

func GetInnerText

func GetInnerText(n *html.Node) string

func NewHtmlDocument

func NewHtmlDocument(localpath string) (*html.Node, error)

Reads a UTF-8 encoded HTML file.

func WriteHtmlFile2

func WriteHtmlFile2(doc *html.Node) (string, error)

Writes the html.Node to a temporary file, UTF-8 encoded. Returns tmp_filename, error.

Types

type Boilerpiper

type Boilerpiper struct {
	// contains filtered or unexported fields
}

func NewBoilerpiper

func NewBoilerpiper(article *html.Node) *Boilerpiper

func (*Boilerpiper) FormPrefixFilter

func (this *Boilerpiper) FormPrefixFilter()

清除表单前的提示行

func (*Boilerpiper) NumberWordsRulesFilter

func (this *Boilerpiper) NumberWordsRulesFilter()

An implementation of http://www.l3s.de/~kohlschuetter/boilerplate/

type Curl

type Curl interface {
	Download(uri string) (filepath, media_type, charset string, err error)
	DownloadHtml(uri string) (filepath, media_type string, err error)
}

func DefaultCurl

func DefaultCurl() Curl

func NewCurl

func NewCurl(tmpdir string) Curl

type Curler

type Curler struct {
	TempDir string
	Prefix  string
	UseExt  bool
}

func (*Curler) Download

func (this *Curler) Download(uri string) (filepath, media_type, charset string, err error)

func (*Curler) DownloadHtml

func (this *Curler) DownloadHtml(uri string) (filepath, media_type string, err error)

type HtmlCleaner

type HtmlCleaner struct {
	Article *html.Node // body or article or a table's body
	// contains filtered or unexported fields
}

func NewHtmlCleaner

func NewHtmlCleaner(u string) *HtmlCleaner

func (*HtmlCleaner) CleanForm

func (this *HtmlCleaner) CleanForm()

func (*HtmlCleaner) CleanHtml

func (cleaner *HtmlCleaner) CleanHtml(root *html.Node)

CleanHtml:清洗掉所有的 link/style/css;删除 /html/head;转换所有的 tag 为小写字母;找到 body/article 节点;找到 h1 节点或者 h2 节点,根据数目设置 body。

func (*HtmlCleaner) String

func (this *HtmlCleaner) String() string

type Readabilitier

type Readabilitier struct {
	// contains filtered or unexported fields
}

func NewReadabilitier

func NewReadabilitier(body *html.Node) *Readabilitier

func (*Readabilitier) CreateArticle

func (this *Readabilitier) CreateArticle() (*html.Node, *html.Node)

func (*Readabilitier) String

func (this *Readabilitier) String() string

type SummaryScore

type SummaryScore struct {
	WordCount int `json:"word_count" bson:"word_count"`
	//	ImageCount int      `json:"image_count" bson:"image_count"`
	LinkCount int      `json:"link_count" bson:"link_count"`
	Images    []string `json:"image,omitempty" bson:"image,omitempty"`
}

func CleanFragment

func CleanFragment(cont, uri string) (string, *SummaryScore)

Returns local_filepath and a *SummaryScore (which carries the word count and images).

func ExtractHtml

func ExtractHtml(url string) (string, *SummaryScore, error)

Produces a cleaned, UTF-8 encoded HTML document. Returns filepath, *SummaryScore, error.

func NewSummaryScore

func NewSummaryScore(n *html.Node) *SummaryScore

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL