gec

package module
v0.0.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 12, 2015 License: MIT Imports: 7 Imported by: 0

README

gec

"gec" is a Go port of ExtractContent.rb.

Original

Install

go get github.com/yukihir0/gec

How to use

text := "..."
opt := gec.NewOption()
content, title := gec.Analyse(text, opt)

License

Copyright © 2015 yukihir0

Documentation

Overview

gec is a Go port of ExtractContent.rb. Original: http://labs.cybozu.co.jp/blog/nakatani/2007/09/web_1.html

Index

Constants

View Source
// Untyped numeric constants exported by the package; both are 1.0.
// NOTE(review): the names suggest they relate to Option.DecayFactor and
// Option.ContinuousFactor, but their use is not visible here — confirm
// against Option.Initialize before relying on that.
const (
	DECAY      = 1.0
	CONTINUOUS = 1.0
)

Variables

This section is empty.

Functions

func Analyse

func Analyse(doc string, o *Option) (content, title string)

Types

type Block

// Block pairs a piece of text with a floating-point score.
type Block struct {
	// Text is the text held by the block.
	Text  string
	// Score is the score associated with Text.
	Score float64
}

func NewBlock

func NewBlock(text string, score float64) (b *Block)

type BlockList

// BlockList is a slice of pointers to Block.
// NOTE(review): the Len, Less and Swap methods declared on this type have the
// shape of sort.Interface; the ordering implemented by Less is not visible
// here — confirm before depending on a particular sort direction.
type BlockList []*Block

func NewBlockList

func NewBlockList() (bl BlockList)

func (BlockList) AppendBlock

func (self BlockList) AppendBlock() (bl BlockList)

func (BlockList) Len

func (self BlockList) Len() int

func (BlockList) Less

func (self BlockList) Less(i, j int) bool

func (BlockList) Swap

func (self BlockList) Swap(i, j int)

type BlockProcessor

type BlockProcessor struct {
	// contains filtered or unexported fields
}

func NewBlockProcessor

func NewBlockProcessor(o *Option, tp *TextProcessor) (bp *BlockProcessor)

func (*BlockProcessor) GetMaxScoreContent

func (self *BlockProcessor) GetMaxScoreContent() (c string)

func (*BlockProcessor) Process

func (self *BlockProcessor) Process(doc string)

type Extractor

type Extractor struct {
	// contains filtered or unexported fields
}

func NewExtractor

func NewExtractor(o *Option) (e *Extractor)

func (*Extractor) ExtractContent

func (self *Extractor) ExtractContent(doc string) (c string)

func (*Extractor) ExtractTitle

func (self *Extractor) ExtractTitle(doc string) (t string)

type Option

// Option holds the settings passed to Analyse, NewExtractor,
// NewBlockProcessor and NewTextProcessor.
type Option struct {
	Threashold        float64        // score threshold for treating a block as body text
	MinLength         int            // minimum block length that is evaluated
	DecayFactor       float64        // decay factor (the smaller it is, the higher blocks near the top score)
	ContinuousFactor  float64        // continuous-block factor (the larger it is, the less readily blocks are judged continuous)
	NotBodyFactor     float64        // non-body factor (the larger it is, the higher a block scores)
	PunctuationWeight int            // score given to punctuation marks
	Punctuations      *regexp.Regexp // punctuation marks
	WasteExpressions  *regexp.Regexp // characteristic keywords found in footers
	DomSeparator      string         // string inserted between DOM elements
	Debug             bool           // output block information
}

func NewOption

func NewOption() (o *Option)

func (*Option) Initialize

func (self *Option) Initialize()

type TextProcessor

type TextProcessor struct {
	// contains filtered or unexported fields
}

func NewTextProcessor

func NewTextProcessor(o *Option) (tp *TextProcessor)

func (*TextProcessor) EliminateLink

func (self *TextProcessor) EliminateLink(doc string) (s string)

func (*TextProcessor) EliminateTag

func (self *TextProcessor) EliminateTag(doc string) (s string)

func (*TextProcessor) EliminateTags

func (self *TextProcessor) EliminateTags(doc string, separator string) (s string)

func (*TextProcessor) EliminateUselessTags

func (self *TextProcessor) EliminateUselessTags(doc string) (s string)

func (*TextProcessor) HasFramesetOrRedirect

func (self *TextProcessor) HasFramesetOrRedirect(doc string) (b bool)

func (*TextProcessor) IsOnlyTags

func (self *TextProcessor) IsOnlyTags(doc string) (b bool)

func (*TextProcessor) IsShortLength

func (self *TextProcessor) IsShortLength(doc string) (b bool)

func (*TextProcessor) IsZeroLength

func (self *TextProcessor) IsZeroLength(doc string) (b bool)

func (*TextProcessor) ParseAmazons

func (self *TextProcessor) ParseAmazons(doc string) (s []string)

func (*TextProcessor) ParseBlock

func (self *TextProcessor) ParseBlock(doc string) (s []string)

func (*TextProcessor) ParseBodyHTML

func (self *TextProcessor) ParseBodyHTML(doc string) (s string)

func (*TextProcessor) ParseGoogleAdsSectionTargetHTML

func (self *TextProcessor) ParseGoogleAdsSectionTargetHTML(doc string) (s string)

func (*TextProcessor) ParseHeadHTML

func (self *TextProcessor) ParseHeadHTML(doc string) (s string)

func (*TextProcessor) ParsePunctuations

func (self *TextProcessor) ParsePunctuations(doc string) (s []string)

func (*TextProcessor) ParseTitle

func (self *TextProcessor) ParseTitle(doc string) (s string)

func (*TextProcessor) ParseWasteExpressions

func (self *TextProcessor) ParseWasteExpressions(doc string) (s []string)

func (*TextProcessor) ReplaceHTag

func (self *TextProcessor) ReplaceHTag(doc, title string) (s string)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL