Documentation ¶
Overview ¶
gec is a Go port of ExtractContent.rb. Original: http://labs.cybozu.co.jp/blog/nakatani/2007/09/web_1.html
Index ¶
- Constants
- func Analyse(doc string, o *Option) (content, title string)
- type Block
- type BlockList
- type BlockProcessor
- type Extractor
- type Option
- type TextProcessor
- func (self *TextProcessor) EliminateLink(doc string) (s string)
- func (self *TextProcessor) EliminateTag(doc string) (s string)
- func (self *TextProcessor) EliminateTags(doc string, separator string) (s string)
- func (self *TextProcessor) EliminateUselessTags(doc string) (s string)
- func (self *TextProcessor) HasFramesetOrRedirect(doc string) (b bool)
- func (self *TextProcessor) IsOnlyTags(doc string) (b bool)
- func (self *TextProcessor) IsShortLength(doc string) (b bool)
- func (self *TextProcessor) IsZeroLength(doc string) (b bool)
- func (self *TextProcessor) ParseAmazons(doc string) (s []string)
- func (self *TextProcessor) ParseBlock(doc string) (s []string)
- func (self *TextProcessor) ParseBodyHTML(doc string) (s string)
- func (self *TextProcessor) ParseGoogleAdsSectionTargetHTML(doc string) (s string)
- func (self *TextProcessor) ParseHeadHTML(doc string) (s string)
- func (self *TextProcessor) ParsePunctuations(doc string) (s []string)
- func (self *TextProcessor) ParseTitle(doc string) (s string)
- func (self *TextProcessor) ParseWasteExpressions(doc string) (s []string)
- func (self *TextProcessor) ReplaceHTag(doc, title string) (s string)
Constants ¶
View Source
const (
	DECAY      = 1.0
	CONTINUOUS = 1.0
)
Variables ¶
This section is empty.
Functions ¶
Types ¶
type BlockList ¶
type BlockList []*Block
func NewBlockList ¶
func NewBlockList() (bl BlockList)
func (BlockList) AppendBlock ¶
type BlockProcessor ¶
type BlockProcessor struct {
// contains filtered or unexported fields
}
func NewBlockProcessor ¶
func NewBlockProcessor(o *Option, tp *TextProcessor) (bp *BlockProcessor)
func (*BlockProcessor) GetMaxScoreContent ¶
func (self *BlockProcessor) GetMaxScoreContent() (c string)
func (*BlockProcessor) Process ¶
func (self *BlockProcessor) Process(doc string)
type Extractor ¶
type Extractor struct {
// contains filtered or unexported fields
}
func NewExtractor ¶
func (*Extractor) ExtractContent ¶
func (*Extractor) ExtractTitle ¶
type Option ¶
type Option struct {
	Threashold        float64        // 本文と見なすスコアの閾値
	MinLength         int            // 評価を行うブロック長の最小値
	DecayFactor       float64        // 減衰係数(小さいほど先頭に近いブロックのスコアが高くなる)
	ContinuousFactor  float64        // 連続ブロック係数(大きいほどブロックを連続と判定しにくくなる)
	NotBodyFactor     float64        // 非Body係数(大きいほどブロックのスコアが高くなる)
	PunctuationWeight int            // 句読点に対するスコア
	Punctuations      *regexp.Regexp // 句読点
	WasteExpressions  *regexp.Regexp // フッタに含まれる特徴的なキーワード
	DomSeparator      string         // DOM間に挿入する文字列
	Debug             bool           // ブロック情報を出力
}
func (*Option) Initialize ¶
func (self *Option) Initialize()
type TextProcessor ¶
type TextProcessor struct {
// contains filtered or unexported fields
}
func NewTextProcessor ¶
func NewTextProcessor(o *Option) (tp *TextProcessor)
func (*TextProcessor) EliminateLink ¶
func (self *TextProcessor) EliminateLink(doc string) (s string)
func (*TextProcessor) EliminateTag ¶
func (self *TextProcessor) EliminateTag(doc string) (s string)
func (*TextProcessor) EliminateTags ¶
func (self *TextProcessor) EliminateTags(doc string, separator string) (s string)
func (*TextProcessor) EliminateUselessTags ¶
func (self *TextProcessor) EliminateUselessTags(doc string) (s string)
func (*TextProcessor) HasFramesetOrRedirect ¶
func (self *TextProcessor) HasFramesetOrRedirect(doc string) (b bool)
func (*TextProcessor) IsOnlyTags ¶
func (self *TextProcessor) IsOnlyTags(doc string) (b bool)
func (*TextProcessor) IsShortLength ¶
func (self *TextProcessor) IsShortLength(doc string) (b bool)
func (*TextProcessor) IsZeroLength ¶
func (self *TextProcessor) IsZeroLength(doc string) (b bool)
func (*TextProcessor) ParseAmazons ¶
func (self *TextProcessor) ParseAmazons(doc string) (s []string)
func (*TextProcessor) ParseBlock ¶
func (self *TextProcessor) ParseBlock(doc string) (s []string)
func (*TextProcessor) ParseBodyHTML ¶
func (self *TextProcessor) ParseBodyHTML(doc string) (s string)
func (*TextProcessor) ParseGoogleAdsSectionTargetHTML ¶
func (self *TextProcessor) ParseGoogleAdsSectionTargetHTML(doc string) (s string)
func (*TextProcessor) ParseHeadHTML ¶
func (self *TextProcessor) ParseHeadHTML(doc string) (s string)
func (*TextProcessor) ParsePunctuations ¶
func (self *TextProcessor) ParsePunctuations(doc string) (s []string)
func (*TextProcessor) ParseTitle ¶
func (self *TextProcessor) ParseTitle(doc string) (s string)
func (*TextProcessor) ParseWasteExpressions ¶
func (self *TextProcessor) ParseWasteExpressions(doc string) (s []string)
func (*TextProcessor) ReplaceHTag ¶
func (self *TextProcessor) ReplaceHTag(doc, title string) (s string)
Click to show internal directories.
Click to hide internal directories.