keywords

package
v1.1.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 5, 2023 License: MIT Imports: 6 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// ZhColon is the full-width (Chinese) colon, U+FF1A. It is written as an
	// escape so that text normalization cannot fold it into the ASCII colon,
	// which would make it indistinguishable from EnColon.
	ZhColon = '\uFF1A'
	// EnColon is the ASCII colon, U+003A.
	EnColon = ':'

	// Characters that end extraction.
	//
	// NOTE(review): Sep1 and Sep3 are ASCII punctuation here while Sep2 and
	// Sep4 are CJK punctuation; confirm whether the full-width comma (U+FF0C)
	// and semicolon (U+FF1B) were intended instead.
	ENSP  = '\u2002'
	EMSP  = '\u2003'
	NBSP  = '\xa0'
	Space = '\x20'
	Enter = '\n'
	Table = '\t'
	Sep1  = ','
	Sep2  = '。'
	Sep3  = ';'
	Sep4  = '】'
)
View Source
// KeyP is a regular-expression fragment that matches zero or more characters,
// each being either a whitespace character (\s) or the code point 0xA0
// (decimal 160, the no-break space).
// NOTE(review): presumably spliced between keyword parts when building the
// matching pattern — confirm against the code that uses it.
const KeyP = `(\s|\xa0)*`

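KeyP is the regex fragment used to handle spaces between keywords: it matches any run of whitespace (e.g. the space, code point 32) or no-break spaces (code point 160).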

Variables

View Source
// ArrBlock is a list of HTML tag names.
// NOTE(review): presumably the tags treated as block boundaries while
// parsing HTML — confirm against the code that reads it.
var ArrBlock = []string{
	// Headings.
	"h1",
	"h2",
	"h3",
	"h4",
	"h5",
	// Paragraph.
	"p",
	// Line break and bold.
	"br",
	"b",
	// Generic container.
	"div",
	// List containers and list items.
	"ul",
	"li",
	"ol",
	"dl",
	"dt",
	"dd",
}
View Source
// DefaultFilter cleans up a single extracted item.
//
// It returns "" when item is longer than 60 runes or contains ZhColon.
// Otherwise it cuts item just before the first occurrence of Sep1, Sep2, Sep3
// or Sep4 and returns the leading part. When no separator is present, or the
// first separator sits at byte offset 0, item is returned unchanged.
var DefaultFilter = func(item string) string {
	// Rejections: over-long items and items that contain ZhColon.
	if utf8.RuneCountInString(item) > 60 || strings.ContainsRune(item, ZhColon) {
		return ""
	}

	isSeparator := func(r rune) bool {
		return r == Sep1 || r == Sep2 || r == Sep3 || r == Sep4
	}

	// cut is a byte offset; only a strictly positive offset truncates, so a
	// leading separator does not empty the item.
	if cut := strings.IndexFunc(item, isSeparator); cut > 0 {
		return item[:cut]
	}
	return item
}

Functions

func ParseHtml

// ParseHtml consumes r and returns a string together with an error.
// NOTE(review): the body is not visible here; presumably r carries an HTML
// document and the returned string is the text extracted from it — confirm.
func ParseHtml(r io.Reader) (string, error)

Types

type Extractor

// Extractor is the interface returned by NewExtractor.
type Extractor interface {
	// Computing methods.
	ExtractKeywordsFromHtml(html string) error
	Clear()
	// Side-effect-free methods.
	GetResult(filter bool) []ResultRow
	GetContent() string
	GetSubject() string
	Filter(item string) string               // returns the value after filtering
	GetItemsByWeight(filter bool) *ResultRow // returns the first non-empty result set, following the keyword list order
}

A single Extractor instance is not thread-safe.

func NewExtractor

// NewExtractor returns an Extractor built from keys and any number of
// ExtractorOptionFunc options.
// NOTE(review): the body is not visible here; presumably keys is the keyword
// list to search for and the options are applied to the new instance in
// order — confirm.
func NewExtractor(keys []string, ops ...ExtractorOptionFunc) Extractor

arrKeys init

type ExtractorOptionFunc added in v1.0.3

// ExtractorOptionFunc is a function that receives a pointer to the unexported
// extract type. NewExtractor accepts a variadic list of these.
type ExtractorOptionFunc func(o *extract)

func WithFilter added in v1.0.3

// WithFilter returns an ExtractorOptionFunc built from fn.
// NOTE(review): the body is not visible here; presumably the option installs
// fn as the extractor's item filter — confirm.
func WithFilter(fn func(string) string) ExtractorOptionFunc

func WithSubject added in v1.0.3

// WithSubject returns an ExtractorOptionFunc built from s.
// NOTE(review): the body is not visible here; presumably the option sets the
// extractor's subject to s — confirm.
func WithSubject(s string) ExtractorOptionFunc

type ResultRow added in v1.1.1

// ResultRow pairs a string key with a list of string values. It is the
// element type returned by Extractor.GetResult and the pointee returned by
// Extractor.GetItemsByWeight.
// NOTE(review): presumably Key is a keyword and Val holds the values
// extracted for it — confirm.
type ResultRow struct {
	Key string
	Val []string
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL