gopiper

package module
v1.1.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 6, 2023 License: MIT Imports: 16 Imported by: 21

README

gopiper

[TOC]

介绍

gopiper 提供一种通过配置规则来提取网页源码(源码类型可以为 html/json/text)的方式,提取结果为可 JSON 序列化的数据。

比如豆瓣电影的一个网页[https://movie.douban.com/subject/26580232/]

可以将页面提取成一个json对象

{
	"name": "看不见的客人",
	"pic": "https://img3.doubanio.com/view/movie_poster_cover/lpst/public/p2498971355.webp",
	"score": 8.7,
	"director": "奥里奥尔·保罗",
	"actor": ["马里奥·卡萨斯","阿娜·瓦格纳","何塞·科罗纳多","巴巴拉·莱涅","弗兰塞斯克·奥雷利亚"]
}

规则描述

规则被描述成一个可嵌套的JSON结构(包含子结构),json结构如下:

{
	"name": "结果名",
	"selector": "节点选择器",
	"type": "规则类型",
	"filter": "过滤处理函数",
	"subitem": [
	    //子规则嵌套, 仅当规则类型为map或array时有效
	]
}

其Go表示的Struct如下:

// PipeItem is one extraction rule. Rules nest through SubItem, so a single
// PipeItem can describe a whole tree of results that is serialised to JSON.
type PipeItem struct {
	Name     string     `json:"name,omitempty"`       // result name (only meaningful for sub-items of a map rule)
	Selector string     `json:"selector,omitempty"`   // node selector; the examples in this README use CSS-style selectors and "regexp:"-prefixed patterns
	Type     string     `json:"type"`                 // rule type, e.g. "map", "string", "string-array"
	Filter   string     `json:"filter,omitempty"`     // filter / result-processing functions; the examples chain them with "|", e.g. "split(/)|trimspace"
	SubItem  []PipeItem `json:"subitem,omitempty"`    // nested sub-rules
}
规则类型

规则类型主要可分为三种,一种是map类型(结果值为json对象),一种是array类型(结果值为json数组),另外一种为单值类型(字符串、数值等)

map类型
array类型
值类型
选择器
过滤器函数
规则案例

豆瓣电影页面提取规则: http://movie.douban.com/subject/25850640/

{
	"type": "map",
	"selector": "",
	"subitem": [
		{
			"type": "string",
			"selector": "title",
			"name": "name",
			"filter": "trimspace|replace((豆瓣))|trim( )"
		},
		{
			"type": "string",
			"selector": "#content .gtleft a.bn-sharing@attr[data-type]",
			"name": "fenlei"
		},
		{
			"type": "string",
			"selector": "#content .gtleft a.bn-sharing@attr[data-pic]",
			"name": "thumbnail"
		},
		{
			"type": "string-array",
			"selector": "#info span.attrs a[rel=v\\:directedBy]",
			"name": "direct"
		},
		{
			"type": "string-array",
			"selector": "#info span a[rel=v\\:starring]",
			"name": "starring"
		},
		{
			"type": "string-array",
			"selector": "#info span[property=v\\:genre]",
			"name": "type"
		},
		{
			"type": "string-array",
			"selector": "#related-pic .related-pic-bd a:not(.related-pic-video) img@attr[src]",
			"name": "imgs",
			"filter": "join($)|replace(albumicon,photo)|split($)"
		},
		{
			"type": "string-array",
			"selector": "#info span[property=v\\:initialReleaseDate]",
			"name": "releasetime"
		},
		{
			"type": "string",
			"selector": "regexp:<span class=\"pl\">单集片长:</span> ([\\w\\W]+?)<br/>",
			"name": "longtime"
		},
		{
			"type": "string",
			"selector": "regexp:<span class=\"pl\">制片国家/地区:</span> ([\\w\\W]+?)<br/>",
			"name": "country",
			"filter": "split(/)|trimspace"
		},
		{
			"type": "string",
			"selector": "regexp:<span class=\"pl\">语言:</span> ([\\w\\W]+?)<br/>",
			"name": "language",
			"filter": "split(/)|trimspace"
		},
		{
			"type": "int",
			"selector": "regexp:<span class=\"pl\">集数:</span> (\\d+)<br/>",
			"name": "episode"
		},
		{
			"type": "string",
			"selector": "regexp:<span class=\"pl\">又名:</span> ([\\w\\W]+?)<br/>",
			"name": "alias",
			"filter": "split(/)|trimspace"
		},
		{
			"type": "string",
			"selector": "#link-report span.hidden, #link-report span[property=v\\:summary]|last",
			"name": "brief",
			"filter": "trimspace|split(\n)|trimspace|wraphtml(p)|join"
		},
		{
			"type": "float",
			"selector": "#interest_sectl .rating_num",
			"name": "score"
		},
		{
			"type": "string",
			"selector": "#content h1 span.year",
			"name": "year",
			"filter": "replace(()|replace())|intval"
		},
		{
			"type": "string",
			"selector": "#comments-section > .mod-hd h2 a",
			"name": "comment",
			"filter": "replace(全部)|replace(条)|trimspace|intval"
		}
	]
}

用法

Documentation

Index

Constants

View Source
// String constants used by the package: rule-type names (PT_*), page-type
// names (PAGE_*) and selector prefixes (REGEXP*_PRE).
const (
	// Rule types of the new version. The README rule examples use several of
	// these values in a rule's "type" field ("map", "string", "string-array",
	// "int", "float"); presumably the rest are used the same way.
	PT_RAW          = "raw"
	PT_INT          = "int"
	PT_FLOAT        = "float"
	PT_BOOL         = "bool"
	PT_STRING       = "string"
	PT_INT_ARRAY    = "int-array"
	PT_FLOAT_ARRAY  = "float-array"
	PT_BOOL_ARRAY   = "bool-array"
	PT_STRING_ARRAY = "string-array"
	PT_HTML_ARRAY   = "html-array"
	PT_MAP          = "map"
	PT_ARRAY        = "array"
	PT_JSON_VALUE   = "json"
	PT_JSON_PARSE   = "jsonparse"

	// Rule types kept for compatibility with the old version.
	PT_TEXT       = "text"
	PT_HREF       = "href"
	PT_HTML       = "html"
	// NOTE(review): PT_ATTR and PT_ATTR_ARRAY look like regular-expression
	// patterns (raw strings with one capture group) rather than literal type
	// names — confirm how they are matched against the rule type.
	PT_ATTR       = `attr\[([\w\W]+)\]`
	PT_ATTR_ARRAY = `attr-array\[([\w\W]+)\]`
	PT_IMG_SRC    = "src"
	PT_IMG_ALT    = "alt"
	PT_TEXT_ARRAY = "text-array"
	PT_HREF_ARRAY = "href-array"
	PT_OUT_HTML   = "outhtml"

	// Page (source) types — presumably the accepted values for the pageType
	// argument of PipeItem.PipeBytes; TODO confirm.
	PAGE_JSON = "json"
	PAGE_HTML = "html"
	PAGE_JS   = "js"
	PAGE_XML  = "xml"
	PAGE_TEXT = "text"

	// Selector prefixes. The README examples write regular-expression
	// selectors as "regexp:<pattern>"; REGEXP2_PRE is presumably a second
	// regexp flavour — its exact semantics are not visible here.
	REGEXP_PRE  = "regexp:"
	REGEXP2_PRE = "regexp2:"
)

Variables

View Source
// Sentinel errors exported by the package.
//
// NOTE(review): the messages start with a capital letter and some identifiers
// and messages contain the spellings "Unknow" / "Unsupport". Both are part of
// the exported API and runtime output, so they are left unchanged here.
var (
	// Rule-definition errors.
	ErrJsonparseNeedSubItem    = errors.New("Pipe type jsonparse need one subItem")
	ErrArrayNeedSubItem        = errors.New("Pipe type array need one subItem")
	ErrNotSupportPipeType      = errors.New("Not support pipe type")
	ErrUnknowHTMLAttr          = errors.New("Unknow html attr")
	// Value-conversion errors.
	ErrUnsupportText2boolType  = errors.New("Unsupport text2bool type")
	ErrUnsupportText2floatType = errors.New("Unsupport text2float type")
	ErrUnsupportText2intType   = errors.New("Unsupport text2int type")
	// Filter-parameter errors.
	ErrTrimNilParams           = errors.New("Filter trim nil params")
	ErrSplitNilParams          = errors.New("Filter split nil params")
	ErrJoinNilParams           = errors.New("Filter join nil params")
	// Missing-callback and content errors.
	ErrFetcherNotRegistered    = errors.New("Fetcher not registered")
	ErrStorerNotRegistered     = errors.New("Storer not registered")
	ErrInvalidContent          = errors.New("Invalid content")
)

Functions

func AllFilter

func AllFilter() map[string]*Filter

func RegisterFilter

func RegisterFilter(name string, fn FilterFunction, description, usage, example string)

func ReplaceFilter

func ReplaceFilter(name string, fn FilterFunction, description, usage, example string)

func SplitParams

func SplitParams(params string, separators ...string) []string

func VerifySelector

func VerifySelector(selector string) (err error)

VerifySelector 验证正则表达式

Types

type Fether added in v1.0.1

// Fether is a callback type that takes a page URL and returns the page body
// and an error.
// NOTE(review): the name looks like a misspelling of "Fetcher" (compare
// PipeItem.Fetcher / SetFetcher); it is exported, so it is kept as is.
type Fether func(pageURL string) (body []byte, err error)

type Filter

// Filter describes a named filter together with optional human-readable
// documentation. Description, Usage and Example are omitted from the JSON
// encoding when empty.
type Filter struct {
	Name string

	Description string `json:",omitempty"`
	Usage       string `json:",omitempty"`
	Example     string `json:",omitempty"`
	// contains filtered or unexported fields
}

func NewFilter

func NewFilter(name string, fn FilterFunction, description, usage, example string) *Filter

type FilterFunction

// FilterFunction is the signature of a filter implementation: it receives the
// pipe item, a source value and a params string, and returns a result value
// and an error.
// NOTE(review): params is presumably the text inside the parentheses of a
// filter expression such as "replace(albumicon,photo)" — confirm.
type FilterFunction func(pipe *PipeItem, src interface{}, params string) (interface{}, error)

type PipeItem

// PipeItem is one extraction rule; rules nest through SubItem.
type PipeItem struct {
	Name     string     `json:"name,omitempty"` // only used when the type is map
	Selector string     `json:"selector,omitempty"`
	Type     string     `json:"type"`
	Filter   string     `json:"filter,omitempty"`
	SubItem  []PipeItem `json:"subitem,omitempty"`
	// contains filtered or unexported fields
}

func (*PipeItem) CallFilter added in v1.0.1

func (p *PipeItem) CallFilter(src interface{}, filters string) (interface{}, error)

func (*PipeItem) CopyFrom added in v1.0.1

func (p *PipeItem) CopyFrom(from *PipeItem)

func (*PipeItem) Fetcher added in v1.0.1

func (p *PipeItem) Fetcher() Fether

func (*PipeItem) PipeBytes

func (p *PipeItem) PipeBytes(body []byte, pageType string) (interface{}, error)

func (*PipeItem) SetFetcher added in v1.0.1

func (p *PipeItem) SetFetcher(fetcher Fether)

func (*PipeItem) SetStorer added in v1.0.1

func (p *PipeItem) SetStorer(storer Storer)

func (*PipeItem) Storer added in v1.0.1

func (p *PipeItem) Storer() Storer

type Storer added in v1.0.1

// Storer is a callback type that takes a file URL, a save path and a fetched
// flag, and returns a new path and an error.
// NOTE(review): the exact meaning of fetched is not visible here — confirm
// against the implementation.
type Storer func(fileURL, savePath string, fetched bool) (newPath string, err error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL