extract

package

v0.25.0 Latest Latest Go to latest Published: Feb 19, 2024 License: Apache-2.0 Imports: 13 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/suosi-inc/go-pkg-spider

Links

Open Source Insights

Documentation ¶

Overview ¶

Package extract 新闻要素抽取, 在 CEPF 算法基础上做了大量的优化。 Refer to: 《基于标签路径特征融合新闻内容抽取的 CEPF 算法》 (吴共庆等), http://www.jos.org.cn/jos/article/abstract/4868

Index ¶

Constants
Variables
func DomainTop(d string) string
func DomainTopFromUrl(urlStr string) string
func Icp(doc *goquery.Document) (string, string)
func IcpFromText(text string) (string, string)
func LinkIsContentByRegex(linkUrl *url.URL, rules LinkTypeRule) bool
func MetaFromHost(host string, lang string) (string, string, string)
func WebContentTitleClean(title string, lang string) string
func WebDescription(doc *goquery.Document, maxLength int) string
func WebKeywords(doc *goquery.Document) string
func WebLinkTitles(doc *goquery.Document, baseUrl *url.URL, strictDomain bool) (map[string]string, map[string]string)
func WebTitle(doc *goquery.Document, maxLength int) string
func WebTitleClean(title string, lang string) string
type Content
- func NewContent(docOrg *goquery.Document, lang string, originTitle string, originUrl string) *Content
- func (c *Content) Debug()
- func (c *Content) ExtractNews() *News
type Domain
- func DomainParse(domain string) (*Domain, error)
- func DomainParseFromUrl(urlStr string) (*Domain, error)
type LinkRes
- func LinkTypes(linkTitles map[string]string, lang string, rules LinkTypeRule) (*LinkRes, map[string]bool)
type LinkType
- func LinkIsContentByTitle(linkUrl *url.URL, title string, lang string) LinkType
type LinkTypeRule
type News

Constants ¶

View Source

// Constants used while extracting news elements: the tags listed for removal,
// the publish-date/time regular expressions, and the title-similarity
// thresholds. Several literals are elided by the documentation renderer
// ("N-byte string literal not displayed"); their real values live in the
// package source.
const (
	// ContentRemoveTags is a comma-separated list of HTML tag names.
	// NOTE(review): presumably these tags are stripped from the document
	// before content extraction — confirm against the caller.
	ContentRemoveTags = "script,noscript,style,iframe,br,link,svg,textarea"

	// RegexPublishDate is the regex for a complete publish time.
	RegexPublishDate = "" /* 242-byte string literal not displayed */

	// RegexPublishShortDate is the regex for a publish time with an abbreviated year, e.g. 22-09-02 11:11:11
	RegexPublishShortDate = "" /* 254-byte string literal not displayed */

	// RegexPublishDateNoYear is the regex for a publish time without a year (low priority), e.g. 09-02
	RegexPublishDateNoYear = "" /* 174-byte string literal not displayed */

	// RegexEnPublishDate1 is English-format regex 1, e.g. 02 Sep 2022 11:40:53 pm
	RegexEnPublishDate1 = "" /* 335-byte string literal not displayed */

	// RegexEnPublishDate2 is English-format regex 2, e.g. Sep 02 2022 11:40:53 pm
	RegexEnPublishDate2 = "" /* 335-byte string literal not displayed */

	// RegexEnUsPublishDate is English (US-style) regex 3, e.g. 8/30/2022 11:11:11
	RegexEnUsPublishDate = "" /* 180-byte string literal not displayed */

	// RegexTime is the time-only regex. It accepts ":" as well as the
	// Chinese hour/minute/second markers as separators; seconds are optional.
	RegexTime = "([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?"

	// RegexZhPublishPrefix is the Chinese publish-time prefix: an optional
	// verb (publish/create/issue/post/edit) followed by "time" or "date".
	RegexZhPublishPrefix = "(?i)(发布|创建|出版|发表|编辑)?(时间|日期)"

	// RegexZhPublishDate is the fixed Chinese format, e.g. "发布时间: xxx".
	// It is built from RegexZhPublishPrefix, then 1 to 8 punctuation or
	// space characters, then RegexPublishShortDate.
	RegexZhPublishDate = RegexZhPublishPrefix + "[\\pP ]{1,8}" + RegexPublishShortDate

	// RegexScriptTitle matches the title inside a script: a quoted "title"
	// key followed by a quoted value, which is captured.
	RegexScriptTitle = `(?i)"title"[\t ]{0,4}:[\t ]{0,4}"(.*)"`

	// RegexScriptTime matches the publish time inside a script.
	RegexScriptTime = `` /* 277-byte string literal not displayed */

	// RegexWxScriptTime matches the publish time inside a WeChat script:
	// ct = "<10-digit number starting with 12..19>".
	// NOTE(review): the captured number looks like a Unix timestamp in seconds — confirm.
	RegexWxScriptTime = `(?i)ct[\t ]{0,4}=[\t ]{0,4}"(1[2-9]\d{8})"`

	// RegexContentUrlPublishDate matches a date hidden in a content-page URL;
	// it must be a fully specified, standard date such as 20221003
	// (four-digit year 202x/203x, two-digit month, two-digit day, optional "/" separators).
	RegexContentUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2])[/]?(0[1-9]|[1-2][0-9]|3[0-1]))`

	// RegexFormatTime3 matches a malformed time format (three trailing digits), used for filtering.
	RegexFormatTime3 = `[:分]\d{3}$`

	// RegexFormatTime4 matches a malformed time format (four trailing digits), used for filtering.
	RegexFormatTime4 = `[:分]\d{4}$`

	// RegexZone matches a malformed time-zone suffix (e.g. +0800 or +08:00 at the end), used for filtering.
	RegexZone = `(([\+-]\d{2})[:]?\d{2})$`

	// TitleSimZh is the similarity threshold for Chinese.
	TitleSimZh = 0.3

	// TitleSimWord is the similarity threshold for words.
	TitleSimWord = 0.5
)

View Source

// Regular-expression sources for website filing (ICP) information. The
// literals are elided by the documentation renderer; see the package source
// for the actual patterns.
// NOTE(review): the "Ga" and "Dx" suffixes presumably denote other kinds of
// filing/licence numbers — confirm against the package source.
const (
	RegexIcp   = `` /* 175-byte string literal not displayed */
	RegexIcpGa = `` /* 167-byte string literal not displayed */
	RegexIcpDx = `` /* 158-byte string literal not displayed */
)

View Source

// Link classification values and the regular expressions declared with them.
const (
	// LinkType values.
	// NOTE(review): presumably these line up with the LinkRes buckets
	// (None = filtered, Content = content page, List = list page,
	// Unknown = unclassified) — confirm against LinkTypes.
	LinkTypeNone    LinkType = 0
	LinkTypeContent LinkType = 1
	LinkTypeList    LinkType = 2
	LinkTypeUnknown LinkType = 3

	// RegexUrlPublishDate matches a date inside a URL: a 202x/203x year,
	// a month with one or two digits, and an optional day with one or two
	// digits, with optional "/" separators.
	RegexUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2]|[1-9])[/]?(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])?)`

	// RegexIndexSuffix matches a path that is exactly "/index.<ext>" for
	// the listed page extensions.
	RegexIndexSuffix = `^/index\.(html|shtml|htm|php|asp|aspx|jsp)$`

	// RegexTitleZhBlack matches Chinese titles that name a permit/licence
	// ("...许可证").
	// NOTE(review): presumably a blacklist for link titles — confirm against the caller.
	RegexTitleZhBlack = "(经营|制作|信息服务|出版|出版服务|演出|视听节目|新闻|视听|新网)许可证"
)

View Source

const (
	// RegexHostnameIp matches four dot-separated groups of one to three
	// digits (an IPv4-looking string). The pattern is unanchored and does
	// not range-check the individual groups.
	RegexHostnameIp = `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`
)

Variables ¶

View Source

// Compiled forms of RegexIcp, RegexIcpGa and RegexIcpDx. They are built with
// regexp.MustCompile at package initialization, which panics if a pattern is
// invalid.
var (
	RegexIcpPattern   = regexp.MustCompile(RegexIcp)
	RegexIcpGaPattern = regexp.MustCompile(RegexIcpGa)
	RegexIcpDxPattern = regexp.MustCompile(RegexIcpDx)
)

View Source

// HostGovCountryMap maps a two-letter country-code domain label to the
// Chinese name of the corresponding country.
//
// The keys are country-code top-level-domain labels ("uk" is used for the
// United Kingdom). "hk", "tw" and "mo" all map to "中国".
//
// NOTE(review): judging by the name, this is presumably consulted for
// government hosts (e.g. *.gov.<cc>) — confirm against MetaFromHost.
var HostGovCountryMap = map[string]string{
	"hk": "中国",
	"tw": "中国",
	"mo": "中国",
	"jp": "日本",
	"kr": "韩国",
	"in": "印度",
	"uk": "英国",
	"us": "美国",
	"it": "意大利",
	"es": "西班牙",
	"ru": "俄罗斯",
	"de": "德国",
	"fr": "法国",
	"th": "泰国",
	"vn": "越南",
	"sg": "新加坡",
	"au": "澳大利亚",
	"ca": "加拿大",
	"il": "以色列",
	"mm": "缅甸",
	"dz": "阿尔及利亚",
	"pl": "波兰",
	// South Africa's country code is "za"; it was previously keyed as "az",
	// which is Azerbaijan's code.
	"za": "南非",
	"az": "阿塞拜疆",
	"ng": "尼日利亚",
	"kp": "朝鲜",
	"lb": "黎巴嫩",
	"ua": "乌克兰",
	"tr": "土耳其",
	"se": "瑞典",
	"lk": "斯里兰卡",
	"si": "斯洛文尼亚",
	"sk": "斯洛伐克",
	"ro": "罗马尼亚",
	"pt": "葡萄牙",
	"ph": "菲律宾",
	"pk": "巴基斯坦",
	"py": "巴拉圭",
	"np": "尼泊尔",
	"ma": "摩洛哥",
	"my": "马来西亚",
	"lt": "立陶宛",
	"ie": "爱尔兰",
	"iq": "伊拉克",
	"ir": "伊朗",
	"id": "印度尼西亚",
	"hu": "匈牙利",
	"gr": "希腊",
	"eg": "埃及",
	"cz": "捷克",
	"hr": "克罗地亚",
	"co": "哥伦比亚",
	"cl": "智利",
	"br": "巴西",
	"bg": "保加利亚",
	"be": "比利时",
	"bd": "孟加拉国",
	"aw": "阿鲁巴",
	"am": "亚美尼亚",
	"ai": "安圭拉",
	"ao": "安哥拉",
	"al": "阿尔巴尼亚",
	"af": "阿富汗",
	"sa": "沙特阿拉伯",
	"nl": "荷兰",
}

View Source

var (
	// ProvinceShortMap maps a single-character Chinese region abbreviation
	// to the region's name. Several regions have two abbreviations that map
	// to the same name (陕/秦, 陇/甘, 川/蜀, 黔/贵, 滇/云). The values for
	// 港, 澳 and 台 carry a "中国" prefix.
	// NOTE(review): presumably used to resolve the province from the
	// leading character of an ICP filing number — confirm against the caller.
	ProvinceShortMap = map[string]string{
		"京": "北京",
		"津": "天津",
		"沪": "上海",
		"渝": "重庆",
		"黑": "黑龙江",
		"吉": "吉林",
		"辽": "辽宁",
		"冀": "河北",
		"豫": "河南",
		"鲁": "山东",
		"晋": "山西",
		"陕": "陕西",
		"秦": "陕西",
		"蒙": "内蒙古",
		"宁": "宁夏",
		"陇": "甘肃",
		"甘": "甘肃",
		"新": "新疆",
		"青": "青海",
		"藏": "西藏",
		"鄂": "湖北",
		"皖": "安徽",
		"苏": "江苏",
		"浙": "浙江",
		"闽": "福建",
		"湘": "湖南",
		"赣": "江西",
		"川": "四川",
		"蜀": "四川",
		"黔": "贵州",
		"贵": "贵州",
		"滇": "云南",
		"云": "云南",
		"粤": "广东",
		"桂": "广西",
		"琼": "海南",
		"港": "中国香港",
		"澳": "中国澳门",
		"台": "中国台湾",
	}
)

View Source

var (
	// RegexHostnameIpPattern is the compiled form of RegexHostnameIp.
	// regexp.MustCompile panics at package initialization if the pattern
	// is invalid.
	RegexHostnameIpPattern = regexp.MustCompile(RegexHostnameIp)
)

Functions ¶

func DomainTop ¶

func DomainTop(d string) string

DomainTop 返回顶级域名

func DomainTopFromUrl ¶

func DomainTopFromUrl(urlStr string) string

DomainTopFromUrl 解析 URL 返回顶级域名

func Icp ¶

func Icp(doc *goquery.Document) (string, string)

Icp 返回网站备案相关的信息

func IcpFromText ¶

func IcpFromText(text string) (string, string)

IcpFromText 提取文本中备案相关的信息

func LinkIsContentByRegex ¶

func LinkIsContentByRegex(linkUrl *url.URL, rules LinkTypeRule) bool

func MetaFromHost ¶

func MetaFromHost(host string, lang string) (string, string, string)

MetaFromHost 根据域名尽可能返回一些固定信息

func WebContentTitleClean ¶ added in v0.4.0

func WebContentTitleClean(title string, lang string) string

WebContentTitleClean 返回内容页尽量清洗后的网页标题

func WebDescription ¶

func WebDescription(doc *goquery.Document, maxLength int) string

WebDescription 返回网页描述, 最大 384 个字符

func WebKeywords ¶

func WebKeywords(doc *goquery.Document) string

WebKeywords 返回网页 Keyword

func WebLinkTitles ¶

func WebLinkTitles(doc *goquery.Document, baseUrl *url.URL, strictDomain bool) (map[string]string, map[string]string)

WebLinkTitles 返回网页链接和锚文本

func WebTitle ¶

func WebTitle(doc *goquery.Document, maxLength int) string

WebTitle 返回网页标题, 最大 128 个字符

func WebTitleClean ¶

func WebTitleClean(title string, lang string) string

WebTitleClean 返回尽量清洗后的网页标题

Types ¶

type Content ¶

// Content holds the documents and page metadata used for extraction; it is
// created by NewContent and exposes ExtractNews and Debug.
type Content struct {
	// Original Doc
	OriginDoc *goquery.Document
	// Doc
	Doc *goquery.Document
	// Original title, taken from the parent (referring) page
	OriginTitle string
	// Original URL, taken from the parent (referring) page
	OriginUrl string
	// Language
	Lang string
	// contains filtered or unexported fields
}

func NewContent ¶

func NewContent(docOrg *goquery.Document, lang string, originTitle string, originUrl string) *Content

func (*Content) Debug ¶ added in v0.4.0

func (c *Content) Debug()

func (*Content) ExtractNews ¶ added in v0.4.0

func (c *Content) ExtractNews() *News

type Domain ¶

// Domain is the result of parsing a domain name, as returned by DomainParse
// and DomainParseFromUrl.
// NOTE(review): presumably Subdomain, Domain and TLD are the three parts of
// the host split around its public suffix, and ICANN reports whether that
// suffix is ICANN-managed — confirm against the implementation.
type Domain struct {
	Subdomain, Domain, TLD string
	ICANN                  bool
}

func DomainParse ¶

func DomainParse(domain string) (*Domain, error)

DomainParse 解析域名, 返回 Domain

func DomainParseFromUrl ¶

func DomainParseFromUrl(urlStr string) (*Domain, error)

DomainParseFromUrl 解析 URL 中的域名, 返回 Domain

type LinkRes ¶

// LinkRes groups classified links into four buckets; it is the result type
// returned by LinkTypes.
// NOTE(review): each map is presumably keyed by link URL with the link title
// as value, mirroring the linkTitles argument of LinkTypes — confirm.
type LinkRes struct {
	// Content pages
	Content map[string]string
	// List pages
	List map[string]string
	// Unknown links
	Unknown map[string]string
	// Filtered links
	None map[string]string
}

func LinkTypes ¶

func LinkTypes(linkTitles map[string]string, lang string, rules LinkTypeRule) (*LinkRes, map[string]bool)

LinkTypes 返回链接分类结果

type LinkType ¶

// LinkType is the integer type of the link classification constants
// LinkTypeNone, LinkTypeContent, LinkTypeList and LinkTypeUnknown.
type LinkType int

func LinkIsContentByTitle ¶ added in v0.2.0

func LinkIsContentByTitle(linkUrl *url.URL, title string, lang string) LinkType

type LinkTypeRule ¶ added in v0.2.0

// LinkTypeRule is a set of link classification rules passed to LinkTypes and
// LinkIsContentByRegex.
// NOTE(review): presumably keyed by host/domain with a list of regular
// expressions as value — confirm against LinkIsContentByRegex.
type LinkTypeRule map[string][]string

type News ¶ added in v0.4.0

// News is the result of news extraction, as returned by (*Content).ExtractNews.
type News struct {
	// Title
	Title string
	// Basis on which the title was extracted
	TitlePos string
	// Publish time
	TimeLocal string
	// Original (raw) time
	Time string
	// Basis on which the publish time was extracted
	TimePos string
	// Body as plain text
	Content string
	// Body Node
	ContentNode *html.Node
	// Extraction time spent (milliseconds)
	Spend int64
	// Language
	Lang string
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL