extract

package
v0.25.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 19, 2024 License: Apache-2.0 Imports: 13 Imported by: 0

Documentation

Overview

Package extract 新闻要素抽取, 在 CEPF 算法基础上做了大量的优化。 Refer to: 基于标签路径特征融合新闻内容抽取的 CEPF 算法 (吴共庆等), http://www.jos.org.cn/jos/article/abstract/4868

Index

Constants

View Source
// Pattern sources and thresholds used for publish-time and title extraction.
// Several literals are elided by the documentation renderer
// ("string literal not displayed"); their exact text is not visible here.
const (
	// ContentRemoveTags is a comma-separated list of HTML tag names.
	// NOTE(review): judging by the name, presumably these tags are stripped
	// before content extraction — confirm against the caller.
	ContentRemoveTags = "script,noscript,style,iframe,br,link,svg,textarea"

	// RegexPublishDate is the regex for a complete publish date/time.
	RegexPublishDate = "" /* 242-byte string literal not displayed */

	// RegexPublishShortDate is the publish-time regex for an abbreviated year,
	// e.g. 22-09-02 11:11:11.
	RegexPublishShortDate = "" /* 254-byte string literal not displayed */

	// RegexPublishDateNoYear is the publish-time regex without a year
	// (lower priority), e.g. 09-02.
	RegexPublishDateNoYear = "" /* 174-byte string literal not displayed */

	// RegexEnPublishDate1 is English-format regex 1, e.g. 02 Sep 2022 11:40:53 pm.
	RegexEnPublishDate1 = "" /* 335-byte string literal not displayed */

	// RegexEnPublishDate2 is English-format regex 2, e.g. Sep 02 2022 11:40:53 pm.
	RegexEnPublishDate2 = "" /* 335-byte string literal not displayed */

	// RegexEnUsPublishDate is the US-English-format regex (regex 3),
	// e.g. 8/30/2022 11:11:11.
	RegexEnUsPublishDate = "" /* 180-byte string literal not displayed */

	// RegexTime matches a time of day only: an hour, a separator (':', 点 or 时),
	// minutes, an optional ':' or 分, and optional seconds with an optional 秒.
	RegexTime = "([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?"

	// RegexZhPublishPrefix is the Chinese publish-time prefix: an optional
	// 发布/创建/出版/发表/编辑 followed by 时间 or 日期.
	RegexZhPublishPrefix = "(?i)(发布|创建|出版|发表|编辑)?(时间|日期)"

	// RegexZhPublishDate is the fixed Chinese format, e.g. "发布时间: xxx".
	// It is RegexZhPublishPrefix, then 1 to 8 punctuation or space characters,
	// then RegexPublishShortDate.
	RegexZhPublishDate = RegexZhPublishPrefix + "[\\pP ]{1,8}" + RegexPublishShortDate

	// RegexScriptTitle matches a title inside a script: a case-insensitive
	// "title" key, a colon, and a quoted value that is captured.
	RegexScriptTitle = `(?i)"title"[\t ]{0,4}:[\t ]{0,4}"(.*)"`

	// RegexScriptTime matches a publish time inside a script.
	RegexScriptTime = `` /* 277-byte string literal not displayed */

	// RegexWxScriptTime matches the publish time inside a WeChat script:
	// ct = "<10 digits starting with 12 through 19>", capturing the digits.
	// NOTE(review): the captured value is presumably a Unix timestamp in
	// seconds — confirm.
	RegexWxScriptTime = `(?i)ct[\t ]{0,4}=[\t ]{0,4}"(1[2-9]\d{8})"`

	// RegexContentUrlPublishDate matches a date hidden in a content-page URL.
	// It must be a complete, standard date such as 20221003: a year 2020-2039,
	// a two-digit month and a two-digit day, with optional '/' separators.
	RegexContentUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2])[/]?(0[1-9]|[1-2][0-9]|3[0-1]))`

	// RegexFormatTime3 matches a malformed time format, used for filtering:
	// ':' or 分 followed by three digits at the end of the string.
	RegexFormatTime3 = `[:分]\d{3}$`

	// RegexFormatTime4 matches a malformed time format, used for filtering:
	// ':' or 分 followed by four digits at the end of the string.
	RegexFormatTime4 = `[:分]\d{4}$`

	// RegexZone matches a malformed time-zone format, used for filtering:
	// a trailing +hh or -hh offset followed by an optional ':' and two digits.
	RegexZone = `(([\+-]\d{2})[:]?\d{2})$`

	// TitleSimZh is the similarity threshold for Chinese.
	TitleSimZh = 0.3

	// TitleSimWord is the similarity threshold for words.
	TitleSimWord = 0.5
)
View Source
// Pattern sources compiled into RegexIcpPattern, RegexIcpGaPattern and
// RegexIcpDxPattern. The literals are elided by the documentation renderer.
// NOTE(review): presumably these match website filing (ICP) information as
// returned by Icp and IcpFromText; what the Ga and Dx variants cover cannot be
// determined from here — confirm against the source.
const (
	RegexIcp   = `` /* 175-byte string literal not displayed */
	RegexIcpGa = `` /* 167-byte string literal not displayed */
	RegexIcpDx = `` /* 158-byte string literal not displayed */
)
View Source
// LinkType values and pattern sources used for link classification.
const (
	// The LinkType values below share their names with the fields of LinkRes
	// (None, Content, List, Unknown).
	// NOTE(review): presumably each value selects the matching LinkRes
	// bucket — confirm against LinkTypes.
	LinkTypeNone    LinkType = 0
	LinkTypeContent LinkType = 1
	LinkTypeList    LinkType = 2
	LinkTypeUnknown LinkType = 3

	// RegexUrlPublishDate matches a date embedded in a URL: a year 2020-2039,
	// a one- or two-digit month and an optional one- or two-digit day, with
	// optional '/' separators. It is looser than RegexContentUrlPublishDate,
	// which requires two-digit month and day.
	RegexUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2]|[1-9])[/]?(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])?)`

	// RegexIndexSuffix matches a path that is exactly /index followed by one
	// of the extensions html, shtml, htm, php, asp, aspx or jsp.
	RegexIndexSuffix = `^/index\.(html|shtml|htm|php|asp|aspx|jsp)$`

	// RegexTitleZhBlack matches Chinese text ending in 许可证 (licence)
	// preceded by one of the listed licence kinds.
	// NOTE(review): the "Black" in the name suggests a title blacklist —
	// confirm against usage.
	RegexTitleZhBlack = "(经营|制作|信息服务|出版|出版服务|演出|视听节目|新闻|视听|新网)许可证"
)
View Source
// RegexHostnameIp is the pattern source for a dotted quad: four groups of one
// to three digits separated by literal dots. The pattern is unanchored and does
// not range-check the individual groups.
const RegexHostnameIp = `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`

Variables

View Source
// Compiled forms of the RegexIcp, RegexIcpGa and RegexIcpDx pattern sources.
// regexp.MustCompile panics during package initialization if a pattern fails
// to compile.
var (
	RegexIcpPattern   = regexp.MustCompile(RegexIcp)
	RegexIcpGaPattern = regexp.MustCompile(RegexIcpGa)
	RegexIcpDxPattern = regexp.MustCompile(RegexIcpDx)
)
View Source
// HostGovCountryMap maps a two-letter country-code domain label to the Chinese
// name of the corresponding country. The hk, tw and mo labels all map to 中国.
//
// NOTE(review): judging by the name, presumably this is consulted for
// government hostnames (e.g. gov.xx) — confirm against MetaFromHost.
var HostGovCountryMap = map[string]string{
	"hk": "中国",
	"tw": "中国",
	"mo": "中国",
	"jp": "日本",
	"kr": "韩国",
	"in": "印度",
	"uk": "英国",
	"us": "美国",
	"it": "意大利",
	"es": "西班牙",
	"ru": "俄罗斯",
	"de": "德国",
	"fr": "法国",
	"th": "泰国",
	"vn": "越南",
	"sg": "新加坡",
	"au": "澳大利亚",
	"ca": "加拿大",
	"il": "以色列",
	"mm": "缅甸",
	"dz": "阿尔及利亚",
	"pl": "波兰",
	// South Africa's country code is "za". The previous "az" key belongs to
	// Azerbaijan, so both entries are listed with their correct countries.
	"za": "南非",
	"az": "阿塞拜疆",
	"ng": "尼日利亚",
	"kp": "朝鲜",
	"lb": "黎巴嫩",
	"ua": "乌克兰",
	"tr": "土耳其",
	"se": "瑞典",
	"lk": "斯里兰卡",
	"si": "斯洛文尼亚",
	"sk": "斯洛伐克",
	"ro": "罗马尼亚",
	"pt": "葡萄牙",
	"ph": "菲律宾",
	"pk": "巴基斯坦",
	"py": "巴拉圭",
	"np": "尼泊尔",
	"ma": "摩洛哥",
	"my": "马来西亚",
	"lt": "立陶宛",
	"ie": "爱尔兰",
	"iq": "伊拉克",
	"ir": "伊朗",
	"id": "印度尼西亚",
	"hu": "匈牙利",
	"gr": "希腊",
	"eg": "埃及",
	"cz": "捷克",
	"hr": "克罗地亚",
	"co": "哥伦比亚",
	"cl": "智利",
	"br": "巴西",
	"bg": "保加利亚",
	"be": "比利时",
	"bd": "孟加拉国",
	"aw": "阿鲁巴",
	"am": "亚美尼亚",
	"ai": "安圭拉",
	"ao": "安哥拉",
	"al": "阿尔巴尼亚",
	"af": "阿富汗",
	"sa": "沙特阿拉伯",
	"nl": "荷兰",
}
View Source
// ProvinceShortMap maps the single-character abbreviation of a Chinese
// province-level region to the region's name. Some regions are reachable
// through two abbreviations (陕/秦, 陇/甘, 川/蜀, 黔/贵, 滇/云), and the values
// for 港, 澳 and 台 carry a 中国 prefix.
var ProvinceShortMap = map[string]string{
	"京": "北京",
	"津": "天津",
	"沪": "上海",
	"渝": "重庆",
	"黑": "黑龙江",
	"吉": "吉林",
	"辽": "辽宁",
	"冀": "河北",
	"豫": "河南",
	"鲁": "山东",
	"晋": "山西",
	"陕": "陕西",
	"秦": "陕西",
	"蒙": "内蒙古",
	"宁": "宁夏",
	"陇": "甘肃",
	"甘": "甘肃",
	"新": "新疆",
	"青": "青海",
	"藏": "西藏",
	"鄂": "湖北",
	"皖": "安徽",
	"苏": "江苏",
	"浙": "浙江",
	"闽": "福建",
	"湘": "湖南",
	"赣": "江西",
	"川": "四川",
	"蜀": "四川",
	"黔": "贵州",
	"贵": "贵州",
	"滇": "云南",
	"云": "云南",
	"粤": "广东",
	"桂": "广西",
	"琼": "海南",
	"港": "中国香港",
	"澳": "中国澳门",
	"台": "中国台湾",
}
View Source
// RegexHostnameIpPattern is the compiled form of RegexHostnameIp.
// regexp.MustCompile panics during package initialization if the pattern
// fails to compile.
var RegexHostnameIpPattern = regexp.MustCompile(RegexHostnameIp)

Functions

func DomainTop

func DomainTop(d string) string

DomainTop 返回顶级域名

func DomainTopFromUrl

func DomainTopFromUrl(urlStr string) string

DomainTopFromUrl 解析 URL 返回顶级域名

func Icp

func Icp(doc *goquery.Document) (string, string)

Icp 返回网站备案相关的信息

func IcpFromText

func IcpFromText(text string) (string, string)

IcpFromText 提取文本中备案相关的信息

func LinkIsContentByRegex

func LinkIsContentByRegex(linkUrl *url.URL, rules LinkTypeRule) bool

func MetaFromHost

func MetaFromHost(host string, lang string) (string, string, string)

MetaFromHost 根据域名尽可能返回一些固定信息

func WebContentTitleClean added in v0.4.0

func WebContentTitleClean(title string, lang string) string

WebContentTitleClean 返回内容页尽量清洗后的网页标题

func WebDescription

func WebDescription(doc *goquery.Document, maxLength int) string

WebDescription 返回网页描述, 最大 384 个字符

func WebKeywords

func WebKeywords(doc *goquery.Document) string

WebKeywords 返回网页 Keyword

func WebLinkTitles

func WebLinkTitles(doc *goquery.Document, baseUrl *url.URL, strictDomain bool) (map[string]string, map[string]string)

WebLinkTitles 返回网页链接和锚文本

func WebTitle

func WebTitle(doc *goquery.Document, maxLength int) string

WebTitle 返回网页标题, 最大 128 个字符

func WebTitleClean

func WebTitleClean(title string, lang string) string

WebTitleClean 返回尽量清洗后的网页标题

Types

type Content

// Content carries the documents and metadata for one page. It is created by
// NewContent and exposes ExtractNews, which returns a *News.
type Content struct {
	// OriginDoc is the original document.
	OriginDoc *goquery.Document
	// Doc is the document.
	// NOTE(review): presumably a working copy of OriginDoc — confirm.
	Doc *goquery.Document
	// OriginTitle is the original title, taken from the parent page.
	OriginTitle string
	// OriginUrl is the original link, taken from the parent page.
	OriginUrl string
	// Lang is the language.
	Lang string
	// contains filtered or unexported fields
}

func NewContent

func NewContent(docOrg *goquery.Document, lang string, originTitle string, originUrl string) *Content

func (*Content) Debug added in v0.4.0

func (c *Content) Debug()

func (*Content) ExtractNews added in v0.4.0

func (c *Content) ExtractNews() *News

type Domain

// Domain is the parsed form of a domain name, as returned by DomainParse and
// DomainParseFromUrl.
// NOTE(review): presumably Subdomain, Domain and TLD are the subdomain, the
// registrable name and the public suffix, and ICANN reports whether the suffix
// is ICANN-managed — confirm against the parser implementation.
type Domain struct {
	Subdomain, Domain, TLD string
	ICANN                  bool
}

func DomainParse

func DomainParse(domain string) (*Domain, error)

DomainParse 解析域名, 返回 Domain

func DomainParseFromUrl

func DomainParseFromUrl(urlStr string) (*Domain, error)

DomainParseFromUrl 解析 URL 中的域名, 返回 Domain

type LinkRes

// LinkRes groups links by category; LinkTypes returns a *LinkRes.
// NOTE(review): each map presumably goes from link URL to anchor text,
// mirroring the linkTitles argument of LinkTypes — confirm.
type LinkRes struct {
	// Content holds content-page links.
	Content map[string]string
	// List holds list-page links.
	List map[string]string
	// Unknown holds links of unknown type.
	Unknown map[string]string
	// None holds filtered-out links.
	None map[string]string
}

func LinkTypes

func LinkTypes(linkTitles map[string]string, lang string, rules LinkTypeRule) (*LinkRes, map[string]bool)

LinkTypes 返回链接分类结果

type LinkType

// LinkType is the integer type of the LinkTypeNone, LinkTypeContent,
// LinkTypeList and LinkTypeUnknown constants; LinkIsContentByTitle returns it.
type LinkType int

func LinkIsContentByTitle added in v0.2.0

func LinkIsContentByTitle(linkUrl *url.URL, title string, lang string) LinkType

type LinkTypeRule added in v0.2.0

// LinkTypeRule is the rule set passed to LinkTypes and LinkIsContentByRegex.
// NOTE(review): the meaning of the keys and of the string slices is not
// visible here — presumably a host or category mapped to regex patterns;
// confirm against LinkIsContentByRegex.
type LinkTypeRule map[string][]string

type News added in v0.4.0

// News is the extraction result returned by Content.ExtractNews.
type News struct {
	// Title is the title.
	Title string
	// TitlePos is the basis on which the title was extracted.
	TitlePos string
	// TimeLocal is the publish time.
	TimeLocal string
	// Time is the original (raw) time.
	Time string
	// TimePos is the basis on which the publish time was extracted.
	TimePos string
	// Content is the plain text of the body.
	Content string
	// ContentNode is the Node of the body.
	ContentNode *html.Node
	// Spend is the extraction time in milliseconds.
	Spend int64
	// Lang is the language.
	Lang string
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL