Documentation ¶
Overview ¶
Package extract 新闻要素抽取, 在 CEPF 算法基础上做了大量的优化。 Refer to: 基于标签路径特征融合新闻内容抽取的 CEPF 算法 (吴共庆等), http://www.jos.org.cn/jos/article/abstract/4868
Index ¶
- Constants
- Variables
- func DomainTop(d string) string
- func DomainTopFromUrl(urlStr string) string
- func Icp(doc *goquery.Document) (string, string)
- func IcpFromText(text string) (string, string)
- func LinkIsContentByRegex(linkUrl *url.URL, rules LinkTypeRule) bool
- func MetaFromHost(host string, lang string) (string, string, string)
- func WebContentTitleClean(title string, lang string) string
- func WebDescription(doc *goquery.Document, maxLength int) string
- func WebKeywords(doc *goquery.Document) string
- func WebLinkTitles(doc *goquery.Document, baseUrl *url.URL, strictDomain bool) (map[string]string, map[string]string)
- func WebTitle(doc *goquery.Document, maxLength int) string
- func WebTitleClean(title string, lang string) string
- type Content
- type Domain
- type LinkRes
- type LinkType
- type LinkTypeRule
- type News
Constants ¶
View Source
const ( ContentRemoveTags = "script,noscript,style,iframe,br,link,svg,textarea" // RegexPublishDate 完整的发布时间正则 RegexPublishDate = "" /* 242-byte string literal not displayed */ // RegexPublishShortDate 年份缩写发布时间正则, 如 22-09-02 11:11:11 RegexPublishShortDate = "" /* 254-byte string literal not displayed */ // RegexPublishDateNoYear 不包含年的发布时间(优先级低), 09-02 RegexPublishDateNoYear = "" /* 174-byte string literal not displayed */ // RegexEnPublishDate1 英文格式的正则1, 如 02 Sep 2022 11:40:53 pm RegexEnPublishDate1 = "" /* 335-byte string literal not displayed */ // RegexEnPublishDate2 英文格式的正则2, 如 Sep 02 2022 11:40:53 pm RegexEnPublishDate2 = "" /* 335-byte string literal not displayed */ // RegexEnUsPublishDate 英文美式格式的正则3, 如 8/30/2022 11:11:11 RegexEnUsPublishDate = "" /* 180-byte string literal not displayed */ // RegexTime 仅时间正则 RegexTime = "([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?" // RegexZhPublishPrefix 中文的发布时间前缀 RegexZhPublishPrefix = "(?i)(发布|创建|出版|发表|编辑)?(时间|日期)" // RegexZhPublishDate 中文的固定格式, 如 发布时间: xxx RegexZhPublishDate = RegexZhPublishPrefix + "[\\pP ]{1,8}" + RegexPublishShortDate // RegexScriptTitle Script 中的标题 RegexScriptTitle = `(?i)"title"[\t ]{0,4}:[\t ]{0,4}"(.*)"` // RegexScriptTime Script 中的发布时间 RegexScriptTime = `` /* 277-byte string literal not displayed */ // RegexWxScriptTime 微信 Script 中的发布时间 RegexWxScriptTime = `(?i)ct[\t ]{0,4}=[\t ]{0,4}"(1[2-9]\d{8})"` // RegexContentUrlPublishDate 内容页URL中隐藏的时间, 必须是非常完整标准的时间 20221003 RegexContentUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2])[/]?(0[1-9]|[1-2][0-9]|3[0-1]))` // RegexFormatTime3 错误的时间格式, 用于过滤 RegexFormatTime3 = `[:分]\d{3}$` // RegexFormatTime4 错误的时间格式, 用于过滤 RegexFormatTime4 = `[:分]\d{4}$` // RegexZone 错误的时区格式, 用于过滤 RegexZone = `(([\+-]\d{2})[:]?\d{2})$` // TitleSimZh 中文相似度阈值 TitleSimZh = 0.3 // TitleSimWord 单词相似度阈值 TitleSimWord = 0.5 )
View Source
const ( RegexIcp = `` /* 175-byte string literal not displayed */ RegexIcpGa = `` /* 167-byte string literal not displayed */ RegexIcpDx = `` /* 158-byte string literal not displayed */ )
View Source
const ( LinkTypeNone LinkType = 0 LinkTypeContent LinkType = 1 LinkTypeList LinkType = 2 LinkTypeUnknown LinkType = 3 RegexUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2]|[1-9])[/]?(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])?)` RegexIndexSuffix = `^/index\.(html|shtml|htm|php|asp|aspx|jsp)$` RegexTitleZhBlack = "(经营|制作|信息服务|出版|出版服务|演出|视听节目|新闻|视听|新网)许可证" )
View Source
const (
RegexHostnameIp = `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`
)
Variables ¶
View Source
var ( RegexIcpPattern = regexp.MustCompile(RegexIcp) RegexIcpGaPattern = regexp.MustCompile(RegexIcpGa) RegexIcpDxPattern = regexp.MustCompile(RegexIcpDx) )
View Source
// HostGovCountryMap maps a lower-case two-letter country code to the
// country's Chinese name.
// NOTE(review): presumably the key is the country-code part of a government
// host name (e.g. the "uk" in "gov.uk") — confirm against the caller.
//
// The South Africa entry was previously keyed "az", which is Azerbaijan's
// code; it is now keyed "za", and "az" maps to Azerbaijan.
var HostGovCountryMap = map[string]string{
	"hk": "中国",
	"tw": "中国",
	"mo": "中国",
	"jp": "日本",
	"kr": "韩国",
	"in": "印度",
	"uk": "英国",
	"us": "美国",
	"it": "意大利",
	"es": "西班牙",
	"ru": "俄罗斯",
	"de": "德国",
	"fr": "法国",
	"th": "泰国",
	"vn": "越南",
	"sg": "新加坡",
	"au": "澳大利亚",
	"ca": "加拿大",
	"il": "以色列",
	"mm": "缅甸",
	"dz": "阿尔及利亚",
	"pl": "波兰",
	"za": "南非",
	"az": "阿塞拜疆",
	"ng": "尼日利亚",
	"kp": "朝鲜",
	"lb": "黎巴嫩",
	"ua": "乌克兰",
	"tr": "土耳其",
	"se": "瑞典",
	"lk": "斯里兰卡",
	"si": "斯洛文尼亚",
	"sk": "斯洛伐克",
	"ro": "罗马尼亚",
	"pt": "葡萄牙",
	"ph": "菲律宾",
	"pk": "巴基斯坦",
	"py": "巴拉圭",
	"np": "尼泊尔",
	"ma": "摩洛哥",
	"my": "马来西亚",
	"lt": "立陶宛",
	"ie": "爱尔兰",
	"iq": "伊拉克",
	"ir": "伊朗",
	"id": "印度尼西亚",
	"hu": "匈牙利",
	"gr": "希腊",
	"eg": "埃及",
	"cz": "捷克",
	"hr": "克罗地亚",
	"co": "哥伦比亚",
	"cl": "智利",
	"br": "巴西",
	"bg": "保加利亚",
	"be": "比利时",
	"bd": "孟加拉国",
	"aw": "阿鲁巴",
	"am": "亚美尼亚",
	"ai": "安圭拉",
	"ao": "安哥拉",
	"al": "阿尔巴尼亚",
	"af": "阿富汗",
	"sa": "沙特阿拉伯",
	"nl": "荷兰",
}
View Source
// ProvinceShortMap maps a one-character Chinese region abbreviation to the
// region's name. Several regions are reachable through two different
// abbreviations (for example both "陕" and "秦" yield "陕西").
var ProvinceShortMap = map[string]string{
	// Direct-administered municipalities.
	"京": "北京",
	"津": "天津",
	"沪": "上海",
	"渝": "重庆",

	// Provinces with a single abbreviation.
	"黑": "黑龙江",
	"吉": "吉林",
	"辽": "辽宁",
	"冀": "河北",
	"豫": "河南",
	"鲁": "山东",
	"晋": "山西",
	"青": "青海",
	"鄂": "湖北",
	"皖": "安徽",
	"苏": "江苏",
	"浙": "浙江",
	"闽": "福建",
	"湘": "湖南",
	"赣": "江西",
	"粤": "广东",
	"琼": "海南",

	// Provinces listed under two abbreviations.
	"陕": "陕西",
	"秦": "陕西",
	"陇": "甘肃",
	"甘": "甘肃",
	"川": "四川",
	"蜀": "四川",
	"黔": "贵州",
	"贵": "贵州",
	"滇": "云南",
	"云": "云南",

	// Autonomous regions.
	"蒙": "内蒙古",
	"宁": "宁夏",
	"新": "新疆",
	"藏": "西藏",
	"桂": "广西",

	// Hong Kong, Macao and Taiwan.
	"港": "中国香港",
	"澳": "中国澳门",
	"台": "中国台湾",
}
View Source
// RegexHostnameIpPattern is RegexHostnameIp compiled once at package
// initialization. regexp.MustCompile panics if the pattern is invalid.
var RegexHostnameIpPattern = regexp.MustCompile(RegexHostnameIp)
Functions ¶
func LinkIsContentByRegex ¶
func LinkIsContentByRegex(linkUrl *url.URL, rules LinkTypeRule) bool
func MetaFromHost ¶
MetaFromHost 根据域名尽可能返回一些固定信息
func WebContentTitleClean ¶ added in v0.4.0
WebContentTitleClean 返回内容页尽量清洗后的网页标题
func WebDescription ¶
WebDescription 返回网页描述, 最大 384 个字符
func WebLinkTitles ¶
func WebLinkTitles(doc *goquery.Document, baseUrl *url.URL, strictDomain bool) (map[string]string, map[string]string)
WebLinkTitles 返回网页链接和锚文本
func WebTitleClean ¶
WebTitleClean 返回尽量清洗后的网页标题
Types ¶
type Content ¶
type Content struct { // 原始 Doc OriginDoc *goquery.Document // Doc Doc *goquery.Document // 原始标题, 来自于上级页面 OriginTitle string // 原始链接, 来自于上级页面 OriginUrl string // 语种 Lang string // contains filtered or unexported fields }
func NewContent ¶
func (*Content) ExtractNews ¶ added in v0.4.0
type Domain ¶
func DomainParseFromUrl ¶
DomainParseFromUrl 解析域名, 返回 Domain
type LinkRes ¶
type LinkTypeRule ¶ added in v0.2.0
Click to show internal directories.
Click to hide internal directories.