simhash

package module
v0.0.0-...-45500e8 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 12, 2019 License: MIT Imports: 6 Imported by: 0

README

Project

A Go package that measures text similarity using SimHash.

About text similarity

https://cloud.tencent.com/developer/article/1389446

How to test

$ go test -v -test.run TestSimHashSimilar

=== RUN TestSimHashSimilar

srcWordsWeight: [{区块链 58.69602153541771} {货币 42.49228769} {分布式 31.1229513822} {比特 30.7892744766} {密码学 26.4150609428} {数字 25.9012790598} {虚拟 25.18603834812} {数据结构 21.285162228} {链式 21.066763644} {利用 20.01581093792} {方式 19.20596931748} {数据 19.12542671356} {顺序 15.15263737918} {来讲 14.73377762808} {基础架构 12.8020653633} {一种 12.394908587969999} {2009 11.739204307083542} {中本聪 11.739204307083542} {最早 11.7133864415} {区块 11.5027823792} {保证 11.32904398058} {不可 10.95866957244} {数据传输 10.604840786} {账本 10.1871055853} {组合成 10.0720362555} {以太 10.0505300503} {篡改 9.96885201925} {莱特 9.87532596124} {编程 9.84023464143} {发明者 9.69598503258}]

dstWordsWeight: [{区块链 58.69602153541771} {篡改 29.906556057750002} {数据 19.12542671356} {技术 18.87782871428} {节点 18.29417492174} {信息 15.76158207831} {金融 15.752110047990001} {缺陷 15.05813160948} {交易 14.735571450600002} {互联网 14.24378550858} {信任 14.15260796386} {痛点 12.8020653633} {中心化 12.8020653633} {有三大 11.739204307083542} {区块 11.5027823792} {假冒伪劣 11.2616203224} {不可 10.95866957244} {解决目前 10.9049453784} {领域 10.82459108482} {大有裨益 10.604840786} {分布式 10.3743171274} {银团 10.3171587135} {工业 10.1355963834} {记账 10.1164880181} {资产 10.08319666818} {内置 9.96885201925} {数据链 9.89334446674} {生命周期 9.72629038208} {金融业务 9.62401153296} {讲课 9.61021821083}]

srcWords:[{区块链 58.69602153541771} {货币 42.49228769} {分布式 31.1229513822} {比特 30.7892744766} {密码学 26.4150609428} {数字 25.9012790598} {虚拟 25.18603834812} {数据结构 21.285162228} {链式 21.066763644} {利用 20.01581093792} {方式 19.20596931748} {数据 19.12542671356} {顺序 15.15263737918} {来讲 14.73377762808} {基础架构 12.8020653633} {一种 12.394908587969999} {2009 11.739204307083542} {中本聪 11.739204307083542} {最早 11.7133864415} {区块 11.5027823792} {保证 11.32904398058} {不可 10.95866957244} {数据传输 10.604840786} {账本 10.1871055853} {组合成 10.0720362555} {以太 10.0505300503} {篡改 9.96885201925} {莱特 9.87532596124} {编程 9.84023464143} {发明者 9.69598503258}]

dstWords:[{区块链 58.69602153541771} {篡改 29.906556057750002} {数据 19.12542671356} {技术 18.87782871428} {节点 18.29417492174} {信息 15.76158207831} {金融 15.752110047990001} {缺陷 15.05813160948} {交易 14.735571450600002} {互联网 14.24378550858} {信任 14.15260796386} {痛点 12.8020653633} {中心化 12.8020653633} {有三大 11.739204307083542} {区块 11.5027823792} {假冒伪劣 11.2616203224} {不可 10.95866957244} {解决目前 10.9049453784} {领域 10.82459108482} {大有裨益 10.604840786} {分布式 10.3743171274} {银团 10.3171587135} {工业 10.1355963834} {记账 10.1164880181} {资产 10.08319666818} {内置 9.96885201925} {数据链 9.89334446674} {生命周期 9.72629038208} {金融业务 9.62401153296} {讲课 9.61021821083}]

srcFingerPrint: [1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1]

dstFingerPrint: [1 0 1 1 0 1 0 1 0 1 1 0 1 0 0 0 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1]

--- PASS: TestSimHashSimilar (0.85s)

similarity_test.go:56: SimHashSimilar distance: 8

PASS

Documentation

Index

Constants

View Source
const (
	// SIMILAR_DISTANCE is the untyped integer constant 3.
	// NOTE(review): presumably the largest fingerprint distance at which two
	// texts still count as similar; its use is not visible here, so confirm
	// against SimHashSimilar and its callers.
	SIMILAR_DISTANCE = 3
)

Variables

This section is empty.

Functions

func RemoveHtml

func RemoveHtml(src string) string

func SimHashSimilar

func SimHashSimilar(srcWordWeighs, dstWordWeights []WordWeight) (distance int, err error)

Types

type GoJieba

// GoJieba holds a pointer to a gojieba.Jieba instance.
// NOTE(review): presumably the methods declared on *GoJieba delegate to this
// instance; their bodies are not visible here, so confirm.
type GoJieba struct {
	// C is the wrapped *gojieba.Jieba value.
	C *gojieba.Jieba
}
// GJB is a package-level *GoJieba, shown here without an initializer.
// NOTE(review): presumably a shared instance that must be assigned before
// use; where it is set is not visible here, so confirm.
var GJB *GoJieba

func NewGoJieba

func NewGoJieba() *GoJieba

func (*GoJieba) AddWords

func (this *GoJieba) AddWords(words []string)

func (*GoJieba) Close

func (this *GoJieba) Close()

func (*GoJieba) JiebaCut

func (this *GoJieba) JiebaCut(rawStr string, useHmm bool, cutAll bool) (words []string)

func (*GoJieba) JiebaCutForSearch

func (this *GoJieba) JiebaCutForSearch(rawStr string, useHmm bool)

func (*GoJieba) JiebaCutWithFrequency

func (this *GoJieba) JiebaCutWithFrequency(rawStr string, useHmm bool, cutAll bool) (wordsFreqs map[string]int)

type WordWeight

// WordWeight pairs a word with a float64 weight. SimHashSimilar takes two
// slices of WordWeight as its inputs.
type WordWeight struct {
	// Word is the word text.
	Word string
	// Weight is the numeric weight attached to Word.
	// NOTE(review): how the weight is computed is not visible here; confirm
	// the scheme against the code that builds these values.
	Weight float64
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL