bloomfilter

package module
v0.0.0-...-533b673 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 7, 2022 License: MIT Imports: 14 Imported by: 2

README

布隆过滤器

什么是布隆过滤器

**布隆过滤器本质上是一个数据结构,它可以用来判断某个元素是否在集合内,具有运行快速,内存占用小的特点。 而高效插入和查询的代价就是,Bloom Filter 是一个基于概率的数据结构:它只能告诉我们一个元素绝对不在集合内或可能在集合内。

A Bloom filter is a data structure designed to tell you, rapidly and memory-efficiently, whether an element is present in a set. The price paid for this efficiency is that a Bloom filter is a probabilistic data structure: it tells us that the element either definitely is not in the set or may be in the set.**

过滤器类型 FilterTypes

类型 Type 特点 Feature
MemoryFilter 基于内存的过滤器,可以通过将字节内容加载到内存后进行操作 Filter based on memory. All the operations would be done in memory.
CachedRedisFilter 基于MemoryFilter, 在使用Write方法时上传数据到Redis Filter based on MemoryFilter, the result will be uploaded to redis server when the Write function is called.
SQLFilter 基于MemoryFilter, 在使用Write方法时上传数据到Sql数据库。 Filter based on MemoryFilter, the result will be uploaded to sql database when the Write function is called.
InteractiveRedisFilter 交互式的Redis布隆过滤器,使用PushExists方法时,都将实时与Redis服务器通讯。 Interactive filter which will update the redis BitMap through SetBit and GetBit by Push and Exists function.
使用方法 Usage
MemoryFilter
func TestMemoryFilter(t *testing.T) {
	// Initial a memory filter
	memFilter := bloomfilter.NewMemoryFilter(make([]byte, 10240), bloomfilter.DefaultHash...)
	// Push 2000-3000 numbers to the filter.
	// 把2000-3000的数字压入过滤器
	fillNums(memFilter, 2000, 3000)
	// Check whether 2500-3000 and 3001-3500 exist in the filter or not.
	// 查看2500-3000,以及3001-3500是否存在于过滤器中
	for i := 2500; i < 3500; i++ {
		t.Logf("%d, %t", i, memFilter.Exists([]byte(strconv.Itoa(i))))
	}
}

CachedRedisFilter
var options = &redis.Options{
	Addr:     "192.168.20.101:6379",
	Username: "",
	Password: "",
	DB:       0,
}

var key = "test"

func TestRedisCachedFilter(t *testing.T) {
	cli := redis.NewClient(options)
	cachedFilter, err := bloomfilter.NewRedisFilter(context.TODO(), cli, bloomfilter.RedisFilterType_Cached, key, 10240, bloomfilter.DefaultHash...)
	if err != nil {
		t.Fatal(err)
	}
	t.Log("误判率,False positive rate:", bloomfilter.GetFlasePositiveRate(10240 * 8, 3, 2)) // 5.364385288193686e-11
	fillNums(cachedFilter, 250, 300)
	t.Log(cachedFilter.Exists([]byte(strconv.Itoa(290)))) // true
	t.Log(cachedFilter.Exists([]byte(strconv.Itoa(299)))) // true
	t.Log(cachedFilter.Exists([]byte(strconv.Itoa(350)))) // false
	// must use write to save the data to redis.
	// 必须使用write方法将数据全部提交到redis服务器
	cachedFilter.Write()
}
SQLFilter
var sqlDSN = "user:password@tcp(ip:port)/database?charset=utf8mb4&parseTime=True&loc=Local"

func TestSqlFilter(t *testing.T) {
	// Init gorm.DB
	db, err := gorm.Open(mysql.Open(sqlDSN))
	if err != nil {
		t.Fatal(err)
	}
	// Init SQLFilter
	sqlFilter, err := bloomfilter.SqlFilter(db, "test", 1000, bloomfilter.DefaultHash...)
	if err != nil {
		t.Fatal(err)
	}
	// Push 250-300 numbers to the filter.
	// 把250-300的数字压入过滤器
	fillNums(sqlFilter, 250, 300)
	sqlFilter.Write()
}

func TestSqlFilterExist(t *testing.T) {
	db, err := gorm.Open(mysql.Open(sqlDSN))
	if err != nil {
		t.Fatal(err)
	}
	sqlFilter, err := bloomfilter.SqlFilter(db, "test", 1000, bloomfilter.DefaultHash...)
	if err != nil {
		t.Fatal(err)
	}
	// 280-300 should exist in filter, and 301-320 doesn't.
	// 280-300应该在过滤器中,而301-320不应该在。
	for i:=280;i<320;i++{
		t.Logf("%d: %t", i, sqlFilter.Exists([]byte(strconv.Itoa(i))))
	}
}

InteractiveRedisFilter
func TestInteractiveFilter(t *testing.T) {
	cli := redis.NewClient(options)
	interactiveFilter, err := bloomfilter.NewRedisFilter(context.TODO(), cli, bloomfilter.RedisFilterType_Interactive, key, 10240, bloomfilter.DefaultHash...)
	if err != nil {
		t.Fatal(err)
	}
	fillNums(interactiveFilter, 250, 300)

	anotherFilter, _ := bloomfilter.NewRedisFilter(context.TODO(), cli, bloomfilter.RedisFilterType_Interactive, key, 10240, bloomfilter.DefaultHash...)

	t.Log(interactiveFilter.Exists([]byte(strconv.Itoa(290)))) // true
	t.Log(anotherFilter.Exists([]byte(strconv.Itoa(290))))     // true

	t.Log(interactiveFilter.Exists([]byte(strconv.Itoa(299)))) // true
	t.Log(anotherFilter.Exists([]byte(strconv.Itoa(299))))     // true

	t.Log(interactiveFilter.Exists([]byte(strconv.Itoa(320)))) // false
	t.Log(anotherFilter.Exists([]byte(strconv.Itoa(320))) )    // false
}

计算误差 Calculate the false positive rate

func TestMemFalsePositiveRate(t *testing.T) {
	memFilter := bloomfilter.NewMemoryFilter(make([]byte, 10240), bloomfilter.DefaultHash...).(*memory.Filter)
	testFalsePositiveRate(t, memFilter, 10240 * 8, 1000, 3, 100000000)
	//=== RUN   TestMemFalsePositiveRate
	//    common.go:37: 理论误判率 Theoretical false positive rate: 0.0018230817954481005
	//    common.go:49: 实际误判率 Real false positive rate:0.00047424474244742447
	//--- PASS: TestMemFalsePositiveRate (7.43s)
	//PASS
}

计算信息熵,用来测试哈希函数是否是均匀分布的。 Calculate the Information Entropy to test if the hash function is uniformly distributed.

func TestIsHashFuncUniformlyDistributed(t *testing.T) {
	for _, f := range bloomfilter.DefaultHash{
		t.Log(bloomfilter.CalculateInformationEntropy(f))
	}
	//不满足的hash,已舍弃
	t.Log("abandoned hash function:")
	t.Log(bloomfilter.CalculateInformationEntropy(func() hash.Hash64{return crc64.New(crc64.MakeTable(crc64.ISO))}))
}

联系方式 Contact

讲解视频 https://www.bilibili.com/video/BV1bJ411t7A8/ 邮箱 [mailto:hugh2632@hotmail.com]

End

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	DefaultHash = []global.HashFunc{
		func() hash.Hash64 { return fnv.New64() },

		func() hash.Hash64 { return adapter.NewMD5() },
		func() hash.Hash64 { return adapter.NewSha1() },
		func() hash.Hash64 { return adapter.NewSha256() },
		func() hash.Hash64 { return adapter.NewSha512() },
	}
	IsDebug = false
)

Functions

func CalculateInformationEntropy

func CalculateInformationEntropy(f global.HashFunc) float64
	get the information entropy to test if a hash function is uniformly distributed. 获取信息熵以测试一个哈希方法是不是均匀分布的
 	-**-** H = -sum(p*log2(p))
	the target result should be: -64*(1/64*log2(1/64))=6. 理想的均匀分布最终得值是-64*(1/64*log2(1/64))=6=6。

func GetFalsePositiveRate

func GetFalsePositiveRate(biteCount uint, existedCount uint, hashCount uint) float64

Calculate the false positive rate. biteCount-The length of bits, existedCount-The existed elements in the filter, hashCount-The count of hash functions. 计算误判率, biteCount-比特长度, existedCount-元素, hashCount-哈希函数个数

Types

type IFilter

type IFilter interface {
	//添加内容到过滤器中
	Push(content []byte)
	//将数据持久化
	Write() error
	//判断内容是否存在
	Exists(content []byte) bool
	//判断是否为空
	IsEmpty() bool
	//清除数据
	Clear() error
	//关闭
	Close() error
}

增量过滤器

func NewMemoryFilter

func NewMemoryFilter(bytes []byte, hashes ...global.HashFunc) IFilter

Memory Filter

func NewRedisFilter

func NewRedisFilter(context context.Context, cli *redis.Client, tp RedisFilterType, key string, byteLen uint64, hashes ...global.HashFunc) (res IFilter, err error)

RedisFilter. CachedFilter's data is stored in hash map. And you need use "Write" method to truly write the bytes to redis server. InteractiveFilter will commit the contents to redis server whenever 'Push' function is called. 数据保存到redis的过滤器。 如果选择CachedFilter, 其实是在本地维护了一个MemoryFilter, 只有在使用Write方法时才会真正提交到Redis服务器。 如果选择InteractiveFilter, 将在每次Push数据的时候,直接提交到redis服务器。(集群中需要使用分布式锁)

func SqlFilter

func SqlFilter(db *gorm.DB, key string, byteLen uint64, hashes ...global.HashFunc) (IFilter, error)

MemoryBased bloom filter. Synchronize to the sql database when 'Write' method is called. The default tableName is 'bloom' 基于内存的过滤器,在使用Write方法时,将同步到SQL数据库中。默认的表名为'bloom'

type RedisFilterType

type RedisFilterType uint32
const (
	// 缓存模式
	RedisFilterType_Cached RedisFilterType = 0
	// 交互模式
	RedisFilterType_Interactive RedisFilterType = 1
)

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL