stalefish

package module
v0.0.0-...-e3ec724 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 27, 2021 License: MIT Imports: 15 Imported by: 5

README

stalefish

build test

stalefish is a toy full-text search engine written in Go. MySQL is currently used for data persistence. Each document has only one field.

Specification

  • Indexing Documents
  • Search by MatchAllQuery
  • Search by MatchQuery(AND,OR)
  • Search by PhraseQuery
  • Multiple types of analyzers

Setup

# Setup MySQL
$ docker-compose up

# Test
$ make test

Example1

package main

import (
	"fmt"
	"log"

	"github.com/kotaroooo0/stalefish"
)

func main() {
	db, err := stalefish.NewDBClient(stalefish.NewDBConfig("root", "password", "127.0.0.1", "3306", "stalefish"))
	if err != nil {
		log.Fatal(err)
	}
	storage := stalefish.NewStorageRdbImpl(db)
	analyzer := stalefish.NewAnalyzer([]stalefish.CharFilter{}, stalefish.NewStandardTokenizer(), []stalefish.TokenFilter{stalefish.NewLowercaseFilter()})

	indexer := stalefish.NewIndexer(storage, analyzer, 1)
	for _, body := range []string{"Ruby PHP JS", "Go Ruby", "Ruby Go PHP", "Go PHP"} {
		if err := indexer.AddDocument(stalefish.NewDocument(body)); err != nil {
			log.Fatal(err)
		}
	}

	sorter := stalefish.NewTfIdfSorter(storage)
	mq := stalefish.NewMatchQuery("GO Ruby", stalefish.OR, analyzer, sorter)
	mseacher := mq.Searcher(storage)
	result, err := mseacher.Search()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(result) // [{2 Go Ruby 2} {3 Ruby Go PHP 3} {4 Go PHP 2} {1 Ruby PHP JS 3}]

	pq := stalefish.NewPhraseQuery("go RUBY", analyzer, nil)
	pseacher := pq.Searcher(storage)
	result, err = pseacher.Search()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(result) // [{2 Go Ruby 2}]
}

Example2

package main

import (
	"fmt"

	"github.com/kotaroooo0/stalefish"
)

func main() {
	analyzer := stalefish.NewAnalyzer(
		[]stalefish.CharFilter{stalefish.NewMappingCharFilter(map[string]string{":(": "sad"})},
		stalefish.NewStandardTokenizer(),
		[]stalefish.TokenFilter{stalefish.NewLowercaseFilter(), stalefish.NewStemmerFilter(), stalefish.NewStopWordFilter([]string{"i", "my", "me", "the", "a", "for"})},
	)
	fmt.Println(analyzer.Analyze("I feel TIRED :(")) // {[{0 feel } {0 tire } {0 sad }]}
}

Development Task

  • Scoring with TF/IDF
  • Sorting
  • Setting document fields
  • Replacing MySQL with another DB
  • Performance Tuning

Author

kotaroooo0

LICENSE

MIT

Documentation

Overview

Package stalefish is a generated GoMock package.

Package stalefish is a generated GoMock package.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func NewDBClient

func NewDBClient(dbConfig *DBConfig) (*sqlx.DB, error)

func NewDocumentScore

func NewDocumentScore(doc Document, score float64) documentScore

Types

type Analyzer

type Analyzer struct {
	// contains filtered or unexported fields
}

func NewAnalyzer

func NewAnalyzer(charFilters []CharFilter, tokenizer Tokenizer, tokenFilters []TokenFilter) Analyzer

func (Analyzer) Analyze

func (a Analyzer) Analyze(s string) TokenStream

type CharFilter

type CharFilter interface {
	Filter(string) string
}

type DBConfig

type DBConfig struct {
	User     string
	Password string
	Addr     string
	Port     string
	DB       string
}

func NewDBConfig

func NewDBConfig(user, password, addr, port, db string) *DBConfig

type Document

type Document struct {
	ID         DocumentID
	Body       string `db:"body"`
	TokenCount int    `db:"token_count"`
}

func NewDocument

func NewDocument(body string) Document

type DocumentID

type DocumentID uint64

type EncodedInvertedIndex

type EncodedInvertedIndex struct {
	TokenID     TokenID `db:"token_id"`     // トークンID
	PostingList []byte  `db:"posting_list"` // トークンを含むポスティングスリスト
}

func NewEncodedInvertedIndex

func NewEncodedInvertedIndex(id TokenID, pl []byte) EncodedInvertedIndex

type Indexer

type Indexer struct {
	// contains filtered or unexported fields
}

func NewIndexer

func NewIndexer(storage Storage, analyzer Analyzer, indexSizeThreshold int) *Indexer

func (*Indexer) AddDocument

func (i *Indexer) AddDocument(doc Document) error

転置インデックスにドキュメントを追加する

type InvertedIndex

type InvertedIndex map[TokenID]PostingList

転置インデックス TokenIDー>ポスティングリストのマップ

func NewInvertedIndex

func NewInvertedIndex(m map[TokenID]PostingList) InvertedIndex

func (InvertedIndex) TokenIDs

func (ii InvertedIndex) TokenIDs() []TokenID

type KanaReadingformFilter

type KanaReadingformFilter struct{}

func NewKanaReadingformFilter

func NewKanaReadingformFilter() KanaReadingformFilter

func (KanaReadingformFilter) Filter

func (f KanaReadingformFilter) Filter(tokenStream TokenStream) TokenStream

type Logic

type Logic int
const (
	AND Logic = iota + 1
	OR
)

type LowercaseFilter

type LowercaseFilter struct{}

func NewLowercaseFilter

func NewLowercaseFilter() LowercaseFilter

func (LowercaseFilter) Filter

func (f LowercaseFilter) Filter(tokenStream TokenStream) TokenStream

type MappingCharFilter

type MappingCharFilter struct {
	// contains filtered or unexported fields
}

func NewMappingCharFilter

func NewMappingCharFilter(mapper map[string]string) MappingCharFilter

func (MappingCharFilter) Filter

func (c MappingCharFilter) Filter(s string) string

type MatchAllQuery

type MatchAllQuery struct{}

func NewMatchAllQuery

func NewMatchAllQuery() MatchAllQuery

func (MatchAllQuery) Searcher

func (q MatchAllQuery) Searcher(storage Storage) Searcher

type MatchAllSearcher

type MatchAllSearcher struct {
	// contains filtered or unexported fields
}

func NewMatchAllSearcher

func NewMatchAllSearcher(storage Storage) MatchAllSearcher

func (MatchAllSearcher) Search

func (ms MatchAllSearcher) Search() ([]Document, error)

type MatchQuery

type MatchQuery struct {
	// contains filtered or unexported fields
}

func NewMatchQuery

func NewMatchQuery(keyword string, logic Logic, analyzer Analyzer, sorter Sorter) MatchQuery

func (MatchQuery) Searcher

func (q MatchQuery) Searcher(storage Storage) Searcher

type MatchSearcher

type MatchSearcher struct {
	// contains filtered or unexported fields
}

func NewMatchSearcher

func NewMatchSearcher(tokenStream TokenStream, logic Logic, storage Storage, sorter Sorter) MatchSearcher

func (MatchSearcher) Search

func (ms MatchSearcher) Search() ([]Document, error)

type MockMorphology

type MockMorphology struct {
	// contains filtered or unexported fields
}

MockMorphology is a mock of Morphology interface.

func NewMockMorphology

func NewMockMorphology(ctrl *gomock.Controller) *MockMorphology

NewMockMorphology creates a new mock instance.

func (*MockMorphology) Analyze

func (m *MockMorphology) Analyze(arg0 string) []morphology.MorphologyToken

Analyze mocks base method.

func (*MockMorphology) EXPECT

EXPECT returns an object that allows the caller to indicate expected use.

type MockMorphologyMockRecorder

type MockMorphologyMockRecorder struct {
	// contains filtered or unexported fields
}

MockMorphologyMockRecorder is the mock recorder for MockMorphology.

func (*MockMorphologyMockRecorder) Analyze

func (mr *MockMorphologyMockRecorder) Analyze(arg0 interface{}) *gomock.Call

Analyze indicates an expected call of Analyze.

type MockStorage

type MockStorage struct {
	// contains filtered or unexported fields
}

MockStorage is a mock of Storage interface.

func NewMockStorage

func NewMockStorage(ctrl *gomock.Controller) *MockStorage

NewMockStorage creates a new mock instance.

func (*MockStorage) AddDocument

func (m *MockStorage) AddDocument(arg0 Document) (DocumentID, error)

AddDocument mocks base method.

func (*MockStorage) AddToken

func (m *MockStorage) AddToken(token Token) (TokenID, error)

AddToken mocks base method.

func (*MockStorage) CountDocuments

func (m *MockStorage) CountDocuments() (int, error)

CountDocuments mocks base method.

func (*MockStorage) EXPECT

func (m *MockStorage) EXPECT() *MockStorageMockRecorder

EXPECT returns an object that allows the caller to indicate expected use.

func (*MockStorage) GetAllDocuments

func (m *MockStorage) GetAllDocuments() ([]Document, error)

GetAllDocuments mocks base method.

func (*MockStorage) GetDocuments

func (m *MockStorage) GetDocuments(arg0 []DocumentID) ([]Document, error)

GetDocuments mocks base method.

func (*MockStorage) GetInvertedIndexByTokenIDs

func (m *MockStorage) GetInvertedIndexByTokenIDs(arg0 []TokenID) (InvertedIndex, error)

GetInvertedIndexByTokenIDs mocks base method.

func (*MockStorage) GetTokenByTerm

func (m *MockStorage) GetTokenByTerm(arg0 string) (*Token, error)

GetTokenByTerm mocks base method.

func (*MockStorage) GetTokensByTerms

func (m *MockStorage) GetTokensByTerms(arg0 []string) ([]Token, error)

GetTokensByTerms mocks base method.

func (*MockStorage) UpsertInvertedIndex

func (m *MockStorage) UpsertInvertedIndex(arg0 InvertedIndex) error

UpsertInvertedIndex mocks base method.

type MockStorageMockRecorder

type MockStorageMockRecorder struct {
	// contains filtered or unexported fields
}

MockStorageMockRecorder is the mock recorder for MockStorage.

func (*MockStorageMockRecorder) AddDocument

func (mr *MockStorageMockRecorder) AddDocument(arg0 interface{}) *gomock.Call

AddDocument indicates an expected call of AddDocument.

func (*MockStorageMockRecorder) AddToken

func (mr *MockStorageMockRecorder) AddToken(token interface{}) *gomock.Call

AddToken indicates an expected call of AddToken.

func (*MockStorageMockRecorder) CountDocuments

func (mr *MockStorageMockRecorder) CountDocuments() *gomock.Call

CountDocuments indicates an expected call of CountDocuments.

func (*MockStorageMockRecorder) GetAllDocuments

func (mr *MockStorageMockRecorder) GetAllDocuments() *gomock.Call

GetAllDocuments indicates an expected call of GetAllDocuments.

func (*MockStorageMockRecorder) GetDocuments

func (mr *MockStorageMockRecorder) GetDocuments(arg0 interface{}) *gomock.Call

GetDocuments indicates an expected call of GetDocuments.

func (*MockStorageMockRecorder) GetInvertedIndexByTokenIDs

func (mr *MockStorageMockRecorder) GetInvertedIndexByTokenIDs(arg0 interface{}) *gomock.Call

GetInvertedIndexByTokenIDs indicates an expected call of GetInvertedIndexByTokenIDs.

func (*MockStorageMockRecorder) GetTokenByTerm

func (mr *MockStorageMockRecorder) GetTokenByTerm(arg0 interface{}) *gomock.Call

GetTokenByTerm indicates an expected call of GetTokenByTerm.

func (*MockStorageMockRecorder) GetTokensByTerms

func (mr *MockStorageMockRecorder) GetTokensByTerms(arg0 interface{}) *gomock.Call

GetTokensByTerms indicates an expected call of GetTokensByTerms.

func (*MockStorageMockRecorder) UpsertInvertedIndex

func (mr *MockStorageMockRecorder) UpsertInvertedIndex(arg0 interface{}) *gomock.Call

UpsertInvertedIndex indicates an expected call of UpsertInvertedIndex.

type MorphologicalTokenizer

type MorphologicalTokenizer struct {
	// contains filtered or unexported fields
}

func NewMorphologicalTokenizer

func NewMorphologicalTokenizer(morphology morphology.Morphology) MorphologicalTokenizer

func (MorphologicalTokenizer) Tokenize

type NgramTokenizer

type NgramTokenizer struct {
	// contains filtered or unexported fields
}

func NewNgramTokenizer

func NewNgramTokenizer(n int) NgramTokenizer

func (NgramTokenizer) Tokenize

func (t NgramTokenizer) Tokenize(s string) TokenStream

type PhraseQuery

type PhraseQuery struct {
	// contains filtered or unexported fields
}

func NewPhraseQuery

func NewPhraseQuery(phrase string, analyzer Analyzer, sorter Sorter) PhraseQuery

func (PhraseQuery) Searcher

func (q PhraseQuery) Searcher(storage Storage) Searcher

type PhraseSearcher

type PhraseSearcher struct {
	// contains filtered or unexported fields
}

func NewPhraseSearcher

func NewPhraseSearcher(tokenStream TokenStream, storage Storage, sorter Sorter) PhraseSearcher

func (PhraseSearcher) Search

func (ps PhraseSearcher) Search() ([]Document, error)

type PostingList

type PostingList struct {
	Postings *Postings // トークンごとのポスティングリスト
}

ポスティングリスト

func NewPostingList

func NewPostingList(pl *Postings) PostingList

func (PostingList) AppearanceCountInDocument

func (p PostingList) AppearanceCountInDocument(docID DocumentID) int

func (PostingList) Size

func (p PostingList) Size() int

type Postings

type Postings struct {
	DocumentID DocumentID // ドキュメントのID
	Positions  []uint64   // ドキュメント中での位置情報
	Next       *Postings  // 次のポスティングへのポインタ
}

ポスティング(ドキュメントID等を含むリンクリスト)

func NewPostings

func NewPostings(documentID DocumentID, positions []uint64, next *Postings) *Postings

func (*Postings) PushBack

func (p *Postings) PushBack(e *Postings)

type RomajiReadingformFilter

type RomajiReadingformFilter struct{}

func NewRomajiReadingformFilter

func NewRomajiReadingformFilter() RomajiReadingformFilter

func (RomajiReadingformFilter) Filter

func (f RomajiReadingformFilter) Filter(tokenStream TokenStream) TokenStream

type Searcher

type Searcher interface {
	Search() ([]Document, error)
}

type Sorter

type Sorter interface {
	Sort([]Document, InvertedIndex, []Token) ([]Document, error)
}

type StandardTokenizer

type StandardTokenizer struct{}

func NewStandardTokenizer

func NewStandardTokenizer() StandardTokenizer

func (StandardTokenizer) Tokenize

func (t StandardTokenizer) Tokenize(s string) TokenStream

type StemmerFilter

type StemmerFilter struct{}

func NewStemmerFilter

func NewStemmerFilter() StemmerFilter

func (StemmerFilter) Filter

func (f StemmerFilter) Filter(tokenStream TokenStream) TokenStream

type StopWordFilter

type StopWordFilter struct {
	// contains filtered or unexported fields
}

func NewStopWordFilter

func NewStopWordFilter(stopWords []string) StopWordFilter

func (StopWordFilter) Filter

func (f StopWordFilter) Filter(tokenStream TokenStream) TokenStream

type Storage

type Storage interface {
	CountDocuments() (int, error)
	GetAllDocuments() ([]Document, error)                        // 全てのドキュメントを返す
	GetDocuments([]DocumentID) ([]Document, error)               // 複数IDから複数ドキュメントを返す
	AddDocument(Document) (DocumentID, error)                    // ドキュメントを挿入する。挿入したドキュメントのIDを返す
	AddToken(token Token) (TokenID, error)                       // トークンを挿入する。挿入したトークンのIDを返す
	GetTokenByTerm(string) (*Token, error)                       // 語句からトークンを取得する
	GetTokensByTerms([]string) ([]Token, error)                  // 複数の語句から複数トークンを取得する
	GetInvertedIndexByTokenIDs([]TokenID) (InvertedIndex, error) // 複数トークンIDから転置インデックスを取得する
	UpsertInvertedIndex(InvertedIndex) error                     // 転置リストを更新する
}

type StorageRdbImpl

type StorageRdbImpl struct {
	DB *sqlx.DB
}

func NewStorageRdbImpl

func NewStorageRdbImpl(db *sqlx.DB) StorageRdbImpl

func (StorageRdbImpl) AddDocument

func (s StorageRdbImpl) AddDocument(doc Document) (DocumentID, error)

func (StorageRdbImpl) AddToken

func (s StorageRdbImpl) AddToken(token Token) (TokenID, error)

func (StorageRdbImpl) CountDocuments

func (s StorageRdbImpl) CountDocuments() (int, error)

func (StorageRdbImpl) GetAllDocuments

func (s StorageRdbImpl) GetAllDocuments() ([]Document, error)

func (StorageRdbImpl) GetDocuments

func (s StorageRdbImpl) GetDocuments(ids []DocumentID) ([]Document, error)

func (StorageRdbImpl) GetInvertedIndexByTokenIDs

func (s StorageRdbImpl) GetInvertedIndexByTokenIDs(ids []TokenID) (InvertedIndex, error)

func (StorageRdbImpl) GetTokenByTerm

func (s StorageRdbImpl) GetTokenByTerm(term string) (*Token, error)

func (StorageRdbImpl) GetTokensByTerms

func (s StorageRdbImpl) GetTokensByTerms(terms []string) ([]Token, error)

func (StorageRdbImpl) UpsertInvertedIndex

func (s StorageRdbImpl) UpsertInvertedIndex(inverted InvertedIndex) error

type TfIdfSorter

type TfIdfSorter struct {
	// contains filtered or unexported fields
}

func NewTfIdfSorter

func NewTfIdfSorter(storage Storage) *TfIdfSorter

func (*TfIdfSorter) Sort

func (s *TfIdfSorter) Sort(docs []Document, invertedIndex InvertedIndex, tokens []Token) ([]Document, error)

type Token

type Token struct {
	ID   TokenID `db:"id"`
	Term string  `db:"term"`
	Kana string  `db:"kana"`
}

func NewToken

func NewToken(term string, options ...TokenOption) Token

type TokenFilter

type TokenFilter interface {
	Filter(TokenStream) TokenStream
}

type TokenID

type TokenID uint64

type TokenOption

type TokenOption func(*Token)

type TokenStream

type TokenStream struct {
	Tokens []Token
}

func NewTokenStream

func NewTokenStream(tokens []Token) TokenStream

func (TokenStream) Size

func (ts TokenStream) Size() int

func (TokenStream) Terms

func (ts TokenStream) Terms() []string

type Tokenizer

type Tokenizer interface {
	Tokenize(string) TokenStream
}

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL