gcse

package module

v0.0.0-...-0ae9947 Latest Latest Go to latest Published: Oct 15, 2014 License: BSD-2-Clause, BSD-3-Clause Imports: 33 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/xladykiller/gcse

Links

Open Source Insights

README ¶

Go Search

A keyword search engine helping people to find popular and relevant Go packages.

Online service: Go Search

This is the root package with shared functions.

Sub packages are commands for running:

HTTP Server: Searching and web service
ToCrawl: Find packages to crawl.
Crawler: Crawling package files.
MergeDocs: Merge crawled package files with doc DB.
Indexer: Analyzing package information and generating indexed data for searching.

Development

You'll need to perform the following steps to get a basic server running:

Create a basic conf.json file, limiting the crawler to a one minute run: { "crawler": { "due_per_run": "1m" } }
Run the package finder: go run tocrawl/*.go
Run the crawler: go run crawler/*.go
Merge the crawled docs: go run mergedocs/*.go
Run the indexer: go run indexer/*.go
Run the server: go run server/*.go
Visit http://localhost:8080 in your browser

LICENSE

BSD license.

Documentation ¶

Overview ¶

Package gcse is the core supporting library for go-code-search-engine (GCSE). Its exported types and functions are mainly intended for the sub-packages. If you need some of these functions elsewhere, copy the code into your own project.

Sub-projects ¶

crawler crawling packages

indexer creating index data for web-server

server providing web services, including home/top/search services.

Data-flows ¶

    project   Read            Write
    -------   ----            -----
    crawler   fnCrawlerDB     fnCrawlerDB
              fnDocDB         fnDocDB
                              DBOutSegments
    indexer   DBOutSegments   IndexSegments
    server    IndexSegments

Index ¶

Constants
Variables
func AppendPackages(pkgs []string) bool
func AppendTokens(tokens villa.StrSet, text []byte) villa.StrSet
func AuthorOfPackage(pkg string) string
func CalcMatchScore(doc *HitInfo, tokenList []string, textIdfs, nameIdfs []float64) float64
func CalcPackagePartition(pkg string, totalParts int) int
func CalcStaticScore(doc *HitInfo) float64
func CalcTestStaticScore(doc *HitInfo, realImported []string) float64
func CheckCamel(last, current rune) index.RuneType
func CheckRuneType(last, current rune) index.RuneType
func ChooseImportantSentenses(text string, name, pkg string) []string
func ClearWatcherEvents(watcher *fsnotify.Watcher)
func DumpMemStats()
func FetchAllPackagesInGodoc(httpClient doc.HttpClient) ([]string, error)
func FullProjectOfPackage(pkg string) string
func GenHttpClient(proxy string) doc.HttpClient
func GithubUpdates() (map[string]time.Time, error)
func HostOfPackage(pkg string) string
func IdOfPerson(site, username string) string
func Index(docDB mr.Input) (*index.TokenSetSearcher, error)
func IsBadPackage(err error) bool
func LikeButton(httpClient doc.HttpClient, Url string) (int, error)
func NewDocInfo() sophie.Sophier
func NewNewDocAction() sophie.Sophier
func NormWord(word string) string
func ParsePersonId(id string) (site, username string)
func Plusone(httpClient doc.HttpClient, url string) (int, error)
func ProjectOfPackage(pkg string) string
func ReadJsonFile(fn villa.Path, data interface{}) error
func ReadPackages(segm Segment) (pkgs []string, err error)
func ReadmeToText(fn, data string) string
func SegmentLess(a, b Segment) bool
func SplitSentences(text string) []string
func TrimPackageName(pkg string) string
func WaitForWatcherEvents(watcher *fsnotify.Watcher)
func WriteJsonFile(fn villa.Path, data interface{}) error
type BlackRequest
- func (br *BlackRequest) Do(req *http.Request) (*http.Response, error)
type CrawlerDB
- func LoadCrawlerDB() *CrawlerDB
- func (cdb *CrawlerDB) AppendPackage(pkg string, inDocs func(pkg string) bool)
- func (cdb *CrawlerDB) AppendPerson(site, username string) bool
- func (cdb *CrawlerDB) SchedulePackage(pkg string, sTime time.Time, etag string) error
- func (cdb *CrawlerDB) SchedulePerson(id string, sTime time.Time) error
- func (cdb *CrawlerDB) Sync() error
type CrawlingEntry
- func (c *CrawlingEntry) ReadFrom(r sophie.Reader, l int) error
- func (c *CrawlingEntry) WriteTo(w sophie.Writer) error
type DocDB
type DocInfo
- func (d *DocInfo) ReadFrom(r sophie.Reader, l int) error
- func (d *DocInfo) WriteTo(w sophie.Writer) error
type HitInfo
type MemDB
- func NewMemDB(root villa.Path, kind string) *MemDB
- func (mdb *MemDB) Count() int
- func (mdb *MemDB) Delete(key string)
- func (mdb *MemDB) Export(root villa.Path, kind string) error
- func (mdb *MemDB) Get(key string, data interface{}) bool
- func (mdb *MemDB) Iterate(output func(key string, val interface{}) error) error
- func (mdb *MemDB) LastModified() time.Time
- func (mdb *MemDB) Load() error
- func (mdb *MemDB) Modified() bool
- func (mdb *MemDB) Put(key string, data interface{})
- func (mdb *MemDB) Sync() error
type NewDocAction
- func (nda *NewDocAction) ReadFrom(r sophie.Reader, l int) error
- func (nda *NewDocAction) WriteTo(w sophie.Writer) error
type Package
- func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Package, err error)
type PackedDocDB
- func (db PackedDocDB) Get(key string, data interface{}) bool
- func (db PackedDocDB) Iterate(output func(key string, val interface{}) error) error
- func (db PackedDocDB) Put(key string, data interface{})
type Person
- func CrawlPerson(httpClient doc.HttpClient, id string) (*Person, error)
type Segment
type Segments
type Size
- func (s Size) String() string
type TokenIndexer
- func NewTokenIndexer(root villa.Path, kind string) *TokenIndexer
- func (ti *TokenIndexer) Export(root villa.Path, kind string) error
- func (ti *TokenIndexer) IdsOfToken(token string) []string
- func (ti *TokenIndexer) LastModified() time.Time
- func (ti *TokenIndexer) Load() error
- func (ti *TokenIndexer) Modified() bool
- func (ti *TokenIndexer) Put(id string, tokens villa.StrSet)
- func (ti *TokenIndexer) Sync() error
- func (ti *TokenIndexer) TokensOfId(id string) []string

Constants ¶

View Source

const (
	KindIndex = "index"
	IndexFn   = KindIndex + ".gob"

	KindDocDB = "docdb"

	FnCrawlerDB = "crawler"
	KindPackage = "package"
	KindPerson  = "person"
	KindToCheck = "tocheck"

	FnToCrawl = "tocrawl"
	FnPackage = "package"
	FnPerson  = "person"
	// key: RawString, value: DocInfo
	FnDocs    = "docs"
	FnNewDocs = "newdocs"
)

View Source

const (
	NDA_UPDATE = iota
	NDA_STARS
	NDA_DEL
)

View Source

const (
	IndexTextField = "text"
	IndexNameField = "name"
	IndexPkgField  = "pkg"
)

View Source

const (
	DOCS_PARTS = 128
)

Variables ¶

View Source

var (
	ServerAddr = ":8080"
	ServerRoot = villa.Path("./server/")

	LoadTemplatePass = ""
	AutoLoadTemplate = false

	DataRoot      = villa.Path("./data/")
	CrawlerDBPath = DataRoot.Join(FnCrawlerDB)
	DocsDBPath    = DataRoot.Join(FnDocs)

	// producer: server, consumer: crawler
	ImportPath     villa.Path
	ImportSegments Segments

	// producer: crawler, consumer: indexer
	DBOutPath     villa.Path
	DBOutSegments Segments

	// producer: indexer, consumer: server.
	// The server never deletes index segments; the indexer clears updated segments.
	IndexPath     villa.Path
	IndexSegments Segments

	// crawler configuration
	CrawlByGodocApi   = true
	CrawlGithubUpdate = true
	CrawlerDuePerRun  = 1 * time.Hour

	/*
		Increase this to ignore etag of last versions to crawl and parse all
		packages.

		ChangeLog:
		    0    First version
		    1    Add TestImports/XTestImports to Imports
		    2    Parse markdown readme to text before selecting synopsis
			     from it
			3    Add exported tokens to indexes
			4    Move TestImports/XTestImports out of Imports, to TestImports
			5    Fix a bug in checking CrawlerVersion
	*/
	CrawlerVersion = 5
)

View Source

var (
	ErrPackageNotModifed = errors.New("package not modified")
	ErrInvalidPackage    = errors.New("invalid package")
)

Functions ¶

func AppendPackages ¶

func AppendPackages(pkgs []string) bool

AppendPackages appends a list of packages to the imports folder for the crawler backend to read.

func AppendTokens ¶

func AppendTokens(tokens villa.StrSet, text []byte) villa.StrSet

func AuthorOfPackage ¶

func AuthorOfPackage(pkg string) string

func CalcMatchScore ¶

func CalcMatchScore(doc *HitInfo, tokenList []string,
	textIdfs, nameIdfs []float64) float64

func CalcPackagePartition ¶

func CalcPackagePartition(pkg string, totalParts int) int

func CalcStaticScore ¶

func CalcStaticScore(doc *HitInfo) float64

func CalcTestStaticScore ¶

func CalcTestStaticScore(doc *HitInfo, realImported []string) float64

func CheckCamel ¶

func CheckCamel(last, current rune) index.RuneType

func CheckRuneType ¶

func CheckRuneType(last, current rune) index.RuneType

func ChooseImportantSentenses ¶

func ChooseImportantSentenses(text string, name, pkg string) []string

func ClearWatcherEvents ¶

func ClearWatcherEvents(watcher *fsnotify.Watcher)

func DumpMemStats ¶

func DumpMemStats()

func FetchAllPackagesInGodoc ¶

func FetchAllPackagesInGodoc(httpClient doc.HttpClient) ([]string, error)

FetchAllPackagesInGodoc fetches the list of all packages on godoc.org

func FullProjectOfPackage ¶

func FullProjectOfPackage(pkg string) string

func GenHttpClient ¶

func GenHttpClient(proxy string) doc.HttpClient

func GithubUpdates ¶

func GithubUpdates() (map[string]time.Time, error)

func HostOfPackage ¶

func HostOfPackage(pkg string) string

func IdOfPerson ¶

func IdOfPerson(site, username string) string

func Index ¶

func Index(docDB mr.Input) (*index.TokenSetSearcher, error)

func IsBadPackage ¶

func IsBadPackage(err error) bool

func LikeButton ¶

func LikeButton(httpClient doc.HttpClient, Url string) (int, error)

func NewDocInfo ¶

func NewDocInfo() sophie.Sophier

Returns a new instance of DocInfo as a sophie.Sophier

func NewNewDocAction ¶

func NewNewDocAction() sophie.Sophier

Returns a new instance of *NewDocAction as a Sophier

func NormWord ¶

func NormWord(word string) string

func ParsePersonId ¶

func ParsePersonId(id string) (site, username string)

func Plusone ¶

func Plusone(httpClient doc.HttpClient, url string) (int, error)

func ProjectOfPackage ¶

func ProjectOfPackage(pkg string) string

core project of a package

func ReadJsonFile ¶

func ReadJsonFile(fn villa.Path, data interface{}) error

func ReadPackages ¶

func ReadPackages(segm Segment) (pkgs []string, err error)

func ReadmeToText ¶

func ReadmeToText(fn, data string) string

func SegmentLess ¶

func SegmentLess(a, b Segment) bool

func SplitSentences ¶

func SplitSentences(text string) []string

func TrimPackageName ¶

func TrimPackageName(pkg string) string

func WaitForWatcherEvents ¶

func WaitForWatcherEvents(watcher *fsnotify.Watcher)

func WriteJsonFile ¶

func WriteJsonFile(fn villa.Path, data interface{}) error

Types ¶

type BlackRequest ¶

type BlackRequest struct {
	sync.RWMutex
	// contains filtered or unexported fields
}

func (*BlackRequest) Do ¶

func (br *BlackRequest) Do(req *http.Request) (*http.Response, error)

type CrawlerDB ¶

type CrawlerDB struct {
	PackageDB *MemDB
	PersonDB  *MemDB
}

CrawlerDB includes the databases of all crawler entries.

func LoadCrawlerDB ¶

func LoadCrawlerDB() *CrawlerDB

LoadCrawlerDB loads PackageDB and PersonDB and returns a new *CrawlerDB

func (*CrawlerDB) AppendPackage ¶

func (cdb *CrawlerDB) AppendPackage(pkg string,
	inDocs func(pkg string) bool)

AppendPackage appends a package. If the package does not exist in either PackageDB or Docs, it is scheduled (immediately).

func (*CrawlerDB) AppendPerson ¶

func (cdb *CrawlerDB) AppendPerson(site, username string) bool

AppendPerson appends a person to the PersonDB and, for a new person, schedules an immediate crawl.

func (*CrawlerDB) SchedulePackage ¶

func (cdb *CrawlerDB) SchedulePackage(pkg string, sTime time.Time,
	etag string) error

SchedulePackage schedules a package to be crawled at a specific time.

func (*CrawlerDB) SchedulePerson ¶

func (cdb *CrawlerDB) SchedulePerson(id string, sTime time.Time) error

SchedulePerson schedules a person to be crawled at a specific time.

func (*CrawlerDB) Sync ¶

func (cdb *CrawlerDB) Sync() error

Sync syncs both PackageDB and PersonDB. It returns an error if either sync fails.

type CrawlingEntry ¶

type CrawlingEntry struct {
	ScheduleTime time.Time
	// if gcse.CrawlerVersion is different from this value, etag is ignored
	Version int
	Etag    string
}

func (*CrawlingEntry) ReadFrom ¶

func (c *CrawlingEntry) ReadFrom(r sophie.Reader, l int) error

func (*CrawlingEntry) WriteTo ¶

func (c *CrawlingEntry) WriteTo(w sophie.Writer) error

type DocDB ¶

type DocDB interface {
	Sync() error
	Export(root villa.Path, kind string) error

	Get(key string, data interface{}) bool
	Put(key string, data interface{})
	Delete(key string)
	Iterate(output func(key string, val interface{}) error) error
}

type DocInfo ¶

type DocInfo struct {
	Name        string
	Package     string
	Author      string
	LastUpdated time.Time
	StarCount   int
	Synopsis    string
	Description string
	ProjectURL  string
	ReadmeFn    string
	ReadmeData  string
	Imports     []string
	TestImports []string
	Exported    []string // exported tokens(funcs/types)
}

DocInfo is the information stored in backend docDB

func (*DocInfo) ReadFrom ¶

func (d *DocInfo) ReadFrom(r sophie.Reader, l int) error

func (*DocInfo) WriteTo ¶

func (d *DocInfo) WriteTo(w sophie.Writer) error

type HitInfo ¶

type HitInfo struct {
	DocInfo

	Imported           []string
	TestImported       []string
	ImportantSentences []string

	AssignedStarCount float64
	StaticScore       float64
	TestStaticScore   float64
	StaticRank        int // zero-based
}

HitInfo is the information provided to frontend

type MemDB ¶

type MemDB struct {
	sync.RWMutex
	// contains filtered or unexported fields
}

func NewMemDB ¶

func NewMemDB(root villa.Path, kind string) *MemDB

func (*MemDB) Count ¶

func (mdb *MemDB) Count() int

Count returns the number of entries in the DB

func (*MemDB) Delete ¶

func (mdb *MemDB) Delete(key string)

func (*MemDB) Export ¶

func (mdb *MemDB) Export(root villa.Path, kind string) error

Export saves the data to the specified location without affecting the modified property.

func (*MemDB) Get ¶

func (mdb *MemDB) Get(key string, data interface{}) bool

Get fetches the entry for the specified key. data is a pointer. It returns false if the entry does not exist.

func (*MemDB) Iterate ¶

func (mdb *MemDB) Iterate(output func(key string, val interface{}) error) error

func (*MemDB) LastModified ¶

func (mdb *MemDB) LastModified() time.Time

func (*MemDB) Load ¶

func (mdb *MemDB) Load() error

func (*MemDB) Modified ¶

func (mdb *MemDB) Modified() bool

func (*MemDB) Put ¶

func (mdb *MemDB) Put(key string, data interface{})

func (*MemDB) Sync ¶

func (mdb *MemDB) Sync() error

type NewDocAction ¶

type NewDocAction struct {
	Action sophie.VInt
	DocInfo
}

If Action equals NDA_DEL, DocInfo is undefined.

func (*NewDocAction) ReadFrom ¶

func (nda *NewDocAction) ReadFrom(r sophie.Reader, l int) error

func (*NewDocAction) WriteTo ¶

func (nda *NewDocAction) WriteTo(w sophie.Writer) error

type Package ¶

type Package struct {
	Package     string
	Name        string
	Synopsis    string
	Doc         string
	ProjectURL  string
	StarCount   int
	ReadmeFn    string
	ReadmeData  string
	Imports     []string
	TestImports []string
	Exported    []string // exported tokens(funcs/types)

	References []string
	Etag       string
}

Package stores information from crawler

func CrawlPackage ¶

func CrawlPackage(httpClient doc.HttpClient, pkg string,
	etag string) (p *Package, err error)

type PackedDocDB ¶

type PackedDocDB struct {
	*MemDB
}

func (PackedDocDB) Get ¶

func (db PackedDocDB) Get(key string, data interface{}) bool

func (PackedDocDB) Iterate ¶

func (db PackedDocDB) Iterate(
	output func(key string, val interface{}) error) error

func (PackedDocDB) Put ¶

func (db PackedDocDB) Put(key string, data interface{})

type Person ¶

type Person struct {
	Id       string
	Packages []string
}

func CrawlPerson ¶

func CrawlPerson(httpClient doc.HttpClient, id string) (*Person, error)

type Segment ¶

type Segment interface {
	Name() string
	Join(name string) villa.Path
	IsDone() bool
	Done() error
	ListFiles() ([]villa.Path, error)
	Remove() error
}

type Segments ¶

type Segments interface {
	Watch(watcher *fsnotify.Watcher) error
	ListAll() ([]Segment, error)
	// all done
	ListDones() ([]Segment, error)
	// max done
	FindMaxDone() (Segment, error)
	// generates an arbitrary new segment
	GenNewSegment() (Segment, error)
	// generates a segment greater than all existing ones
	GenMaxSegment() (Segment, error)
	// clear
	ClearUndones() error
}

type Size ¶

type Size int64

func (Size) String ¶

func (s Size) String() string

type TokenIndexer ¶

type TokenIndexer struct {
	index.TokenIndexer

	sync.RWMutex
	// contains filtered or unexported fields
}

TokenIndexer is thread-safe.

func NewTokenIndexer ¶

func NewTokenIndexer(root villa.Path, kind string) *TokenIndexer

func (*TokenIndexer) Export ¶

func (ti *TokenIndexer) Export(root villa.Path, kind string) error

func (*TokenIndexer) IdsOfToken ¶

func (ti *TokenIndexer) IdsOfToken(token string) []string

func (*TokenIndexer) LastModified ¶

func (ti *TokenIndexer) LastModified() time.Time

func (*TokenIndexer) Load ¶

func (ti *TokenIndexer) Load() error

func (*TokenIndexer) Modified ¶

func (ti *TokenIndexer) Modified() bool

func (*TokenIndexer) Put ¶

func (ti *TokenIndexer) Put(id string, tokens villa.StrSet)

func (*TokenIndexer) Sync ¶

func (ti *TokenIndexer) Sync() error

func (*TokenIndexer) TokensOfId ¶

func (ti *TokenIndexer) TokensOfId(id string) []string

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
crawler GCSE Crawler background program.	GCSE Crawler background program.
exps
indexer
mergedocs
server GCSE HTTP server.	GCSE HTTP server.
tocrawl
tools

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL