tokenize

package
v0.0.0-...-6616cf9
Published: Sep 10, 2020 License: MIT Imports: 2 Imported by: 2

README

tokenize

Simple tokenizer chain

Example:

package main

import (
  "fmt"

  t "github.com/rekki/go-query-analyze/tokenize"
)

func main() {
  tokenizers := []t.Tokenizer{
    t.NewWhitespace(),
    t.NewLeftEdge(1),
    t.NewUnique(),
  }
  tokens := t.Tokenize("hello world", tokenizers...)

  fmt.Printf("%v", tokens) // [h he hel hell hello w wo wor worl world]
}

Documentation

Overview

Package tokenize provides the means to create a tokenizer chain.

Index

Constants

const BASE_SOUNDEX = "0000"

Variables

This section is empty.

Functions

func EncodeSoundex

func EncodeSoundex(word string) string
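A minimal usage sketch for EncodeSoundex; the exact codes it produces are implementation-specific (the BASE_SOUNDEX constant above suggests four-character codes), so no particular output is asserted below.

package main

import (
  "fmt"

  t "github.com/rekki/go-query-analyze/tokenize"
)

func main() {
  // Similar-sounding words should map to the same code; the code format
  // is up to this implementation, so it is only printed here.
  for _, w := range []string{"robert", "rupert"} {
    fmt.Println(w, "->", t.EncodeSoundex(w))
  }
}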

func Tokenize

func Tokenize(s string, tokenizers ...Tokenizer) []string

Types

type CharNgram

type CharNgram struct {
	// contains filtered or unexported fields
}

func NewCharNgram

func NewCharNgram(size int) *CharNgram

func (*CharNgram) Apply

func (w *CharNgram) Apply(current []Token) []Token
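A usage sketch, assuming CharNgram slides a fixed-size window over each token's characters; the commented output reflects conventional character n-gram behaviour, not a guaranteed result of this implementation.

package main

import (
  "fmt"

  t "github.com/rekki/go-query-analyze/tokenize"
)

func main() {
  tokens := t.Tokenize("hello", t.NewWhitespace(), t.NewCharNgram(3))
  // Conventional character 3-grams of "hello" would be: [hel ell llo]
  fmt.Println(tokens)
}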

type Custom

type Custom struct {
	// contains filtered or unexported fields
}

func NewCustom

func NewCustom(f func([]Token) []Token) *Custom

func (*Custom) Apply

func (l *Custom) Apply(s []Token) []Token
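As a sketch of how Custom slots into a chain, the callback below lowercases every token; the lowercasing step itself is an illustration, not something the package ships.

package main

import (
  "fmt"
  "strings"

  t "github.com/rekki/go-query-analyze/tokenize"
)

func main() {
  lower := t.NewCustom(func(in []t.Token) []t.Token {
    out := make([]t.Token, len(in))
    for i, tok := range in {
      tok.Text = strings.ToLower(tok.Text) // tok is a copy, safe to mutate
      out[i] = tok
    }
    return out
  })

  tokens := t.Tokenize("Hello World", t.NewWhitespace(), lower)
  fmt.Println(tokens) // [hello world]
}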

type LeftEdge

type LeftEdge struct {
	// contains filtered or unexported fields
}

func NewLeftEdge

func NewLeftEdge(n int) *LeftEdge

func (*LeftEdge) Apply

func (e *LeftEdge) Apply(current []Token) []Token

type Noop

type Noop struct{}

func NewNoop

func NewNoop() *Noop

func (*Noop) Apply

func (w *Noop) Apply(current []Token) []Token

type Shingles

type Shingles struct {
	// contains filtered or unexported fields
}

Shingles tokenizer (n-gram for words)

func NewShingles

func NewShingles(size int) *Shingles

NewShingles creates a new Shingles struct.

func (*Shingles) Apply

func (shingles *Shingles) Apply(current []Token) []Token

Apply applies the semi-shingles tokenizer. It creates permutations: "new","york","city" -> "new","newyork","york","yorkcity". This is very handy because when people search they sometimes just don't put a space between words.
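A sketch of the behaviour described above, assuming a shingle size of 2; the commented output simply restates the documented transformation.

package main

import (
  "fmt"

  t "github.com/rekki/go-query-analyze/tokenize"
)

func main() {
  tokens := t.Tokenize("new york city", t.NewWhitespace(), t.NewShingles(2))
  // Per the documentation: "new","york","city" -> "new","newyork","york","yorkcity"
  fmt.Println(tokens)
}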

type Soundex

type Soundex struct {
}

func NewSoundex

func NewSoundex() *Soundex

func (*Soundex) Apply

func (w *Soundex) Apply(current []Token) []Token

type Surround

type Surround struct {
	// contains filtered or unexported fields
}

NewSurround("$").Apply([]string{"h","he","hel"}) -> []string{"$h","he","hel$"}

func NewSurround

func NewSurround(s string) *Surround

func (*Surround) Apply

func (w *Surround) Apply(current []Token) []Token
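A sketch combining Surround with the left-edge n-grams from the README example; the commented output follows the doc comment above, where the first token gains a leading marker and the last a trailing one.

package main

import (
  "fmt"

  t "github.com/rekki/go-query-analyze/tokenize"
)

func main() {
  tokens := t.Tokenize("hel", t.NewWhitespace(), t.NewLeftEdge(1), t.NewSurround("$"))
  // Following the doc comment: [$h he hel$]
  fmt.Println(tokens)
}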

type Token

type Token struct {
	Text     string
	Position int
	LineNo   int
}

func TokenizeT

func TokenizeT(s string, tokenizers ...Tokenizer) []Token

func (Token) Clone

func (t Token) Clone(s string) Token
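Where Tokenize returns plain strings, TokenizeT keeps the Token values; a small sketch that just prints the exported fields, whose exact semantics depend on the tokenizers in the chain.

package main

import (
  "fmt"

  t "github.com/rekki/go-query-analyze/tokenize"
)

func main() {
  for _, tok := range t.TokenizeT("hello world", t.NewWhitespace()) {
    fmt.Printf("%q position=%d line=%d\n", tok.Text, tok.Position, tok.LineNo)
  }
}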

type Tokenizer

type Tokenizer interface {
	Apply([]Token) []Token
}
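Anything with an Apply([]Token) []Token method can join the chain. Below is a hypothetical tokenizer (not part of the package) that drops tokens shorter than a minimum length, shown only to illustrate implementing the interface.

package main

import (
  "fmt"

  t "github.com/rekki/go-query-analyze/tokenize"
)

// MinLength is a hypothetical tokenizer that discards tokens shorter than N runes.
type MinLength struct{ N int }

func (m MinLength) Apply(current []t.Token) []t.Token {
  out := current[:0]
  for _, tok := range current {
    if len([]rune(tok.Text)) >= m.N {
      out = append(out, tok)
    }
  }
  return out
}

func main() {
  tokens := t.Tokenize("a be sea", t.NewWhitespace(), MinLength{N: 2})
  fmt.Println(tokens) // [be sea]
}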

type Unique

type Unique struct {
}

func NewUnique

func NewUnique() *Unique

func (*Unique) Apply

func (w *Unique) Apply(current []Token) []Token

type Whitespace

type Whitespace struct{}

func NewWhitespace

func NewWhitespace() *Whitespace

func (*Whitespace) Apply

func (w *Whitespace) Apply(current []Token) []Token
