metrics

package

Version: v0.3.1 (latest) | Published: Sep 27, 2023 | License: MIT | Imports: 5 | Imported by: 55

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/adrg/strutil

Links

Open Source Insights

Documentation ¶

Index ¶

type Hamming
- func NewHamming() *Hamming
- func (m *Hamming) Compare(a, b string) float64
- func (m *Hamming) Distance(a, b string) int
type Jaccard
- func NewJaccard() *Jaccard
- func (m *Jaccard) Compare(a, b string) float64
type Jaro
- func NewJaro() *Jaro
- func (m *Jaro) Compare(a, b string) float64
type JaroWinkler
- func NewJaroWinkler() *JaroWinkler
- func (m *JaroWinkler) Compare(a, b string) float64
type Levenshtein
- func NewLevenshtein() *Levenshtein
- func (m *Levenshtein) Compare(a, b string) float64
- func (m *Levenshtein) Distance(a, b string) int
type MatchMismatch
- func (m MatchMismatch) Compare(a []rune, idxA int, b []rune, idxB int) float64
- func (m MatchMismatch) Max() float64
- func (m MatchMismatch) Min() float64
type OverlapCoefficient
- func NewOverlapCoefficient() *OverlapCoefficient
- func (m *OverlapCoefficient) Compare(a, b string) float64
type SmithWatermanGotoh
- func NewSmithWatermanGotoh() *SmithWatermanGotoh
- func (m *SmithWatermanGotoh) Compare(a, b string) float64
type SorensenDice
- func NewSorensenDice() *SorensenDice
- func (m *SorensenDice) Compare(a, b string) float64
type Substitution

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type Hamming ¶ added in v0.2.2

type Hamming struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

Hamming represents the Hamming metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Hamming_distance.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	h := metrics.NewHamming()

	sim := h.Compare("text", "test")
	fmt.Printf("(text, test) similarity: %.2f\n", sim)

	dist := h.Distance("text", "test")
	fmt.Printf("(text, test) distance: %d\n", dist)

	// Custom options.
	h.CaseSensitive = false

	sim = h.Compare("ONE", "once")
	fmt.Printf("(ONE, once) similarity: %.2f\n", sim)

	dist = h.Distance("one", "once")
	fmt.Printf("(ONE, once) distance: %d\n", dist)

}

Output:

(text, test) similarity: 0.75
(text, test) distance: 1
(ONE, once) similarity: 0.50
(ONE, once) distance: 2

func NewHamming ¶ added in v0.2.2

func NewHamming() *Hamming

NewHamming returns a new Hamming string metric.

Default options:

CaseSensitive: true

func (*Hamming) Compare ¶ added in v0.2.2

func (m *Hamming) Compare(a, b string) float64

Compare returns the Hamming similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

func (*Hamming) Distance ¶ added in v0.2.2

func (m *Hamming) Distance(a, b string) int

Distance returns the Hamming distance between a and b. Lower distances indicate closer matches. A distance of 0 means the strings are identical.

type Jaccard ¶ added in v0.2.0

type Jaccard struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// NgramSize represents the size (in characters) of the tokens generated
	// when comparing the input sequences.
	NgramSize int
}

Jaccard represents the Jaccard index for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Jaccard_index.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	j := metrics.NewJaccard()
	sim := j.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

	// Custom options.
	j.CaseSensitive = false
	j.NgramSize = 3

	sim = j.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

}

Output:

(night, alright) similarity: 0.43
(night, alright) similarity: 0.33

func NewJaccard ¶ added in v0.2.0

func NewJaccard() *Jaccard

NewJaccard returns a new Jaccard string metric.

Default options:

CaseSensitive: true
NgramSize: 2

func (*Jaccard) Compare ¶ added in v0.2.0

func (m *Jaccard) Compare(a, b string) float64

Compare returns the Jaccard similarity coefficient of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.

type Jaro ¶

type Jaro struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

Jaro represents the Jaro metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	jaro := metrics.NewJaro()
	sim := jaro.Compare("sort", "shirt")
	fmt.Printf("(sort, shirt) similarity: %.2f\n", sim)

}

Output:

(sort, shirt) similarity: 0.78

func NewJaro ¶

func NewJaro() *Jaro

NewJaro returns a new Jaro string metric.

Default options:

CaseSensitive: true

func (*Jaro) Compare ¶

func (m *Jaro) Compare(a, b string) float64

Compare returns the Jaro similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

type JaroWinkler ¶

type JaroWinkler struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

JaroWinkler represents the Jaro-Winkler metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	jw := metrics.NewJaroWinkler()
	sim := jw.Compare("sort", "shirt")
	fmt.Printf("(sort, shirt) similarity: %.2f\n", sim)

}

Output:

(sort, shirt) similarity: 0.80

func NewJaroWinkler ¶

func NewJaroWinkler() *JaroWinkler

NewJaroWinkler returns a new Jaro-Winkler string metric.

Default options:

CaseSensitive: true

func (*JaroWinkler) Compare ¶

func (m *JaroWinkler) Compare(a, b string) float64

Compare returns the Jaro-Winkler similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

type Levenshtein ¶

type Levenshtein struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// InsertCost represents the Levenshtein cost of a character insertion.
	InsertCost int

	// DeleteCost represents the Levenshtein cost of a character deletion.
	DeleteCost int

	// ReplaceCost represents the Levenshtein cost of a character substitution.
	ReplaceCost int
}

Levenshtein represents the Levenshtein metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Levenshtein_distance.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	lev := metrics.NewLevenshtein()

	sim := lev.Compare("book", "brick")
	fmt.Printf("(book, brick) similarity: %.2f\n", sim)

	dist := lev.Distance("book", "brick")
	fmt.Printf("(book, brick) distance: %d\n", dist)

	// Custom options.
	lev.CaseSensitive = false
	lev.ReplaceCost = 2

	sim = lev.Compare("HELLO", "jello")
	fmt.Printf("(HELLO, jello) similarity: %.2f\n", sim)

	dist = lev.Distance("HELLO", "jello")
	fmt.Printf("(HELLO, jello) distance: %d\n", dist)

}

Output:

(book, brick) similarity: 0.40
(book, brick) distance: 3
(HELLO, jello) similarity: 0.60
(HELLO, jello) distance: 2

func NewLevenshtein ¶

func NewLevenshtein() *Levenshtein

NewLevenshtein returns a new Levenshtein string metric.

Default options:

CaseSensitive: true
InsertCost: 1
DeleteCost: 1
ReplaceCost: 1

func (*Levenshtein) Compare ¶

func (m *Levenshtein) Compare(a, b string) float64

Compare returns the Levenshtein similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

func (*Levenshtein) Distance ¶

func (m *Levenshtein) Distance(a, b string) int

Distance returns the Levenshtein distance between a and b. Lower distances indicate closer matches. A distance of 0 means the strings are identical.

type MatchMismatch ¶

type MatchMismatch struct {
	// Match represents the score of equal character substitutions.
	Match float64

	// Mismatch represents the score of unequal character substitutions.
	Mismatch float64
}

MatchMismatch represents a substitution function which returns the match or mismatch value depending on the equality of the compared characters. The match value must be greater than the mismatch value.

func (MatchMismatch) Compare ¶

func (m MatchMismatch) Compare(a []rune, idxA int, b []rune, idxB int) float64

Compare returns the match value if a[idxA] is equal to b[idxB] or the mismatch value otherwise.

func (MatchMismatch) Max ¶

func (m MatchMismatch) Max() float64

Max returns the match value.

func (MatchMismatch) Min ¶

func (m MatchMismatch) Min() float64

Min returns the mismatch value.

type OverlapCoefficient ¶ added in v0.2.0

type OverlapCoefficient struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// NgramSize represents the size (in characters) of the tokens generated
	// when comparing the input sequences.
	NgramSize int
}

OverlapCoefficient represents the overlap coefficient for measuring the similarity between sequences. The metric is also known as the Szymkiewicz-Simpson coefficient.

For more information see https://en.wikipedia.org/wiki/Overlap_coefficient.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	oc := metrics.NewOverlapCoefficient()
	sim := oc.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

	// Subset comparison.
	sim = oc.Compare("aa", "aaaa")
	fmt.Printf("(aa, aaaa) similarity: %.2f\n", sim)

	// Custom options.
	oc.CaseSensitive = false
	oc.NgramSize = 3

	sim = oc.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

}

Output:

(night, alright) similarity: 0.75
(aa, aaaa) similarity: 1.00
(night, alright) similarity: 0.67

func NewOverlapCoefficient ¶ added in v0.2.0

func NewOverlapCoefficient() *OverlapCoefficient

NewOverlapCoefficient returns a new overlap coefficient string metric.

Default options:

CaseSensitive: true
NgramSize: 2

func (*OverlapCoefficient) Compare ¶ added in v0.2.0

func (m *OverlapCoefficient) Compare(a, b string) float64

Compare returns the overlap coefficient of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.

type SmithWatermanGotoh ¶

type SmithWatermanGotoh struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// GapPenalty defines a score penalty for character insertions or deletions.
	// For relevant results, the gap penalty should be a non-positive number.
	GapPenalty float64

	// Substitution represents a substitution function which is used to
	// calculate a score for character substitutions.
	Substitution Substitution
}

SmithWatermanGotoh represents the Smith-Waterman-Gotoh metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Smith-Waterman_algorithm.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	swg := metrics.NewSmithWatermanGotoh()

	sim := swg.Compare("a pink kitten", "a kitten")
	fmt.Printf("(a pink kitten, a kitten) similarity: %.2f\n", sim)

	// Custom options.
	swg.CaseSensitive = false
	swg.GapPenalty = -0.1
	swg.Substitution = metrics.MatchMismatch{
		Match:    1,
		Mismatch: -0.5,
	}

	sim = swg.Compare("a pink kitten", "A KITTEN")
	fmt.Printf("(a pink kitten, A KITTEN) similarity: %.2f\n", sim)

}

Output:

(a pink kitten, a kitten) similarity: 0.88
(a pink kitten, A KITTEN) similarity: 0.94

func NewSmithWatermanGotoh ¶

func NewSmithWatermanGotoh() *SmithWatermanGotoh

NewSmithWatermanGotoh returns a new Smith-Waterman-Gotoh string metric.

Default options:

CaseSensitive: true
GapPenalty: -0.5
Substitution: MatchMismatch{
	Match:    1,
	Mismatch: -2,
},

func (*SmithWatermanGotoh) Compare ¶

func (m *SmithWatermanGotoh) Compare(a, b string) float64

Compare returns the Smith-Waterman-Gotoh similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

type SorensenDice ¶

type SorensenDice struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// NgramSize represents the size (in characters) of the tokens generated
	// when comparing the input sequences.
	NgramSize int
}

SorensenDice represents the Sorensen-Dice metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Sorensen-Dice_coefficient.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	sd := metrics.NewSorensenDice()
	sim := sd.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

	// Custom options.
	sd.CaseSensitive = false
	sd.NgramSize = 3

	sim = sd.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

}

Output:

(night, alright) similarity: 0.60
(night, alright) similarity: 0.50

func NewSorensenDice ¶

func NewSorensenDice() *SorensenDice

NewSorensenDice returns a new Sorensen-Dice string metric.

Default options:

CaseSensitive: true
NgramSize: 2

func (*SorensenDice) Compare ¶

func (m *SorensenDice) Compare(a, b string) float64

Compare returns the Sorensen-Dice similarity coefficient of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.

type Substitution ¶

type Substitution interface {
	// Compare returns the substitution score of characters a[idxA] and b[idxB].
	Compare(a []rune, idxA int, b []rune, idxB int) float64

	// Max returns the maximum score of a character substitution operation.
	Max() float64

	// Min returns the minimum score of a character substitution operation.
	Min() float64
}

Substitution represents a substitution function which is used to calculate a score for character substitutions.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL