metrics

package
v0.3.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 27, 2023 License: MIT Imports: 5 Imported by: 55

Documentation

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Hamming added in v0.2.2

type Hamming struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

Hamming represents the Hamming metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Hamming_distance.
Example
package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

// main demonstrates the Hamming metric, first with the default options and
// then with case-insensitive comparison enabled.
func main() {
	// Default options.
	h := metrics.NewHamming()

	sim := h.Compare("text", "test")
	fmt.Printf("(text, test) similarity: %.2f\n", sim)

	dist := h.Distance("text", "test")
	fmt.Printf("(text, test) distance: %d\n", dist)

	// Custom options.
	h.CaseSensitive = false

	sim = h.Compare("ONE", "once")
	fmt.Printf("(ONE, once) similarity: %.2f\n", sim)

	// Pass the same inputs as the printed label and the Compare call above,
	// so the example actually exercises the case-insensitive path.
	dist = h.Distance("ONE", "once")
	fmt.Printf("(ONE, once) distance: %d\n", dist)
}
Output:

(text, test) similarity: 0.75
(text, test) distance: 1
(ONE, once) similarity: 0.50
(ONE, once) distance: 2

func NewHamming added in v0.2.2

func NewHamming() *Hamming

NewHamming returns a new Hamming string metric.

Default options:

CaseSensitive: true

func (*Hamming) Compare added in v0.2.2

func (m *Hamming) Compare(a, b string) float64

Compare returns the Hamming similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

func (*Hamming) Distance added in v0.2.2

func (m *Hamming) Distance(a, b string) int

Distance returns the Hamming distance between a and b. Lower distances indicate closer matches. A distance of 0 means the strings are identical.

type Jaccard added in v0.2.0

type Jaccard struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// NgramSize represents the size (in characters) of the tokens generated
	// when comparing the input sequences.
	NgramSize int
}

Jaccard represents the Jaccard index for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Jaccard_index.
Example
package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	j := metrics.NewJaccard()
	sim := j.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

	// Custom options.
	j.CaseSensitive = false
	j.NgramSize = 3

	sim = j.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

}
Output:

(night, alright) similarity: 0.43
(night, alright) similarity: 0.33

func NewJaccard added in v0.2.0

func NewJaccard() *Jaccard

NewJaccard returns a new Jaccard string metric.

Default options:

CaseSensitive: true
NgramSize: 2

func (*Jaccard) Compare added in v0.2.0

func (m *Jaccard) Compare(a, b string) float64

Compare returns the Jaccard similarity coefficient of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.

type Jaro

type Jaro struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

Jaro represents the Jaro metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.
Example
package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	jaro := metrics.NewJaro()
	sim := jaro.Compare("sort", "shirt")
	fmt.Printf("(sort, shirt) similarity: %.2f\n", sim)

}
Output:

(sort, shirt) similarity: 0.78

func NewJaro

func NewJaro() *Jaro

NewJaro returns a new Jaro string metric.

Default options:

CaseSensitive: true

func (*Jaro) Compare

func (m *Jaro) Compare(a, b string) float64

Compare returns the Jaro similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

type JaroWinkler

type JaroWinkler struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

JaroWinkler represents the Jaro-Winkler metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.
Example
package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	jw := metrics.NewJaroWinkler()
	sim := jw.Compare("sort", "shirt")
	fmt.Printf("(sort, shirt) similarity: %.2f\n", sim)

}
Output:

(sort, shirt) similarity: 0.80

func NewJaroWinkler

func NewJaroWinkler() *JaroWinkler

NewJaroWinkler returns a new Jaro-Winkler string metric.

Default options:

CaseSensitive: true

func (*JaroWinkler) Compare

func (m *JaroWinkler) Compare(a, b string) float64

Compare returns the Jaro-Winkler similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

type Levenshtein

type Levenshtein struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// InsertCost represents the Levenshtein cost of a character insertion.
	InsertCost int

	// DeleteCost represents the Levenshtein cost of a character deletion.
	DeleteCost int

	// ReplaceCost represents the Levenshtein cost of a character substitution.
	ReplaceCost int
}

Levenshtein represents the Levenshtein metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Levenshtein_distance.
Example
package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	lev := metrics.NewLevenshtein()

	sim := lev.Compare("book", "brick")
	fmt.Printf("(book, brick) similarity: %.2f\n", sim)

	dist := lev.Distance("book", "brick")
	fmt.Printf("(book, brick) distance: %d\n", dist)

	// Custom options.
	lev.CaseSensitive = false
	lev.ReplaceCost = 2

	sim = lev.Compare("HELLO", "jello")
	fmt.Printf("(HELLO, jello) similarity: %.2f\n", sim)

	dist = lev.Distance("HELLO", "jello")
	fmt.Printf("(HELLO, jello) distance: %d\n", dist)

}
Output:

(book, brick) similarity: 0.40
(book, brick) distance: 3
(HELLO, jello) similarity: 0.60
(HELLO, jello) distance: 2

func NewLevenshtein

func NewLevenshtein() *Levenshtein

NewLevenshtein returns a new Levenshtein string metric.

Default options:

CaseSensitive: true
InsertCost: 1
DeleteCost: 1
ReplaceCost: 1

func (*Levenshtein) Compare

func (m *Levenshtein) Compare(a, b string) float64

Compare returns the Levenshtein similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

func (*Levenshtein) Distance

func (m *Levenshtein) Distance(a, b string) int

Distance returns the Levenshtein distance between a and b. Lower distances indicate closer matches. A distance of 0 means the strings are identical.

type MatchMismatch

type MatchMismatch struct {
	// Match represents the score of equal character substitutions.
	Match float64

	// Mismatch represents the score of unequal character substitutions.
	Mismatch float64
}

MatchMismatch represents a substitution function which returns the match or mismatch value depending on the equality of the compared characters. The match value must be greater than the mismatch value.

func (MatchMismatch) Compare

func (m MatchMismatch) Compare(a []rune, idxA int, b []rune, idxB int) float64

Compare returns the match value if a[idxA] is equal to b[idxB] or the mismatch value otherwise.

func (MatchMismatch) Max

func (m MatchMismatch) Max() float64

Max returns the match value.

func (MatchMismatch) Min

func (m MatchMismatch) Min() float64

Min returns the mismatch value.

type OverlapCoefficient added in v0.2.0

type OverlapCoefficient struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// NgramSize represents the size (in characters) of the tokens generated
	// when comparing the input sequences.
	NgramSize int
}

OverlapCoefficient represents the overlap coefficient for measuring the similarity between sequences. The metric is also known as the Szymkiewicz-Simpson coefficient.

For more information see https://en.wikipedia.org/wiki/Overlap_coefficient.
Example
package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	oc := metrics.NewOverlapCoefficient()
	sim := oc.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

	// Subset comparison.
	sim = oc.Compare("aa", "aaaa")
	fmt.Printf("(aa, aaaa) similarity: %.2f\n", sim)

	// Custom options.
	oc.CaseSensitive = false
	oc.NgramSize = 3

	sim = oc.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

}
Output:

(night, alright) similarity: 0.75
(aa, aaaa) similarity: 1.00
(night, alright) similarity: 0.67

func NewOverlapCoefficient added in v0.2.0

func NewOverlapCoefficient() *OverlapCoefficient

NewOverlapCoefficient returns a new overlap coefficient string metric.

Default options:

CaseSensitive: true
NgramSize: 2

func (*OverlapCoefficient) Compare added in v0.2.0

func (m *OverlapCoefficient) Compare(a, b string) float64

Compare returns the overlap similarity coefficient of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.

type SmithWatermanGotoh

type SmithWatermanGotoh struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// GapPenalty defines a score penalty for character insertions or deletions.
	// For relevant results, the gap penalty should be a non-positive number.
	GapPenalty float64

	// Substitution represents a substitution function which is used to
	// calculate a score for character substitutions.
	Substitution Substitution
}

SmithWatermanGotoh represents the Smith-Waterman-Gotoh metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Smith-Waterman_algorithm.
Example
package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	swg := metrics.NewSmithWatermanGotoh()

	sim := swg.Compare("a pink kitten", "a kitten")
	fmt.Printf("(a pink kitten, a kitten) similarity: %.2f\n", sim)

	// Custom options.
	swg.CaseSensitive = false
	swg.GapPenalty = -0.1
	swg.Substitution = metrics.MatchMismatch{
		Match:    1,
		Mismatch: -0.5,
	}

	sim = swg.Compare("a pink kitten", "A KITTEN")
	fmt.Printf("(a pink kitten, A KITTEN) similarity: %.2f\n", sim)

}
Output:

(a pink kitten, a kitten) similarity: 0.88
(a pink kitten, A KITTEN) similarity: 0.94

func NewSmithWatermanGotoh

func NewSmithWatermanGotoh() *SmithWatermanGotoh

NewSmithWatermanGotoh returns a new Smith-Waterman-Gotoh string metric.

Default options:

CaseSensitive: true
GapPenalty: -0.5
Substitution: MatchMismatch{
	Match:    1,
	Mismatch: -2,
},

func (*SmithWatermanGotoh) Compare

func (m *SmithWatermanGotoh) Compare(a, b string) float64

Compare returns the Smith-Waterman-Gotoh similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

type SorensenDice

type SorensenDice struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// NgramSize represents the size (in characters) of the tokens generated
	// when comparing the input sequences.
	NgramSize int
}

SorensenDice represents the Sorensen-Dice metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Sorensen-Dice_coefficient.
Example
package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	sd := metrics.NewSorensenDice()
	sim := sd.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

	// Custom options.
	sd.CaseSensitive = false
	sd.NgramSize = 3

	sim = sd.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

}
Output:

(night, alright) similarity: 0.60
(night, alright) similarity: 0.50

func NewSorensenDice

func NewSorensenDice() *SorensenDice

NewSorensenDice returns a new Sorensen-Dice string metric.

Default options:

CaseSensitive: true
NgramSize: 2

func (*SorensenDice) Compare

func (m *SorensenDice) Compare(a, b string) float64

Compare returns the Sorensen-Dice similarity coefficient of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.

type Substitution

type Substitution interface {
	// Compare returns the substitution score of characters a[idxA] and b[idxB].
	Compare(a []rune, idxA int, b []rune, idxB int) float64

	// Max returns the maximum score of a character substitution operation.
	Max() float64

	// Min returns the minimum score of a character substitution operation.
	Min() float64
}

Substitution represents a substitution function which is used to calculate a score for character substitutions.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL