Documentation
¶
Index ¶
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Hamming ¶ added in v0.2.2
// Hamming represents the Hamming metric. Its single option controls whether
// letter case is taken into account when comparing strings.
type Hamming struct {
// CaseSensitive specifies if the string comparison is case sensitive.
// NewHamming sets it to true.
CaseSensitive bool
}
Hamming represents the Hamming metric for measuring the similarity between sequences.
For more information see https://en.wikipedia.org/wiki/Hamming_distance.
Example ¶
package main
import (
"fmt"
"github.com/adrg/strutil/metrics"
)
// main demonstrates the Hamming metric, first with the default options and
// then with case-insensitive comparison.
func main() {
	// Default options.
	h := metrics.NewHamming()

	sim := h.Compare("text", "test")
	fmt.Printf("(text, test) similarity: %.2f\n", sim)

	dist := h.Distance("text", "test")
	fmt.Printf("(text, test) distance: %d\n", dist)

	// Custom options: ignore letter case when comparing.
	h.CaseSensitive = false

	sim = h.Compare("ONE", "once")
	fmt.Printf("(ONE, once) similarity: %.2f\n", sim)

	// Use the same inputs the printed label names, so the example actually
	// shows the distance of a mixed-case pair.
	dist = h.Distance("ONE", "once")
	fmt.Printf("(ONE, once) distance: %d\n", dist)
}
Output:
(text, test) similarity: 0.75
(text, test) distance: 1
(ONE, once) similarity: 0.50
(ONE, once) distance: 2
func NewHamming ¶ added in v0.2.2
func NewHamming() *Hamming
NewHamming returns a new Hamming string metric.
Default options:
CaseSensitive: true
type Jaccard ¶ added in v0.2.0
// Jaccard represents the Jaccard index. Its options control case handling
// and the size of the tokens the inputs are split into.
type Jaccard struct {
// CaseSensitive specifies if the string comparison is case sensitive.
// NewJaccard sets it to true.
CaseSensitive bool
// NgramSize represents the size (in characters) of the tokens generated
// when comparing the input sequences. NewJaccard sets it to 2.
NgramSize int
}
Jaccard represents the Jaccard index for measuring the similarity between sequences.
For more information see https://en.wikipedia.org/wiki/Jaccard_index.
Example ¶
package main
import (
"fmt"
"github.com/adrg/strutil/metrics"
)
// main prints the Jaccard similarity of two words, first with the default
// options and then with case-insensitive matching over 3-character tokens.
func main() {
	const first, second = "night", "alright"

	// Start from the defaults supplied by the constructor.
	jaccard := metrics.NewJaccard()
	fmt.Printf("(night, alright) similarity: %.2f\n", jaccard.Compare(first, second))

	// Switch to case-insensitive comparison with a larger token size.
	jaccard.CaseSensitive = false
	jaccard.NgramSize = 3
	fmt.Printf("(night, alright) similarity: %.2f\n", jaccard.Compare(first, second))
}
Output:
(night, alright) similarity: 0.43
(night, alright) similarity: 0.33
func NewJaccard ¶ added in v0.2.0
func NewJaccard() *Jaccard
NewJaccard returns a new Jaccard string metric.
Default options:
CaseSensitive: true
NgramSize: 2
type Jaro ¶
// Jaro represents the Jaro metric. Its single option controls whether letter
// case is taken into account when comparing strings.
type Jaro struct {
// CaseSensitive specifies if the string comparison is case sensitive.
CaseSensitive bool
}
Jaro represents the Jaro metric for measuring the similarity between sequences.
For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.
Example ¶
package main
import (
"fmt"
"github.com/adrg/strutil/metrics"
)
// main prints the Jaro similarity of two sample words.
func main() {
	metric := metrics.NewJaro()
	fmt.Printf("(sort, shirt) similarity: %.2f\n", metric.Compare("sort", "shirt"))
}
Output: (sort, shirt) similarity: 0.78
type JaroWinkler ¶
// JaroWinkler represents the Jaro-Winkler metric. Its single option controls
// whether letter case is taken into account when comparing strings.
type JaroWinkler struct {
// CaseSensitive specifies if the string comparison is case sensitive.
// NewJaroWinkler sets it to true.
CaseSensitive bool
}
JaroWinkler represents the Jaro-Winkler metric for measuring the similarity between sequences.
For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.
Example ¶
package main
import (
"fmt"
"github.com/adrg/strutil/metrics"
)
// main prints the Jaro-Winkler similarity of two sample words.
func main() {
	const first, second = "sort", "shirt"

	metric := metrics.NewJaroWinkler()
	score := metric.Compare(first, second)
	fmt.Printf("(sort, shirt) similarity: %.2f\n", score)
}
Output: (sort, shirt) similarity: 0.80
func NewJaroWinkler ¶
func NewJaroWinkler() *JaroWinkler
NewJaroWinkler returns a new Jaro-Winkler string metric.
Default options:
CaseSensitive: true
func (*JaroWinkler) Compare ¶
func (m *JaroWinkler) Compare(a, b string) float64
Compare returns the Jaro-Winkler similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.
type Levenshtein ¶
// Levenshtein represents the Levenshtein metric. Each edit operation has its
// own configurable cost; NewLevenshtein sets all three costs to 1.
type Levenshtein struct {
// CaseSensitive specifies if the string comparison is case sensitive.
// NewLevenshtein sets it to true.
CaseSensitive bool
// InsertCost represents the Levenshtein cost of a character insertion.
InsertCost int
// DeleteCost represents the Levenshtein cost of a character deletion.
DeleteCost int
// ReplaceCost represents the Levenshtein cost of a character substitution.
ReplaceCost int
}
Levenshtein represents the Levenshtein metric for measuring the similarity between sequences.
For more information see https://en.wikipedia.org/wiki/Levenshtein_distance.
Example ¶
package main
import (
"fmt"
"github.com/adrg/strutil/metrics"
)
// main prints the Levenshtein similarity and distance of two word pairs:
// one with the default options, one with case-insensitive comparison and a
// higher substitution cost.
func main() {
	metric := metrics.NewLevenshtein()

	// Defaults as supplied by the constructor.
	fmt.Printf("(book, brick) similarity: %.2f\n", metric.Compare("book", "brick"))
	fmt.Printf("(book, brick) distance: %d\n", metric.Distance("book", "brick"))

	// Ignore letter case and make substitutions cost twice as much.
	metric.CaseSensitive = false
	metric.ReplaceCost = 2

	const upper, lower = "HELLO", "jello"
	fmt.Printf("(HELLO, jello) similarity: %.2f\n", metric.Compare(upper, lower))
	fmt.Printf("(HELLO, jello) distance: %d\n", metric.Distance(upper, lower))
}
Output:
(book, brick) similarity: 0.40
(book, brick) distance: 3
(HELLO, jello) similarity: 0.60
(HELLO, jello) distance: 2
func NewLevenshtein ¶
func NewLevenshtein() *Levenshtein
NewLevenshtein returns a new Levenshtein string metric.
Default options:
CaseSensitive: true
InsertCost: 1
DeleteCost: 1
ReplaceCost: 1
func (*Levenshtein) Compare ¶
func (m *Levenshtein) Compare(a, b string) float64
Compare returns the Levenshtein similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.
func (*Levenshtein) Distance ¶
func (m *Levenshtein) Distance(a, b string) int
Distance returns the Levenshtein distance between a and b. Lower distances indicate closer matches. A distance of 0 means the strings are identical.
type MatchMismatch ¶
// MatchMismatch represents a substitution function which scores a pair of
// characters with Match when they are equal and with Mismatch otherwise.
type MatchMismatch struct {
// Match represents the score of equal character substitutions.
// It must be greater than Mismatch.
Match float64
// Mismatch represents the score of unequal character substitutions.
Mismatch float64
}
MatchMismatch represents a substitution function which returns the match or mismatch value depending on the equality of the compared characters. The match value must be greater than the mismatch value.
type OverlapCoefficient ¶ added in v0.2.0
// OverlapCoefficient represents the overlap coefficient. Its options control
// case handling and the size of the tokens the inputs are split into.
type OverlapCoefficient struct {
// CaseSensitive specifies if the string comparison is case sensitive.
// NewOverlapCoefficient sets it to true.
CaseSensitive bool
// NgramSize represents the size (in characters) of the tokens generated
// when comparing the input sequences. NewOverlapCoefficient sets it to 2,
// and Compare uses a size of 2 if the value is less than or equal to 0.
NgramSize int
}
OverlapCoefficient represents the overlap coefficient for measuring the similarity between sequences. The metric is also known as the Szymkiewicz-Simpson coefficient.
For more information see https://en.wikipedia.org/wiki/Overlap_coefficient.
Example ¶
package main
import (
"fmt"
"github.com/adrg/strutil/metrics"
)
// main prints overlap coefficient similarities for a pair of words, for a
// string compared against a longer string made of the same repeated
// character, and for the first pair again with custom options.
func main() {
	const first, second = "night", "alright"

	// Defaults as supplied by the constructor.
	overlap := metrics.NewOverlapCoefficient()
	fmt.Printf("(night, alright) similarity: %.2f\n", overlap.Compare(first, second))

	// One input is a subset of the other.
	fmt.Printf("(aa, aaaa) similarity: %.2f\n", overlap.Compare("aa", "aaaa"))

	// Ignore letter case and use 3-character tokens.
	overlap.CaseSensitive = false
	overlap.NgramSize = 3
	fmt.Printf("(night, alright) similarity: %.2f\n", overlap.Compare(first, second))
}
Output:
(night, alright) similarity: 0.75
(aa, aaaa) similarity: 1.00
(night, alright) similarity: 0.67
func NewOverlapCoefficient ¶ added in v0.2.0
func NewOverlapCoefficient() *OverlapCoefficient
NewOverlapCoefficient returns a new overlap coefficient string metric.
Default options:
CaseSensitive: true
NgramSize: 2
func (*OverlapCoefficient) Compare ¶ added in v0.2.0
func (m *OverlapCoefficient) Compare(a, b string) float64
Compare returns the overlap coefficient similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.
type SmithWatermanGotoh ¶
// SmithWatermanGotoh represents the Smith-Waterman-Gotoh metric. Its options
// control case handling, the penalty applied to gaps and the function used
// to score character substitutions.
type SmithWatermanGotoh struct {
// CaseSensitive specifies if the string comparison is case sensitive.
// NewSmithWatermanGotoh sets it to true.
CaseSensitive bool
// GapPenalty defines a score penalty for character insertions or deletions.
// For relevant results, the gap penalty should be a non-positive number.
// NewSmithWatermanGotoh sets it to -0.5.
GapPenalty float64
// Substitution represents a substitution function which is used to
// calculate a score for character substitutions.
// NewSmithWatermanGotoh sets it to MatchMismatch{Match: 1, Mismatch: -2}.
Substitution Substitution
}
SmithWatermanGotoh represents the Smith-Waterman-Gotoh metric for measuring the similarity between sequences.
For more information see https://en.wikipedia.org/wiki/Smith-Waterman_algorithm.
Example ¶
package main
import (
"fmt"
"github.com/adrg/strutil/metrics"
)
// main prints the Smith-Waterman-Gotoh similarity of two phrases, first with
// the default options and then with case-insensitive comparison, a smaller
// gap penalty and a custom substitution function.
func main() {
	metric := metrics.NewSmithWatermanGotoh()

	// Defaults as supplied by the constructor.
	score := metric.Compare("a pink kitten", "a kitten")
	fmt.Printf("(a pink kitten, a kitten) similarity: %.2f\n", score)

	// Ignore letter case, soften the gap penalty and replace the
	// substitution scores.
	scoring := metrics.MatchMismatch{Match: 1, Mismatch: -0.5}
	metric.CaseSensitive = false
	metric.GapPenalty = -0.1
	metric.Substitution = scoring

	score = metric.Compare("a pink kitten", "A KITTEN")
	fmt.Printf("(a pink kitten, A KITTEN) similarity: %.2f\n", score)
}
Output:
(a pink kitten, a kitten) similarity: 0.88
(a pink kitten, A KITTEN) similarity: 0.94
func NewSmithWatermanGotoh ¶
func NewSmithWatermanGotoh() *SmithWatermanGotoh
NewSmithWatermanGotoh returns a new Smith-Waterman-Gotoh string metric.
Default options:
CaseSensitive: true
GapPenalty: -0.5
Substitution: MatchMismatch{
Match: 1,
Mismatch: -2,
},
func (*SmithWatermanGotoh) Compare ¶
func (m *SmithWatermanGotoh) Compare(a, b string) float64
Compare returns the Smith-Waterman-Gotoh similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.
type SorensenDice ¶
// SorensenDice represents the Sorensen-Dice metric. Its options control case
// handling and the size of the tokens the inputs are split into.
type SorensenDice struct {
// CaseSensitive specifies if the string comparison is case sensitive.
// NewSorensenDice sets it to true.
CaseSensitive bool
// NgramSize represents the size (in characters) of the tokens generated
// when comparing the input sequences. NewSorensenDice sets it to 2, and
// Compare uses a size of 2 if the value is less than or equal to 0.
NgramSize int
}
SorensenDice represents the Sorensen-Dice metric for measuring the similarity between sequences.
For more information see https://en.wikipedia.org/wiki/Sorensen-Dice_coefficient.
Example ¶
package main
import (
"fmt"
"github.com/adrg/strutil/metrics"
)
// main prints the Sorensen-Dice similarity of two words, first with the
// default options and then with case-insensitive matching over 3-character
// tokens.
func main() {
	const first, second = "night", "alright"

	// Defaults as supplied by the constructor.
	dice := metrics.NewSorensenDice()
	fmt.Printf("(night, alright) similarity: %.2f\n", dice.Compare(first, second))

	// Ignore letter case and use a larger token size.
	dice.CaseSensitive = false
	dice.NgramSize = 3
	fmt.Printf("(night, alright) similarity: %.2f\n", dice.Compare(first, second))
}
Output:
(night, alright) similarity: 0.60
(night, alright) similarity: 0.50
func NewSorensenDice ¶
func NewSorensenDice() *SorensenDice
NewSorensenDice returns a new Sorensen-Dice string metric.
Default options:
CaseSensitive: true
NgramSize: 2
func (*SorensenDice) Compare ¶
func (m *SorensenDice) Compare(a, b string) float64
Compare returns the Sorensen-Dice similarity coefficient of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.
type Substitution ¶
// Substitution represents a substitution function which is used to
// calculate a score for character substitutions.
type Substitution interface {
// Compare returns the substitution score of characters a[idxA] and b[idxB].
Compare(a []rune, idxA int, b []rune, idxB int) float64
// Max returns the maximum score of a character substitution operation.
Max() float64
// Min returns the minimum score of a character substitution operation.
Min() float64
}
Substitution represents a substitution function which is used to calculate a score for character substitutions.