tfidf

package module
v1.1.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 17, 2020 License: MIT Imports: 3 Imported by: 4

README

tfidf GoDoc Build Status Coverage Status Go Report Card

package tfidf provides lingo-friendly TF-IDF functionality.

See example for more

Documentation

Overview

package tfidf is a lingo-friendly TF-IDF library

Example
package main

import (
	"fmt"
	"math"
	"sort"
	"strings"

	. "github.com/go-nlp/tfidf"
	"github.com/xtgo/set"
	"gorgonia.org/tensor"
)

var mobydick = []string{
	"Call me Ishmael .",
	"Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world .",
	"It is a way I have of driving off the spleen and regulating the circulation .",
	"Whenever I find myself growing grim about the mouth ; ",
	"whenever it is a damp , drizzly November in my soul ; ",
	"whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; ",
	"and especially whenever my hypos get such an upper hand of me , that it requires a strong moral principle to prevent me from deliberately stepping into the street , and methodically knocking people's hats off -- then , I account it high time to get to sea as soon as I can .",
	"This is my substitute for pistol and ball . ",
	"With a philosophical flourish Cato throws himself upon his sword ; ",
	"I quietly take to the ship . There is nothing surprising in this .",
	"If they but knew it , almost all men in their degree , some time or other , cherish very nearly the same feelings towards the ocean with me .",
}

type doc []int

func (d doc) IDs() []int { return []int(d) }

func makeCorpus(a []string) (map[string]int, []string) {
	retVal := make(map[string]int)
	invRetVal := make([]string, 0)
	var id int
	for _, s := range a {
		for _, f := range strings.Fields(s) {
			f = strings.ToLower(f)
			if _, ok := retVal[f]; !ok {
				retVal[f] = id
				invRetVal = append(invRetVal, f)
				id++
			}
		}
	}
	return retVal, invRetVal
}

func makeDocuments(a []string, c map[string]int) []Document {
	retVal := make([]Document, 0, len(a))
	for _, s := range a {
		var ts []int
		for _, f := range strings.Fields(s) {
			f = strings.ToLower(f)
			id := c[f]
			ts = append(ts, id)
		}
		retVal = append(retVal, doc(ts))
	}
	return retVal
}

type docScore struct {
	id    int
	score float64
}

type docScores []docScore

func (ds docScores) Len() int           { return len(ds) }
func (ds docScores) Less(i, j int) bool { return ds[i].score < ds[j].score }
func (ds docScores) Swap(i, j int) {
	ds[i].score, ds[j].score = ds[j].score, ds[i].score
	ds[i].id, ds[j].id = ds[j].id, ds[i].id
}

func cosineSimilarity(queryScore []float64, docIDs []int, relVec []float64) docScores {
	// special case
	if len(docIDs) == 1 {
		// even more special case!
		if len(queryScore) == 1 {
			return docScores{
				{docIDs[0], queryScore[0] * relVec[0]},
			}
		}

		q := tensor.New(tensor.WithBacking(queryScore))
		m := tensor.New(tensor.WithBacking(relVec))
		score, err := q.Inner(m)
		if err != nil {
			panic(err)
		}
		return docScores{
			{docIDs[0], score.(float64)},
		}
	}

	m := tensor.New(tensor.WithShape(len(docIDs), len(queryScore)), tensor.WithBacking(relVec))
	q := tensor.New(tensor.WithShape(len(queryScore)), tensor.WithBacking(queryScore))
	dp, err := m.MatVecMul(q)
	if err != nil {
		panic(err)
	}

	m2, err := tensor.Square(m)
	if err != nil {
		panic(err)
	}

	normDocs, err := tensor.Sum(m2, 1)
	if err != nil {
		panic(err)
	}

	normDocs, err = tensor.Sqrt(normDocs)
	if err != nil {
		panic(err)
	}

	q2, err := tensor.Square(q)
	if err != nil {
		panic(err)
	}
	normQt, err := tensor.Sum(q2)
	if err != nil {
		panic(err)
	}
	normQ := normQt.Data().(float64)
	normQ = math.Sqrt(normQ)

	norms, err := tensor.Mul(normDocs, normQ)
	if err != nil {
		panic(err)
	}

	cosineSim, err := tensor.Div(dp, norms)
	if err != nil {
		panic(err)
	}
	csData := cosineSim.Data().([]float64)

	var ds docScores
	for i, id := range docIDs {
		score := csData[i]
		ds = append(ds, docScore{id: id, score: score})
	}
	return ds

}

func contains(query Document, in []Document, tf *TFIDF) (docIDs []int, relVec []float64) {
	q := query.IDs()
	q = set.Ints(q) // unique words only
	for i := range in {
		doc := in[i].IDs()

		var count int
		var relevant []float64
		for _, wq := range q {
		inner:
			for _, wd := range doc {
				if wq == wd {
					count++
					break inner
				}
			}
		}
		if count == len(q) {
			// calculate the score of the doc
			score := tf.Score(in[i])
			// get the  scores of the relevant words
			for _, wq := range q {
			inner2:
				for j, wd := range doc {
					if wd == wq {
						relevant = append(relevant, score[j])
						break inner2
					}
				}
			}
			docIDs = append(docIDs, i)
			relVec = append(relVec, relevant...)
		}
	}
	return
}

func main() {
	corpus, invCorpus := makeCorpus(mobydick)
	docs := makeDocuments(mobydick, corpus)
	tf := New()

	for _, doc := range docs {
		tf.Add(doc)
	}
	tf.CalculateIDF()

	fmt.Println("IDF:")
	for i, w := range invCorpus {
		fmt.Printf("\t%q: %1.1f\n", w, tf.IDF[i])
		if i >= 10 {
			break
		}
	}

	// now we search

	// "ishmael" is a query
	ishmael := doc{corpus["ishmael"]}

	// "whenever i find" is another query
	whenever := doc{corpus["whenever"], corpus["i"], corpus["find"]}

	// step1: score the queries
	ishmaelScore := tf.Score(ishmael)
	wheneverScore := tf.Score(whenever)

	// step2: find the docs that contains the queries.
	// if there are no docs, oops.
	ishmaelDocs, ishmaelRelVec := contains(ishmael, docs, tf)
	wheneverDocs, wheneverRelVec := contains(whenever, docs, tf)

	// step3: calculate the cosine similarity
	ishmaelRes := cosineSimilarity(ishmaelScore, ishmaelDocs, ishmaelRelVec)
	wheneverRes := cosineSimilarity(wheneverScore, wheneverDocs, wheneverRelVec)

	// step4: sort the results
	sort.Sort(sort.Reverse(ishmaelRes))
	sort.Sort(sort.Reverse(wheneverRes))

	fmt.Printf("Relevant Docs to \"Ishmael\":\n")
	for _, d := range ishmaelRes {
		fmt.Printf("\tID   : %d\n\tScore: %1.3f\n\tDoc  : %q\n", d.id, d.score, mobydick[d.id])
	}
	fmt.Println("")
	fmt.Printf("Relevant Docs to \"whenever i find\":\n")
	for _, d := range wheneverRes {
		fmt.Printf("\tID   : %d\n\tScore: %1.3f\n\tDoc  : %q\n", d.id, d.score, mobydick[d.id])
	}
}
Output:

IDF:
	"call": 2.4
	"me": 1.0
	"ishmael": 2.4
	".": 0.5
	"some": 1.7
	"years": 2.4
	"ago": 2.4
	"--": 1.7
	"never": 2.4
	"mind": 2.4
	"how": 2.4
Relevant Docs to "Ishmael":
	ID   : 0
	Score: 1.437
	Doc  : "Call me Ishmael ."

Relevant Docs to "whenever i find":
	ID   : 5
	Score: 0.985
	Doc  : "whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; "
	ID   : 3
	Score: 0.962
	Doc  : "Whenever I find myself growing grim about the mouth ; "

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func BOW

func BOW(doc Document) []int

BOW turns a document into a bag of words. The words of the document will have been deduplicated. A unique list of word IDs is then returned.

func TF

func TF(doc Document) []float64

TF calculates the term frequencies of term. This is useful for scoring functions. It does not make it a unique bag of words.

Types

type Document

type Document interface {
	IDs() []int
}

Document is a representation of a document.

type ScoreFn

type ScoreFn func(tf *TFIDF, doc Document) []float64

ScoreFn is any function that returns a score of the document.

type TFIDF

type TFIDF struct {
	// Term Frequency
	TF map[int]float64

	// Inverse Document Frequency
	IDF map[int]float64

	// Docs is the count of documents
	Docs int

	// Len is the total length of docs
	Len int

	sync.Mutex
}

TFIDF is a structure holding the relevant state information about TF/IDF

func New

func New() *TFIDF

New creates a new TFIDF structure

func (*TFIDF) Add

func (tf *TFIDF) Add(doc Document)

Add adds a document to the state

func (*TFIDF) CalculateIDF

func (tf *TFIDF) CalculateIDF()

CalculateIDF calculates the inverse document frequency

func (*TFIDF) Score

func (tf *TFIDF) Score(doc Document) []float64

Score calculates the TFIDF score (TF * IDF) for the document without adding the document to the tracked document count.

This function is only useful for a handful of cases. It's recommended you write your own scoring functions.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL