123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
- package multibayes
- import (
- "math"
- )
- var (
- smoother = 1 // laplace
- defaultMinClassSize = 5
- )
- type Classifier struct {
- Tokenizer *tokenizer `json:"-"`
- Matrix *sparseMatrix `json:"matrix"`
- MinClassSize int
- }
- // Create a new multibayes classifier.
- func NewClassifier() *Classifier {
- tokenize, _ := newTokenizer(&tokenizerConf{
- NGramSize: 1,
- })
- sparse := newSparseMatrix()
- return &Classifier{
- Tokenizer: tokenize,
- Matrix: sparse,
- MinClassSize: defaultMinClassSize,
- }
- }
- // Train the classifier with a new document and its classes.
- func (c *Classifier) Add(document string, classes []string) {
- ngrams := c.Tokenizer.Parse(document)
- c.Matrix.Add(ngrams, classes)
- }
- // Calculate the posterior probability for a new document on each
- // class from the training set.
- func (c *Classifier) Posterior(document string) map[string]float64 {
- tokens := c.Tokenizer.Parse(document)
- predictions := make(map[string]float64)
- for class, classcolumn := range c.Matrix.Classes {
- if len(classcolumn.Data) < c.MinClassSize {
- continue
- }
- n := classcolumn.Count()
- smoothN := n + (smoother * 2)
- priors := []float64{
- float64(n+smoother) / float64(c.Matrix.N+(smoother*2)), // P(C=Y)
- float64(c.Matrix.N-n+smoother) / float64(c.Matrix.N+(smoother*2)), // P(C=N)
- }
- loglikelihood := []float64{1.0, 1.0}
- // check if each token is in our token sparse matrix
- for _, token := range tokens {
- if tokencolumn, ok := c.Matrix.Tokens[token.String()]; ok {
- // conditional probability the token occurs for the class
- joint := intersection(tokencolumn.Data, classcolumn.Data)
- conditional := float64(joint+smoother) / float64(smoothN) // P(F|C=Y)
- loglikelihood[0] += math.Log(conditional)
- // conditional probability the token occurs if the class doesn't apply
- not := len(tokencolumn.Data) - joint
- notconditional := float64(not+smoother) / float64(smoothN) // P(F|C=N)
- loglikelihood[1] += math.Log(notconditional)
- }
- }
- likelihood := []float64{
- math.Exp(loglikelihood[0]),
- math.Exp(loglikelihood[1]),
- }
- prob := bayesRule(priors, likelihood) // P(C|F)
- predictions[class] = prob[0]
- }
- return predictions
- }
- func bayesRule(prior, likelihood []float64) []float64 {
- posterior := make([]float64, len(prior))
- sum := 0.0
- for i, _ := range prior {
- combined := prior[i] * likelihood[i]
- posterior[i] = combined
- sum += combined
- }
- // scale the likelihoods
- for i, _ := range posterior {
- posterior[i] /= sum
- }
- return posterior
- }
- // elements that are in both array1 and array2
- func intersection(array1, array2 []int) int {
- var count int
- for _, elem1 := range array1 {
- for _, elem2 := range array2 {
- if elem1 == elem2 {
- count++
- break
- }
- }
- }
- return count
- }
|