bayes.go
  1. package multibayes
  2. import (
  3. "math"
  4. )
  5. var (
  6. smoother = 1 // laplace
  7. defaultMinClassSize = 5
  8. )
  9. type Classifier struct {
  10. Tokenizer *tokenizer `json:"-"`
  11. Matrix *sparseMatrix `json:"matrix"`
  12. MinClassSize int
  13. }
  14. // Create a new multibayes classifier.
  15. func NewClassifier() *Classifier {
  16. tokenize, _ := newTokenizer(&tokenizerConf{
  17. NGramSize: 1,
  18. })
  19. sparse := newSparseMatrix()
  20. return &Classifier{
  21. Tokenizer: tokenize,
  22. Matrix: sparse,
  23. MinClassSize: defaultMinClassSize,
  24. }
  25. }
  26. // Train the classifier with a new document and its classes.
  27. func (c *Classifier) Add(document string, classes []string) {
  28. ngrams := c.Tokenizer.Parse(document)
  29. c.Matrix.Add(ngrams, classes)
  30. }
  31. // Calculate the posterior probability for a new document on each
  32. // class from the training set.
  33. func (c *Classifier) Posterior(document string) map[string]float64 {
  34. tokens := c.Tokenizer.Parse(document)
  35. predictions := make(map[string]float64)
  36. for class, classcolumn := range c.Matrix.Classes {
  37. if len(classcolumn.Data) < c.MinClassSize {
  38. continue
  39. }
  40. n := classcolumn.Count()
  41. smoothN := n + (smoother * 2)
  42. priors := []float64{
  43. float64(n+smoother) / float64(c.Matrix.N+(smoother*2)), // P(C=Y)
  44. float64(c.Matrix.N-n+smoother) / float64(c.Matrix.N+(smoother*2)), // P(C=N)
  45. }
  46. loglikelihood := []float64{1.0, 1.0}
  47. // check if each token is in our token sparse matrix
  48. for _, token := range tokens {
  49. if tokencolumn, ok := c.Matrix.Tokens[token.String()]; ok {
  50. // conditional probability the token occurs for the class
  51. joint := intersection(tokencolumn.Data, classcolumn.Data)
  52. conditional := float64(joint+smoother) / float64(smoothN) // P(F|C=Y)
  53. loglikelihood[0] += math.Log(conditional)
  54. // conditional probability the token occurs if the class doesn't apply
  55. not := len(tokencolumn.Data) - joint
  56. notconditional := float64(not+smoother) / float64(smoothN) // P(F|C=N)
  57. loglikelihood[1] += math.Log(notconditional)
  58. }
  59. }
  60. likelihood := []float64{
  61. math.Exp(loglikelihood[0]),
  62. math.Exp(loglikelihood[1]),
  63. }
  64. prob := bayesRule(priors, likelihood) // P(C|F)
  65. predictions[class] = prob[0]
  66. }
  67. return predictions
  68. }
  69. func bayesRule(prior, likelihood []float64) []float64 {
  70. posterior := make([]float64, len(prior))
  71. sum := 0.0
  72. for i, _ := range prior {
  73. combined := prior[i] * likelihood[i]
  74. posterior[i] = combined
  75. sum += combined
  76. }
  77. // scale the likelihoods
  78. for i, _ := range posterior {
  79. posterior[i] /= sum
  80. }
  81. return posterior
  82. }
  83. // elements that are in both array1 and array2
  84. func intersection(array1, array2 []int) int {
  85. var count int
  86. for _, elem1 := range array1 {
  87. for _, elem2 := range array2 {
  88. if elem1 == elem2 {
  89. count++
  90. break
  91. }
  92. }
  93. }
  94. return count
  95. }