Browse Source

Switching to entropy

Loweel 4 years ago
parent
commit
210380c11b
4 changed files with 109 additions and 81 deletions
  1. 1 25
      bayes.json
  2. 10 2
      classifier.go
  3. 8 26
      file.go
  4. 90 28
      matrix.go

+ 1 - 25
bayes.json

@@ -1,25 +1 @@
-{
- "LastUpdate": "2019-12-04 14:32:16.403693322 +0100 CET m=+0.016948951",
- "GOOD": [
-  {
-   "Token": "/Gun/good",
-   "LastSeen": "2019-12-04 14:32:16.390739816 +0100 CET",
-   "Age": "13.201146ms"
-  }
- ],
- "BAD": [
-  {
-   "Token": "/Penis/bad",
-   "LastSeen": "2019-12-04 14:32:16.389706996 +0100 CET",
-   "Age": "14.183289ms"
-  }
- ],
- "MEH": [
-  {
-   "Token": "Dildo",
-   "LastSeen": "2019-12-04 14:32:16.390823335 +0100 CET",
-   "Age": "13.128746ms"
-  }
- ],
- "STATS": null
-}
+{}

+ 10 - 2
classifier.go

@@ -84,11 +84,16 @@ func feedRequest(req *http.Request, class string) {
 
 	feed := formatRequest(req)
 
+	tokens := strings.Fields(sanitizeHeaders(feed))
+
 	if class == "BAD" {
 
 		log.Println("Feeding BAD token: ", feed)
 
-		ControPlane.BadTokens <- sanitizeHeaders(feed)
+		for _, tk := range tokens {
+
+			ControPlane.BadTokens <- tk
+		}
 
 	}
 
@@ -96,7 +101,10 @@ func feedRequest(req *http.Request, class string) {
 
 		log.Println("Feeding GOOD Token:", feed)
 
-		ControPlane.GoodTokens <- sanitizeHeaders(feed)
+		for _, tk := range tokens {
+
+			ControPlane.GoodTokens <- tk
+		}
 
 	}
 

+ 8 - 26
file.go

@@ -1,7 +1,6 @@
 package main
 
 import (
-	"bytes"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -37,43 +36,26 @@ func saveBayesToFile() {
 
 	log.Println("Trying to write json file")
 	defer handlepanic()
-	
 
-	var jsnBuf = new(bytes.Buffer)
-	var tmpJSON string
+	var tmpJSON []byte
 
-	Classifier.busy.Lock()
-	DumpJSON, err := Classifier.bayez.MarshalJSON()
-	if err != nil {
-		DumpJSON = []byte(err.Error())
-	}
-	Classifier.busy.Unlock()
+	Classifier.Matrix.busy.Lock()
+	defer Classifier.Matrix.busy.Unlock()
 
-	log.Println("Raw dump of Classifier: ", string(DumpJSON))
-
-	jerr := json.Indent(jsnBuf, DumpJSON, "", " ")
-	if jerr == nil {
-		tmpJSON = jsnBuf.String()
-	} else {
-		tmpJSON = jerr.Error()
+	tmpJSON, err := json.MarshalIndent(Classifier.Matrix.bScores, "", " ")
+	if err != nil {
+		tmpJSON = []byte(err.Error())
 	}
 
-	fmt.Println(time.Now().String())
-
-	Classifier.STATS.Range(func(key interface{}, value interface{}) bool {
-		fmt.Printf("%s : %d\n", key.(string), value.(int64))
-		return true
-	})
-
 	dumpfile := os.Getenv("DUMPFILE")
 	if dumpfile == "" {
 		dumpfile = "bayes.json"
 	}
 
 	if DebugLog {
-		log.Println(tmpJSON)
+		log.Println("DUMP: ", string(tmpJSON))
 	} else {
-		writeToFile(dumpfile, tmpJSON)
+		writeToFile(dumpfile, string(tmpJSON))
 
 		log.Println("File saved: ", dumpfile)
 	}

+ 90 - 28
matrix.go

@@ -2,13 +2,12 @@ package main
 
 import (
 	"bufio"
-	"fmt"
+
 	"log"
+	"math"
 	"os"
-
+	"strings"
 	"sync"
-
-	"github.com/lytics/multibayes"
 )
 
 //ByControlPlane contains all the channels we need.
@@ -18,14 +17,23 @@ type ByControlPlane struct {
 	StatsTokens chan string
 }
 
+type bScore struct {
+	BadScore  float64
+	GoodScore float64
+}
+
+type bMap struct {
+	bScores map[string]bScore
+	busy    sync.Mutex
+}
+
 //ControPlane is the variabile
 var ControPlane ByControlPlane
 
 //ByClassifier is the structure containing our Pseudo-Bayes classifier.
 type ByClassifier struct {
-	STATS sync.Map
-	bayez *multibayes.Classifier
-	busy  sync.Mutex
+	STATS  sync.Map
+	Matrix bMap
 }
 
 //AddStats adds the statistics after proper blocking.
@@ -44,39 +52,94 @@ func (c *ByClassifier) AddStats(action string) {
 //IsBAD inserts a bad key in the right place.
 func (c *ByClassifier) IsBAD(key string) {
 
-	c.busy.Lock()
-	defer c.busy.Unlock()
+	c.Matrix.busy.Lock()
+	defer c.Matrix.busy.Unlock()
 
-	c.bayez.Add(key, []string{"BAD"})
+	var t bScore
+
+	if val, ok := c.Matrix.bScores[key]; ok {
+		t.BadScore = val.BadScore + 1
+		t.GoodScore = val.GoodScore
+	} else {
+		t.BadScore = 1
+		t.GoodScore = 0
+	}
+
+	c.Matrix.bScores[key] = t
 
 }
 
 //IsGOOD inserts the key in the right place.
 func (c *ByClassifier) IsGOOD(key string) {
 
-	c.busy.Lock()
-	defer c.busy.Unlock()
+	c.Matrix.busy.Lock()
+	defer c.Matrix.busy.Unlock()
+
+	var t bScore
+
+	if val, ok := c.Matrix.bScores[key]; ok {
+		t.GoodScore = val.GoodScore + 1
+		t.BadScore = val.BadScore
+	} else {
+		t.BadScore = 0
+		t.GoodScore = 1
+	}
 
-	c.bayez.Add(key, []string{"GOOD"})
+	c.Matrix.bScores[key] = t
 
 }
 
 //Posterior calculates the posterior probabilities in pseudo-bayes.
-func (c *ByClassifier) Posterior(hdr string) (ff map[string]float64) {
+func (c *ByClassifier) Posterior(hdr string) map[string]float64 {
+
+	c.Matrix.busy.Lock()
+	defer c.Matrix.busy.Unlock()
+
+	tokens := strings.Fields(sanitizeHeaders(hdr))
+	lenTokens := float64(len(tokens))
 
-	defer func() {
+	ff := make(map[string]float64)
 
-		if a := recover(); a != nil {
-			fmt.Println("OPS!: Recovering from:", a)
-			ff = make(map[string]float64)
-			ff["BAD"] = 0.5
-			ff["GOOD"] = 0.5
+	if lenTokens == 0 {
+		ff["BAD"] = 0.5
+		ff["GOOD"] = 0.5
+		return ff
+	}
+
+	log.Println("Start classification of: ", tokens)
+
+	var hBadM, hGoodM float64
+
+	for _, tk := range tokens {
+
+		if val, ok := c.Matrix.bScores[tk]; ok {
+			log.Println("Classifier found: ", tk)
+			if val.BadScore > 0 {
+				hBadM += val.BadScore * math.Log2(val.BadScore)
+
+			}
+
+			if val.GoodScore > 0 {
+				hGoodM += val.GoodScore * math.Log2(val.GoodScore)
+
+			}
 		}
-	}()
 
-	c.busy.Lock()
-	ff = c.bayez.Posterior(hdr)
-	defer c.busy.Unlock()
+	}
+
+	hBadM = math.Log2(lenTokens) - (hBadM / lenTokens)
+	hGoodM = math.Log2(lenTokens) - (hGoodM / lenTokens)
+
+	if math.Abs(hGoodM) >= math.Abs(hBadM) {
+		ff["GOOD"] = 1
+		ff["BAD"] = 0
+	} else {
+		ff["GOOD"] = 0
+		ff["BAD"] = 1
+	}
+
+	log.Println("Entropies: ", ff)
+
 	return ff
 
 }
@@ -87,10 +150,9 @@ func (c *ByClassifier) enroll() {
 	ControPlane.GoodTokens = make(chan string, 2048)
 	ControPlane.StatsTokens = make(chan string, 2048)
 
-	c.busy.Lock()
-	c.bayez = multibayes.NewClassifier()
-	c.bayez.MinClassSize = 0
-	c.busy.Unlock()
+	c.Matrix.busy.Lock()
+	c.Matrix.bScores = make(map[string]bScore)
+	c.Matrix.busy.Unlock()
 
 	c.readInitList("blacklist.txt", "BAD")
 	c.readInitList("whitelist.txt", "GOOD")