zardoz/matrix.go
2019-12-11 12:10:36 +01:00

230 lines
4.5 KiB
Go

package main
import (
"bufio"
"log"
"os"
"strings"
"sync"
"time"
)
// Class labels used by the classifier. Their string values match the keys of
// the map returned by Posterior.
const (
	// Good is the label IsGOOD teaches tokens under.
	Good Class = "GOOD"
	// Bad is the label IsBAD teaches tokens under.
	Bad Class = "BAD"
)
// ByControlPlane groups the channels drained by the classifier's reader
// goroutines (readBadTokens, readGoodTokens, readStatsTokens).
type ByControlPlane struct {
	// BadTokens carries strings that are handed to IsBAD.
	BadTokens chan string
	// GoodTokens carries strings that are handed to IsGOOD.
	GoodTokens chan string
	// StatsTokens carries action names that are handed to AddStats.
	StatsTokens chan string
}
// safeClassifier pairs a classifier with the mutex that callers take before
// touching it.
type safeClassifier struct {
	// bayez is the wrapped classifier; it is replaced wholesale by
	// updateLearners when a new generation is promoted.
	bayez *Classifier
	// busy guards bayez.
	busy sync.Mutex
}
// ControPlane is the package-level ByControlPlane instance. enroll creates its
// channels, and the readBadTokens, readGoodTokens and readStatsTokens
// goroutines drain them.
var ControPlane ByControlPlane
// ByClassifier is the structure containing our pseudo-Bayes classifier: one
// instance that is being taught, one that answers queries, and usage counters.
type ByClassifier struct {
	// STATS maps an action name to an int64 counter; see AddStats.
	STATS sync.Map
	// Learning is the classifier that IsBAD and IsGOOD teach.
	Learning safeClassifier
	// Working is the classifier that Posterior queries.
	Working safeClassifier
	// Generation is the generation of Working. It is 0 until updateLearners
	// promotes Learning for the first time.
	Generation int64
}
// AddStats increments the int64 counter stored in STATS under action, creating
// it with the value 1 the first time the action is seen.
//
// NOTE(review): the load and the store are two separate sync.Map operations,
// so concurrent calls for the same action could lose an increment. In this
// file the only caller is the single readStatsTokens goroutine — confirm
// before calling it from elsewhere.
func (c *ByClassifier) AddStats(action string) {
	prev, seen := c.STATS.Load(action)
	if !seen {
		c.STATS.Store(action, int64(1))
		return
	}
	// Panics if something other than an int64 was stored under this key.
	c.STATS.Store(action, prev.(int64)+1)
}
// IsBAD splits key on whitespace and teaches the resulting tokens to the
// Learning classifier under the Bad class. The Learning lock is held for the
// duration of the Learn call and the log line that follows it.
func (c *ByClassifier) IsBAD(key string) {
	tokens := strings.Fields(key)
	log.Println("BAD Received", tokens)

	learner := &c.Learning
	learner.busy.Lock()
	defer learner.busy.Unlock()

	// bayez is read only after the lock is taken because updateLearners
	// replaces it under the same lock.
	learner.bayez.Learn(tokens, Bad)
	log.Println("BAD Learned", tokens)
}
// IsGOOD splits key on whitespace and teaches the resulting tokens to the
// Learning classifier under the Good class. The Learning lock is held for the
// duration of the Learn call and the log line that follows it.
func (c *ByClassifier) IsGOOD(key string) {
	tokens := strings.Fields(key)
	log.Println("GOOD Received", tokens)

	learner := &c.Learning
	learner.busy.Lock()
	defer learner.busy.Unlock()

	// bayez is read only after the lock is taken because updateLearners
	// replaces it under the same lock.
	learner.bayez.Learn(tokens, Good)
	log.Println("GOOD Learned", tokens)
}
// Posterior scores a header string against the Working classifier and returns
// the result as a map with exactly two keys, "GOOD" and "BAD".
//
// hdr is passed through sanitizeHeaders and then split on whitespace before
// being scored with SafeProbScores.
//
// A neutral answer (0.5 for both keys) is returned when:
//   - no generation has been promoted yet (Generation == 0),
//   - scoring reports any error, ErrUnderflow included, or
//   - fewer than two scores come back.
func (c *ByClassifier) Posterior(hdr string) map[string]float64 {
	tokens := sanitizeHeaders(hdr)
	ff := map[string]float64{"BAD": 0.5, "GOOD": 0.5}

	log.Println("Posterior locking the Working Bayesian")
	c.Working.busy.Lock()
	defer c.Working.busy.Unlock()

	// updateLearners writes Generation while holding this lock, so the read
	// has to happen under the lock as well to avoid a data race.
	if c.Generation == 0 {
		return ff
	}

	log.Println("Going to calculate the Scores")
	scores, _, _, err := c.Working.bayez.SafeProbScores(strings.Fields(tokens))
	log.Println("Scores calculated")

	// Treat every failure as "no opinion" and never index past the slice.
	if err != nil || len(scores) < 2 {
		return ff
	}

	// The classifiers in this file are built with (Good, Bad); presumably the
	// scores come back in that order — confirm against SafeProbScores.
	ff["GOOD"] = scores[0]
	ff["BAD"] = scores[1]
	return ff
}
// enroll prepares the classifier for use. It creates the three buffered
// control channels, resets the generation, builds fresh Learning and Working
// classifiers, seeds Learning from blacklist.txt and whitelist.txt, and then
// starts the reader goroutines and the generation updater.
func (c *ByClassifier) enroll() {
	const backlog = 2048 // buffer size shared by all three control channels

	ControPlane.BadTokens = make(chan string, backlog)
	ControPlane.GoodTokens = make(chan string, backlog)
	ControPlane.StatsTokens = make(chan string, backlog)

	c.Generation = 0
	c.Learning.bayez = NewClassifierTfIdf(Good, Bad)
	c.Working.bayez = NewClassifierTfIdf(Good, Bad)

	// Seed lists are read synchronously, before any goroutine is started.
	seeds := []struct{ path, class string }{
		{"blacklist.txt", "BAD"},
		{"whitelist.txt", "GOOD"},
	}
	for _, seed := range seeds {
		c.readInitList(seed.path, seed.class)
	}

	workers := []func(){
		c.readBadTokens,
		c.readGoodTokens,
		c.readStatsTokens,
		c.updateLearners,
	}
	for _, worker := range workers {
		go worker()
	}

	log.Println("Classifier populated...")
}
// readBadTokens hands every string received on ControPlane.BadTokens to IsBAD.
// It returns only when that channel is closed.
func (c *ByClassifier) readBadTokens() {
	log.Println("Start reading BAD tokens")

	feed := ControPlane.BadTokens
	for {
		token, open := <-feed
		if !open {
			return
		}
		log.Println("Received BAD Token: ", token)
		c.IsBAD(token)
	}
}
// readGoodTokens hands every string received on ControPlane.GoodTokens to
// IsGOOD. It returns only when that channel is closed.
func (c *ByClassifier) readGoodTokens() {
	log.Println("Start reading GOOD tokens")

	feed := ControPlane.GoodTokens
	for {
		token, open := <-feed
		if !open {
			return
		}
		log.Println("Received GOOD Token: ", token)
		c.IsGOOD(token)
	}
}
// readStatsTokens hands every string received on ControPlane.StatsTokens to
// AddStats. It returns only when that channel is closed.
func (c *ByClassifier) readStatsTokens() {
	log.Println("Start reading STATS tokens")

	feed := ControPlane.StatsTokens
	for {
		action, open := <-feed
		if !open {
			return
		}
		c.AddStats(action)
	}
}
// readInitList seeds the classifier from a text file, one entry per line.
//
// filePath is the file to read. class selects the destination: "BAD" sends
// each line to IsBAD, "GOOD" sends it to IsGOOD, and any other value causes
// the lines to be read and discarded. Lines of three bytes or fewer are
// skipped.
//
// Failures are logged, not returned: a file that cannot be opened is reported
// and nothing is loaded; a read error part-way through (including a line too
// long for bufio.Scanner) is reported after the lines read so far have been
// loaded.
func (c *ByClassifier) readInitList(filePath, class string) {
	inFile, err := os.Open(filePath)
	if err != nil {
		log.Println(err.Error() + `: ` + filePath)
		return
	}
	defer inFile.Close()

	scanner := bufio.NewScanner(inFile)
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) <= 3 {
			continue
		}
		switch class {
		case "BAD":
			log.Println("Loading into Blacklist: ", line)
			c.IsBAD(line)
		case "GOOD":
			log.Println("Loading into Whitelist: ", line)
			c.IsGOOD(line)
		}
	}

	// Scan returns false on both EOF and failure; only Err tells them apart.
	if err := scanner.Err(); err != nil {
		log.Println(err.Error() + `: ` + filePath)
	}
}
// updateLearners runs forever, checking once immediately and then every ten
// seconds whether a new classifier generation is due.
//
// The target generation is ProxyFlow.seniority / Maturity, or 0 when Maturity
// is not positive. When the target exceeds c.Generation, the Learning
// classifier becomes the Working one (and has ConvertTermsFreqToTfIdf called
// on it), a fresh Learning classifier is created, and c.Generation is set to
// the target. Both classifier locks are held for the whole swap.
func (c *ByClassifier) updateLearners() {
	log.Println("Bayes Updater Start...")
	tick := time.NewTicker(10 * time.Second)

	for {
		log.Println("Maturity is:", Maturity)
		log.Println("Seniority is:", ProxyFlow.seniority)

		// Zero value doubles as the "Maturity not set" default.
		var target int64
		if Maturity > 0 {
			target = ProxyFlow.seniority / Maturity
		}
		log.Println("Current Generation is: ", target)
		log.Println("Working Generation is: ", c.Generation)

		if target > c.Generation {
			// Learning is always locked before Working.
			c.Learning.busy.Lock()
			c.Working.busy.Lock()

			promoted := c.Learning.bayez
			c.Working.bayez = promoted
			promoted.ConvertTermsFreqToTfIdf()
			c.Learning.bayez = NewClassifierTfIdf(Good, Bad)
			c.Generation = target
			log.Println("Generation Updated to: ", c.Generation)

			c.Learning.busy.Unlock()
			c.Working.busy.Unlock()
		}

		// Wait for the next tick before re-evaluating.
		<-tick.C
	}
}