
New Bayes Engine

Loweel 4 years ago
parent
commit
87e96aaa64
7 changed files with 661 additions and 103 deletions
  1. README.md (+2 −0)
  2. alloc.go (+3 −3)
  3. bayesian.go (+547 −0)
  4. classifier.go (+2 −10)
  5. file.go (+24 −23)
  6. handler.go (+2 −1)
  7. matrix.go (+81 −66)

+ 2 - 0
README.md

@@ -101,7 +101,9 @@ If DEBUG is set to "false" or not set, minute Zardoz will dump the sparse matrix
 
 DEBUG : if set to "true", Zardoz will create a folder "logs" and log what happens, together with the dump of the sparse matrix. If set to "false" or not set, the sparse matrix will be available on disk for post-mortem.
 
+**CREDIT**
 
+Credits for the Bayesian implementation go to Jake Brukhman: https://github.com/jbrukh/bayesian
 
 
 ## TODO:

+ 3 - 3
alloc.go

@@ -23,7 +23,7 @@ var DebugLog bool
 var ProxyFlow HTTPFlow
 
 //Classifier is our bayesian classifier
-var Classifier *ByClassifier
+var ZClassifier *ByClassifier
 
 //BlockMessage is the message we return when blocking
 var BlockMessage string
@@ -33,8 +33,8 @@ var Maturity int64
 
 func init() {
 
-	Classifier = new(ByClassifier)
-	Classifier.enroll()
+	ZClassifier = new(ByClassifier)
+	ZClassifier.enroll()
 
 	ProxyFlow.sensitivity = 0.5
 	ProxyFlow.seniority = 0
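
The rename is required rather than cosmetic: the new bayesian.go below declares `type Classifier struct`, and in Go a package-level variable cannot share an identifier with a type in the same package. A minimal sketch of the collision, with stub types standing in for the repo's own:

```go
package main

type Class string                          // as declared in bayesian.go
type Classifier struct{ Classes []Class } // the type the new bayesian.go introduces
type ByClassifier struct{}                 // stub for the struct defined in matrix.go

// Keeping the old variable name would no longer compile:
//
//	var Classifier *ByClassifier // error: Classifier redeclared in this block
//
// Renaming the package-level variable resolves the clash.
var ZClassifier *ByClassifier

func main() {}
```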

+ 547 - 0
bayesian.go

@@ -0,0 +1,547 @@
+package main
+
+import (
+	"encoding/gob"
+	"errors"
+	"io"
+	"math"
+	"os"
+	"path/filepath"
+	"sync/atomic"
+)
+
+// defaultProb is the tiny non-zero probability that a word
+// we have not seen before appears in the class.
+const defaultProb = 0.00000000001
+
+// ErrUnderflow is returned when an underflow is detected.
+var ErrUnderflow = errors.New("possible underflow detected")
+
+// Class defines a class that the classifier will filter:
+// C = {C_1, ..., C_n}. You should define your classes as a
+// set of constants, for example as follows:
+//
+//    const (
+//        Good Class = "Good"
+//        Bad  Class = "Bad"
+//    )
+//
+// Class values should be unique.
+type Class string
+
+// Classifier implements the Naive Bayesian Classifier.
+type Classifier struct {
+	Classes         []Class
+	learned         int   // docs learned
+	seen            int32 // docs seen
+	datas           map[Class]*classData
+	tfIdf           bool
+	DidConvertTfIdf bool // we can't classify a TF-IDF classifier if we haven't yet
+	// called ConvertTermsFreqToTfIdf
+}
+
+// serializableClassifier represents a container for
+// Classifier objects whose fields are modifiable by
+// reflection and are therefore writeable by gob.
+type serializableClassifier struct {
+	Classes         []Class
+	Learned         int
+	Seen            int
+	Datas           map[Class]*classData
+	TfIdf           bool
+	DidConvertTfIdf bool
+}
+
+// classData holds the frequency data for words in a
+// particular class. In the future, we may replace this
+// structure with a trie-like structure for more
+// efficient storage.
+type classData struct {
+	Freqs   map[string]float64
+	FreqTfs map[string][]float64
+	Total   int
+}
+
+// newClassData creates a new empty classData node.
+func newClassData() *classData {
+	return &classData{
+		Freqs:   make(map[string]float64),
+		FreqTfs: make(map[string][]float64),
+	}
+}
+
+// getWordProb returns P(W|C_j) -- the probability of seeing
+// a particular word W in a document of this class.
+func (d *classData) getWordProb(word string) float64 {
+	value, ok := d.Freqs[word]
+	if !ok {
+		return defaultProb
+	}
+	return float64(value) / float64(d.Total)
+}
+
+// getWordsProb returns P(D|C_j) -- the probability of seeing
+// this set of words in a document of this class.
+//
+// Note that words should not be empty, and this method of
+// calculation is prone to underflow if there are many words
+// and their individual probabilities are small.
+func (d *classData) getWordsProb(words []string) (prob float64) {
+	prob = 1
+	for _, word := range words {
+		prob *= d.getWordProb(word)
+	}
+	return
+}
+
+// NewClassifierTfIdf returns a new classifier. The classes provided
+// should be at least 2 in number and unique, or this method will
+// panic.
+func NewClassifierTfIdf(classes ...Class) (c *Classifier) {
+	n := len(classes)
+
+	// check size
+	if n < 2 {
+		panic("provide at least two classes")
+	}
+
+	// check uniqueness
+	check := make(map[Class]bool, n)
+	for _, class := range classes {
+		check[class] = true
+	}
+	if len(check) != n {
+		panic("classes must be unique")
+	}
+	// create the classifier
+	c = &Classifier{
+		Classes: classes,
+		datas:   make(map[Class]*classData, n),
+		tfIdf:   true,
+	}
+	for _, class := range classes {
+		c.datas[class] = newClassData()
+	}
+	return
+}
+
+// NewClassifier returns a new classifier. The classes provided
+// should be at least 2 in number and unique, or this method will
+// panic.
+func NewClassifier(classes ...Class) (c *Classifier) {
+	n := len(classes)
+
+	// check size
+	if n < 2 {
+		panic("provide at least two classes")
+	}
+
+	// check uniqueness
+	check := make(map[Class]bool, n)
+	for _, class := range classes {
+		check[class] = true
+	}
+	if len(check) != n {
+		panic("classes must be unique")
+	}
+	// create the classifier
+	c = &Classifier{
+		Classes:         classes,
+		datas:           make(map[Class]*classData, n),
+		tfIdf:           false,
+		DidConvertTfIdf: false,
+	}
+	for _, class := range classes {
+		c.datas[class] = newClassData()
+	}
+	return
+}
+
+// NewClassifierFromFile loads an existing classifier from
+// file. The classifier was previously saved with a call
+// to c.WriteToFile(string).
+func NewClassifierFromFile(name string) (c *Classifier, err error) {
+	file, err := os.Open(name)
+	if err != nil {
+		return nil, err
+	}
+	return NewClassifierFromReader(file)
+}
+
+// NewClassifierFromReader reads a gob-encoded classifier from r and deserializes it.
+func NewClassifierFromReader(r io.Reader) (c *Classifier, err error) {
+	dec := gob.NewDecoder(r)
+	w := new(serializableClassifier)
+	err = dec.Decode(w)
+
+	return &Classifier{w.Classes, w.Learned, int32(w.Seen), w.Datas, w.TfIdf, w.DidConvertTfIdf}, err
+}
+
+// getPriors returns the prior probabilities for the
+// classes provided -- P(C_j).
+//
+// TODO: There is a way to smooth priors, currently
+// not implemented here.
+func (c *Classifier) getPriors() (priors []float64) {
+	n := len(c.Classes)
+	priors = make([]float64, n, n)
+	sum := 0
+	for index, class := range c.Classes {
+		total := c.datas[class].Total
+		priors[index] = float64(total)
+		sum += total
+	}
+	if sum != 0 {
+		for i := 0; i < n; i++ {
+			priors[i] /= float64(sum)
+		}
+	}
+	return
+}
+
+// Learned returns the number of documents ever learned
+// in the lifetime of this classifier.
+func (c *Classifier) Learned() int {
+	return c.learned
+}
+
+// Seen returns the number of documents ever classified
+// in the lifetime of this classifier.
+func (c *Classifier) Seen() int {
+	return int(atomic.LoadInt32(&c.seen))
+}
+
+// IsTfIdf returns true if we are a classifier of type TfIdf
+func (c *Classifier) IsTfIdf() bool {
+	return c.tfIdf
+}
+
+// WordCount returns the number of words counted for
+// each class in the lifetime of the classifier.
+func (c *Classifier) WordCount() (result []int) {
+	result = make([]int, len(c.Classes))
+	for inx, class := range c.Classes {
+		data := c.datas[class]
+		result[inx] = data.Total
+	}
+	return
+}
+
+// Observe should be used when word frequencies have already been
+// learned externally (e.g., in Hadoop).
+func (c *Classifier) Observe(word string, count int, which Class) {
+	data := c.datas[which]
+	data.Freqs[word] += float64(count)
+	data.Total += count
+}
+
+// Learn will accept new training documents for
+// supervised learning.
+func (c *Classifier) Learn(document []string, which Class) {
+
+	// If we are a tfidf classifier we first need to get terms as
+	// terms frequency and store that to work out the idf part later
+	// in ConvertToIDF().
+	if c.tfIdf {
+		if c.DidConvertTfIdf {
+			panic("Cannot learn after ConvertTermsFreqToTfIdf has been called. Reset and relearn to reconvert.")
+		}
+
+		// Term Frequency: word count in document / document length
+		docTf := make(map[string]float64)
+		for _, word := range document {
+			docTf[word]++
+		}
+
+		docLen := float64(len(document))
+
+		for wIndex, wCount := range docTf {
+			docTf[wIndex] = wCount / docLen
+			// add the TF sample, after training we can get IDF values.
+			c.datas[which].FreqTfs[wIndex] = append(c.datas[which].FreqTfs[wIndex], docTf[wIndex])
+		}
+
+	}
+
+	data := c.datas[which]
+	for _, word := range document {
+		data.Freqs[word]++
+		data.Total++
+	}
+	c.learned++
+}
+
+// ConvertTermsFreqToTfIdf uses all the TF samples for the class and converts
+// them to TF-IDF https://en.wikipedia.org/wiki/Tf%E2%80%93idf
+// once we have finished learning all the classes and have the totals.
+func (c *Classifier) ConvertTermsFreqToTfIdf() {
+
+	if c.DidConvertTfIdf {
+		panic("Cannot call ConvertTermsFreqToTfIdf more than once. Reset and relearn to reconvert.")
+	}
+
+	for className := range c.datas {
+
+		for wIndex := range c.datas[className].FreqTfs {
+			tfIdfAdder := float64(0)
+
+			for tfSampleIndex := range c.datas[className].FreqTfs[wIndex] {
+
+				// we always want a positive TF-IDF score.
+				tf := c.datas[className].FreqTfs[wIndex][tfSampleIndex]
+				c.datas[className].FreqTfs[wIndex][tfSampleIndex] = math.Log1p(tf) * math.Log1p(float64(c.learned)/float64(c.datas[className].Total))
+				tfIdfAdder += c.datas[className].FreqTfs[wIndex][tfSampleIndex]
+			}
+			// convert the 'counts' to TF-IDF's
+			c.datas[className].Freqs[wIndex] = tfIdfAdder
+		}
+
+	}
+
+	// sanity check
+	c.DidConvertTfIdf = true
+
+}
+
+// LogScores produces "log-likelihood"-like scores that can
+// be used to classify documents into classes.
+//
+// The value of the score is proportional to the likelihood,
+// as determined by the classifier, that the given document
+// belongs to the given class. This is true even when scores
+// returned are negative, which they will be (since we are
+// taking logs of probabilities).
+//
+// The index j of the score corresponds to the class given
+// by c.Classes[j].
+//
+// Additionally returned are "inx" and "strict" values. The
+// inx corresponds to the maximum score in the array. If more
+// than one of the scores holds the maximum value, then
+// strict is false.
+//
+// Unlike c.Probabilities(), this function is not prone to
+// floating point underflow and is relatively safe to use.
+func (c *Classifier) LogScores(document []string) (scores []float64, inx int, strict bool) {
+	if c.tfIdf && !c.DidConvertTfIdf {
+		panic("Using a TF-IDF classifier. Please call ConvertTermsFreqToTfIdf before calling LogScores.")
+	}
+
+	n := len(c.Classes)
+	scores = make([]float64, n, n)
+	priors := c.getPriors()
+
+	// calculate the score for each class
+	for index, class := range c.Classes {
+		data := c.datas[class]
+		// score accumulates the sum of the logarithms
+		// of the word probabilities, as outlined in the refresher
+		score := math.Log(priors[index])
+		for _, word := range document {
+			score += math.Log(data.getWordProb(word))
+		}
+		scores[index] = score
+	}
+	inx, strict = findMax(scores)
+	atomic.AddInt32(&c.seen, 1)
+	return scores, inx, strict
+}
+
+// ProbScores works the same as LogScores, but delivers
+// actual probabilities as discussed above. Note that float64
+// underflow is possible if the word list contains too
+// many words that have probabilities very close to 0.
+//
+// Notes on underflow: underflow is going to occur when you're
+// trying to assess large numbers of words that you have
+// never seen before. Depending on the application, this
+// may or may not be a concern. Consider using SafeProbScores()
+// instead.
+func (c *Classifier) ProbScores(doc []string) (scores []float64, inx int, strict bool) {
+	if c.tfIdf && !c.DidConvertTfIdf {
+		panic("Using a TF-IDF classifier. Please call ConvertTermsFreqToTfIdf before calling ProbScores.")
+	}
+	n := len(c.Classes)
+	scores = make([]float64, n, n)
+	priors := c.getPriors()
+	sum := float64(0)
+	// calculate the score for each class
+	for index, class := range c.Classes {
+		data := c.datas[class]
+		// score is the product of the prior and word
+		// probabilities, computed in probability space
+		score := priors[index]
+		for _, word := range doc {
+			score *= data.getWordProb(word)
+		}
+		scores[index] = score
+		sum += score
+	}
+	for i := 0; i < n; i++ {
+		scores[i] /= sum
+	}
+	inx, strict = findMax(scores)
+	atomic.AddInt32(&c.seen, 1)
+	return scores, inx, strict
+}
+
+// SafeProbScores works the same as ProbScores, but is
+// able to detect underflow in those cases where underflow
+// results in the reverse classification. If an underflow is detected,
+// this method returns an ErrUnderflow, allowing the user to deal with it as
+// necessary. Note that underflow, under certain rare circumstances,
+// may still result in incorrect probabilities being returned,
+// but this method guarantees that all error-less invocations
+// are properly classified.
+//
+// Underflow detection is more costly because it also
+// has to make additional log score calculations.
+func (c *Classifier) SafeProbScores(doc []string) (scores []float64, inx int, strict bool, err error) {
+	if c.tfIdf && !c.DidConvertTfIdf {
+		panic("Using a TF-IDF classifier. Please call ConvertTermsFreqToTfIdf before calling SafeProbScores.")
+	}
+
+	n := len(c.Classes)
+	scores = make([]float64, n, n)
+	logScores := make([]float64, n, n)
+	priors := c.getPriors()
+	sum := float64(0)
+	// calculate the score for each class
+	for index, class := range c.Classes {
+		data := c.datas[class]
+		// track both the probability-space product and the
+		// log-space sum so that underflow can be detected
+		score := priors[index]
+		logScore := math.Log(priors[index])
+		for _, word := range doc {
+			p := data.getWordProb(word)
+			score *= p
+			logScore += math.Log(p)
+		}
+		scores[index] = score
+		logScores[index] = logScore
+		sum += score
+	}
+	for i := 0; i < n; i++ {
+		scores[i] /= sum
+	}
+	inx, strict = findMax(scores)
+	logInx, logStrict := findMax(logScores)
+
+	// detect underflow -- the size
+	// relation between scores and logScores
+	// must be preserved or something is wrong
+	if inx != logInx || strict != logStrict {
+		err = ErrUnderflow
+	}
+	atomic.AddInt32(&c.seen, 1)
+	return scores, inx, strict, err
+}
+
+// WordFrequencies returns a matrix of word frequencies that currently
+// exist in the classifier for each class state for the given input
+// words. In other words, if you obtain the frequencies
+//
+//    freqs := c.WordFrequencies(/* [j]string */)
+//
+// then the expression freqs[i][j] represents the frequency of the j-th
+// word within the i-th class.
+func (c *Classifier) WordFrequencies(words []string) (freqMatrix [][]float64) {
+	n, l := len(c.Classes), len(words)
+	freqMatrix = make([][]float64, n)
+	for i := range freqMatrix {
+		arr := make([]float64, l)
+		data := c.datas[c.Classes[i]]
+		for j := range arr {
+			arr[j] = data.getWordProb(words[j])
+		}
+		freqMatrix[i] = arr
+	}
+	return
+}
+
+// WordsByClass returns a map of words and their probability of
+// appearing in the given class.
+func (c *Classifier) WordsByClass(class Class) (freqMap map[string]float64) {
+	freqMap = make(map[string]float64)
+	for word, cnt := range c.datas[class].Freqs {
+		freqMap[word] = float64(cnt) / float64(c.datas[class].Total)
+	}
+
+	return freqMap
+}
+
+// WriteToFile serializes this classifier to a file.
+func (c *Classifier) WriteToFile(name string) (err error) {
+	file, err := os.OpenFile(name, os.O_WRONLY|os.O_CREATE, 0644)
+	if err != nil {
+		return err
+	}
+	return c.WriteTo(file)
+}
+
+// WriteClassesToFile writes all classes to files, stopping at the
+// first error.
+func (c *Classifier) WriteClassesToFile(rootPath string) (err error) {
+	for name := range c.datas {
+		if err = c.WriteClassToFile(name, rootPath); err != nil {
+			return
+		}
+	}
+	return
+}
+
+// WriteClassToFile writes a single class to file.
+func (c *Classifier) WriteClassToFile(name Class, rootPath string) (err error) {
+	data := c.datas[name]
+	fileName := filepath.Join(rootPath, string(name))
+	file, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE, 0644)
+	if err != nil {
+		return err
+	}
+	enc := gob.NewEncoder(file)
+	err = enc.Encode(data)
+	return
+}
+
+// WriteTo serializes this classifier to GOB and write to Writer.
+func (c *Classifier) WriteTo(w io.Writer) (err error) {
+	enc := gob.NewEncoder(w)
+	err = enc.Encode(&serializableClassifier{c.Classes, c.learned, int(c.seen), c.datas, c.tfIdf, c.DidConvertTfIdf})
+
+	return
+}
+
+// ReadClassFromFile loads existing class data from a
+// file.
+func (c *Classifier) ReadClassFromFile(class Class, location string) (err error) {
+	fileName := filepath.Join(location, string(class))
+	file, err := os.Open(fileName)
+
+	if err != nil {
+		return err
+	}
+
+	dec := gob.NewDecoder(file)
+	w := new(classData)
+	err = dec.Decode(w)
+
+	c.learned++
+	c.datas[class] = w
+	return
+}
+
+// findMax finds the maximum of a set of scores; if the
+// maximum is strict -- that is, it is the single unique
+// maximum from the set -- then strict has return value
+// true. Otherwise it is false.
+func findMax(scores []float64) (inx int, strict bool) {
+	inx = 0
+	strict = true
+	for i := 1; i < len(scores); i++ {
+		if scores[inx] < scores[i] {
+			inx = i
+			strict = true
+		} else if scores[inx] == scores[i] {
+			strict = false
+		}
+	}
+	return
+}
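
The API above implies a strict call order for the TF-IDF variant: Learn accumulates term frequencies, ConvertTermsFreqToTfIdf must then be called exactly once, and only after that do LogScores/ProbScores/SafeProbScores work (they panic otherwise). A minimal usage sketch against the file above, with illustrative class names and tokens rather than the GOOD/BAD classes registered in matrix.go:

```go
package main

import "fmt"

const (
	Ham  Class = "ham"  // illustrative classes, not the ones
	Spam Class = "spam" // Zardoz registers in matrix.go
)

func main() {
	c := NewClassifierTfIdf(Ham, Spam)
	c.Learn([]string{"hello", "world"}, Ham)
	c.Learn([]string{"buy", "cheap", "pills"}, Spam)

	// Mandatory for TF-IDF classifiers: scoring before this call panics.
	c.ConvertTermsFreqToTfIdf()

	scores, inx, strict, err := c.SafeProbScores([]string{"cheap", "pills"})
	if err == ErrUnderflow {
		return // too many unseen words: treat the result as unreliable
	}
	fmt.Println(scores, c.Classes[inx], strict) // expect Spam to win
}
```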

+ 2 - 10
classifier.go

@@ -86,16 +86,11 @@ func feedRequest(req *http.Request, class string) {
 
 	feed := formatRequest(req)
 
-	tokens := strings.Fields(sanitizeHeaders(feed))
-
 	if class == "BAD" {
 
 		log.Println("Feeding BAD token: ", feed)
 
-		for _, tk := range tokens {
-
-			ControPlane.BadTokens <- tk
-		}
+		ControPlane.BadTokens <- sanitizeHeaders(feed)
 
 	}
 
@@ -103,10 +98,7 @@ func feedRequest(req *http.Request, class string) {
 
 		log.Println("Feeding GOOD Token:", feed)
 
-		for _, tk := range tokens {
-
-			ControPlane.GoodTokens <- tk
-		}
+		ControPlane.GoodTokens <- sanitizeHeaders(feed)
 
 	}
 
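
This change also alters the control-plane contract: BadTokens and GoodTokens now carry one whole sanitized header per message instead of one message per token, so each request is learned as a single document with per-document term frequencies rather than as a stream of one-word documents; tokenization moves to the consumer, where IsBAD/IsGOOD in matrix.go call strings.Fields before learning. A self-contained toy version of the new contract:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	docs := make(chan string, 1)
	docs <- "GET /wp-login.php sqlmap/1.4" // hypothetical sanitized header
	doc := <-docs                          // one message == one document
	tokens := strings.Fields(doc)          // tokenization now happens consumer-side
	fmt.Printf("%d tokens learned as one document: %v\n", len(tokens), tokens)
}
```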

+ 24 - 23
file.go

@@ -1,7 +1,7 @@
 package main
 
 import (
-	"encoding/json"
+	//	"encoding/json"
 	"fmt"
 	"io"
 	"log"
@@ -37,38 +37,39 @@ func saveBayesToFile() {
 	log.Println("Trying to write json file")
 	defer handlepanic()
 
-	var tmpJSON []byte
+	dumpfile := os.Getenv("DUMPFILE")
+	if dumpfile == "" {
+		dumpfile = "bayes.json"
+	}
 
-	Classifier.Matrix.busy.Lock()
-	defer Classifier.Matrix.busy.Unlock()
+	err := ZClassifier.Learning.bayez.WriteClassesToFile(dumpfile)
 
-	tmpJSON, err := json.MarshalIndent(Classifier.Matrix.bScores, "", " ")
 	if err != nil {
-		tmpJSON = []byte(err.Error())
+		log.Println("Error Writing file: ", dumpfile, err.Error())
 	}
 
-	var statsREPORT string
-
-	statsREPORT = "\n"
+	//	var statsREPORT string
 
-	Classifier.STATS.Range(func(key interface{}, value interface{}) bool {
+	//	statsREPORT = "\n"
 
-		statsREPORT += fmt.Sprintf("{\"%s\" : \"%d\"}\n", key.(string), value.(int64))
-		return true
-	})
+	//	Classifier.STATS.Range(func(key interface{}, value interface{}) bool {
 
-	dumpfile := os.Getenv("DUMPFILE")
-	if dumpfile == "" {
-		dumpfile = "bayes.json"
-	}
+	//		statsREPORT += fmt.Sprintf("{\"%s\" : \"%d\"}\n", key.(string), value.(int64))
+	//		return true
+	//	})
 
-	if DebugLog {
-		log.Println("DUMP: ", string(tmpJSON)+statsREPORT)
-	} else {
-		writeToFile(dumpfile, string(tmpJSON)+statsREPORT)
+	//	dumpfile := os.Getenv("DUMPFILE")
+	//	if dumpfile == "" {
+	//		dumpfile = "bayes.json"
+	//	}
 
-		log.Println("File saved: ", dumpfile)
-	}
+	//	if DebugLog {
+	//		log.Println("DUMP: ", string(tmpJSON)+statsREPORT)
+	//	} else {
+	//		writeToFile(dumpfile, string(tmpJSON)+statsREPORT)
+	//
+	//		log.Println("File saved: ", dumpfile)
+	//	}
 
 }
 
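
Persistence thus moves from a JSON dump of the sparse matrix to gob files written by WriteClassesToFile. Note from bayesian.go above that WriteClassToFile joins rootPath with the class name, so DUMPFILE now effectively names a directory receiving one gob file per class, even though the default value is still the file-like "bayes.json". A hypothetical restore helper (not part of this commit), assuming those per-class gob files:

```go
// loadBayesFromDir rebuilds a classifier from the per-class gob files
// produced by WriteClassesToFile; dir is the DUMPFILE path used above.
func loadBayesFromDir(dir string) (*Classifier, error) {
	c := NewClassifierTfIdf(Good, Bad)
	for _, class := range []Class{Good, Bad} {
		if err := c.ReadClassFromFile(class, dir); err != nil {
			return nil, err
		}
	}
	return c, nil
}
```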

+ 2 - 1
handler.go

@@ -12,7 +12,8 @@ func handler(p *httputil.ReverseProxy) func(http.ResponseWriter, *http.Request)
 	return func(w http.ResponseWriter, r *http.Request) {
 		//put the request inside our structure
 		ProxyFlow.request = r
-		probs := Classifier.Posterior(formatRequest(r))
+		log.Println("Received HTTP Request")
+		probs := ZClassifier.Posterior(formatRequest(r))
 		log.Printf("Posterior Probabilities: %+v\n", probs)
 		action := quadrant(probs)
 		ControPlane.StatsTokens <- action
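
quadrant is untouched by this commit, so its mapping from posterior probabilities to an action string is not shown here. A purely hypothetical sketch of such a mapping, assuming the 0.5 sensitivity initialized in alloc.go and invented action labels:

```go
// quadrantSketch is a hypothetical stand-in for the real quadrant():
// it buckets the two posterior probabilities against the sensitivity.
func quadrantSketch(probs map[string]float64, sensitivity float64) string {
	switch {
	case probs["BAD"] >= sensitivity && probs["GOOD"] < sensitivity:
		return "BLOCK" // hypothetical label
	case probs["GOOD"] >= sensitivity && probs["BAD"] < sensitivity:
		return "PASS" // hypothetical label
	default:
		return "UNSURE" // hypothetical label
	}
}
```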

+ 81 - 66
matrix.go

@@ -2,12 +2,16 @@ package main
 
 import (
 	"bufio"
-
 	"log"
-
 	"os"
 	"strings"
 	"sync"
+	"time"
+)
+
+// Good and Bad are the two classes the Zardoz classifier filters.
+const (
+	Good Class = "GOOD"
+	Bad  Class = "BAD"
 )
 
 //ByControlPlane contains all the channels we need.
@@ -17,14 +21,9 @@ type ByControlPlane struct {
 	StatsTokens chan string
 }
 
-type bScore struct {
-	BadScore  float64
-	GoodScore float64
-}
-
-type bMap struct {
-	bScores map[string]bScore
-	busy    sync.Mutex
+type safeClassifier struct {
+	bayez *Classifier
+	busy  sync.Mutex
 }
 
 //ControPlane is the variable holding the control-plane channels
@@ -32,10 +31,10 @@ var ControPlane ByControlPlane
 
 //ByClassifier is the structure containing our Pseudo-Bayes classifier.
 type ByClassifier struct {
-	STATS  sync.Map
-	Matrix bMap
-	bReg   float64
-	gReg   float64
+	STATS      sync.Map
+	Learning   safeClassifier
+	Working    safeClassifier
+	Generation int64
 }
 
 //AddStats adds the statistics after proper blocking.
@@ -54,80 +53,63 @@ func (c *ByClassifier) AddStats(action string) {
 //IsBAD inserts a bad key in the right place.
 func (c *ByClassifier) IsBAD(key string) {
 
-	c.Matrix.busy.Lock()
-	defer c.Matrix.busy.Unlock()
+	k := strings.Fields(key)
 
-	var t bScore
+	log.Println("BAD Received", k)
 
-	if val, ok := c.Matrix.bScores[key]; ok {
-		t.BadScore = val.BadScore + 1
-		t.GoodScore = val.GoodScore
-	} else {
-		t.BadScore = 1
-		t.GoodScore = 0
-	}
+	c.Learning.busy.Lock()
+	defer c.Learning.busy.Unlock()
+
+	c.Learning.bayez.Learn(k, Bad)
+
+	log.Println("BAD Learned", k)
 
-	c.Matrix.bScores[key] = t
-	c.bReg++
 }
 
 //IsGOOD inserts the key in the right place.
 func (c *ByClassifier) IsGOOD(key string) {
 
-	c.Matrix.busy.Lock()
-	defer c.Matrix.busy.Unlock()
+	k := strings.Fields(key)
 
-	var t bScore
+	log.Println("GOOD Received", k)
 
-	if val, ok := c.Matrix.bScores[key]; ok {
-		t.GoodScore = val.GoodScore + 1
-		t.BadScore = val.BadScore
-	} else {
-		t.BadScore = 0
-		t.GoodScore = 1
-	}
+	c.Learning.busy.Lock()
+	defer c.Learning.busy.Unlock()
 
-	c.Matrix.bScores[key] = t
-	c.gReg++
+	c.Learning.bayez.Learn(k, Good)
+
+	log.Println("GOOD Learned", k)
 
 }
 
 //Posterior returns the posterior probabilities of the GOOD and BAD classes for the given header
 func (c *ByClassifier) Posterior(hdr string) map[string]float64 {
 
-	c.Matrix.busy.Lock()
-	defer c.Matrix.busy.Unlock()
-
-	tokens := strings.Fields(sanitizeHeaders(hdr))
-	lenTokens := float64(len(tokens))
-
+	tokens := sanitizeHeaders(hdr)
 	ff := make(map[string]float64)
 
-	if lenTokens == 0 {
+	if c.Generation == 0 {
 		ff["BAD"] = 0.5
 		ff["GOOD"] = 0.5
 		return ff
-	}
 
-	log.Println("Start classification of: ", tokens)
+	}
 
-	var hBadM, hGoodM float64
+	log.Println("Posterior locking the Working Bayesian")
+	c.Working.busy.Lock()
+	defer c.Working.busy.Unlock()
 
-	for _, tk := range tokens {
-		if val, ok := c.Matrix.bScores[tk]; ok {
-			if val.BadScore > 0 {
-				hBadM += val.BadScore
-			}
-			if val.GoodScore > 0 {
-				hGoodM += val.GoodScore
-			}
-		}
+	log.Println("Going to calculate the Scores")
+	scores, _, _, err := c.Working.bayez.SafeProbScores(strings.Fields(tokens))
+	log.Println("Scores calculated")
+	if err == ErrUnderflow {
+		ff["BAD"] = 0.5
+		ff["GOOD"] = 0.5
+		return ff
 	}
 
-	ff["BAD"] = hBadM / (c.bReg * lenTokens)
-	ff["GOOD"] = hGoodM / (c.gReg * lenTokens)
-
-	log.Println("Entropies: ", ff)
+	ff["GOOD"] = scores[0]
+	ff["BAD"] = scores[1]
 
 	return ff
 
@@ -139,11 +121,9 @@ func (c *ByClassifier) enroll() {
 	ControPlane.GoodTokens = make(chan string, 2048)
 	ControPlane.StatsTokens = make(chan string, 2048)
 
-	c.Matrix.busy.Lock()
-	c.Matrix.bScores = make(map[string]bScore)
-	c.Matrix.busy.Unlock()
-	c.bReg = 0
-	c.gReg = 0
+	c.Generation = 0
+	c.Learning.bayez = NewClassifierTfIdf(Good, Bad)
+	c.Working.bayez = NewClassifierTfIdf(Good, Bad)
 
 	c.readInitList("blacklist.txt", "BAD")
 	c.readInitList("whitelist.txt", "GOOD")
@@ -151,6 +131,7 @@ func (c *ByClassifier) enroll() {
 	go c.readBadTokens()
 	go c.readGoodTokens()
 	go c.readStatsTokens()
+	go c.updateLearners()
 
 	log.Println("Classifier populated...")
 
@@ -213,3 +194,37 @@ func (c *ByClassifier) readInitList(filePath, class string) {
 	}
 
 }
+
+// updateLearners periodically promotes the Learning classifier to
+// Working once a new traffic generation has elapsed.
+func (c *ByClassifier) updateLearners() {
+
+	log.Println("Bayes Updater Start...")
+
+	ticker := time.NewTicker(10 * time.Second)
+
+	for ; true; <-ticker.C {
+		var currentGen int64
+		log.Println("Maturity is:", Maturity)
+		log.Println("Seniority is:", ProxyFlow.seniority)
+		if Maturity > 0 {
+			currentGen = ProxyFlow.seniority / Maturity
+		} else {
+			currentGen = 0
+		}
+		log.Println("Current Generation is: ", currentGen)
+		log.Println("Working Generation is: ", c.Generation)
+		if currentGen > c.Generation {
+			c.Learning.busy.Lock()
+			c.Working.busy.Lock()
+			c.Working.bayez = c.Learning.bayez
+			c.Working.bayez.ConvertTermsFreqToTfIdf()
+			c.Learning.bayez = NewClassifierTfIdf(Good, Bad)
+			c.Generation = currentGen
+			log.Println("Generation Updated to: ", c.Generation)
+			c.Learning.busy.Unlock()
+			c.Working.busy.Unlock()
+
+		}
+
+	}
+
+}
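
The generation arithmetic above drives the learn/serve swap: every ten seconds updateLearners recomputes currentGen = seniority / Maturity, and whenever the quotient advances it freezes the Learning classifier into TF-IDF form, promotes it to Working, and starts a fresh Learning instance. A worked sketch of the cadence, with hypothetical numbers:

```go
package main

import "fmt"

func main() {
	// Hypothetical values: Maturity requests per generation, with a
	// seniority counter that grows by one per proxied request.
	maturity := int64(500)
	for _, seniority := range []int64{0, 499, 500, 1499} {
		fmt.Println(seniority, "->", seniority/maturity)
	}
	// Prints 0 -> 0, 499 -> 0, 500 -> 1, 1499 -> 2: each bump in the
	// quotient triggers the Working/Learning swap and the one-shot
	// ConvertTermsFreqToTfIdf on the promoted classifier.
}
```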