12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839 |
- package porterstemmer
- import (
- // "log"
- "unicode"
- )
// isConsonant reports whether the rune at index i of s is treated as a
// consonant.
//
// The lowercase vowels 'a', 'e', 'i', 'o' and 'u' are never consonants. A 'y'
// is a consonant when it is the first rune of s or when the rune before it is
// not a consonant. Every other rune, uppercase letters included, is a
// consonant. The caller must pass an index within s; an out-of-range index
// panics.
func isConsonant(s []rune, i int) bool {
	switch s[i] {
	case 'a', 'e', 'i', 'o', 'u':
		return false
	case 'y':
		// The status of 'y' is the opposite of its predecessor's, so the
		// answer is found by recursing one position to the left.
		return i == 0 || !isConsonant(s, i-1)
	}
	return true
}
- func measure(s []rune) uint {
- // Initialize.
- lenS := len(s)
- result := uint(0)
- i := 0
- // Short Circuit.
- if 0 == lenS {
- /////////// RETURN
- return result
- }
- // Ignore (potential) consonant sequence at the beginning of word.
- for isConsonant(s, i) {
- //DEBUG
- //log.Printf("[measure([%s])] Eat Consonant [%d] -> [%s]", string(s), i, string(s[i]))
- i++
- if i >= lenS {
- /////////////// RETURN
- return result
- }
- }
- // For each pair of a vowel sequence followed by a consonant sequence, increment result.
- Outer:
- for i < lenS {
- for !isConsonant(s, i) {
- //DEBUG
- //log.Printf("[measure([%s])] VOWEL [%d] -> [%s]", string(s), i, string(s[i]))
- i++
- if i >= lenS {
- /////////// BREAK
- break Outer
- }
- }
- for isConsonant(s, i) {
- //DEBUG
- //log.Printf("[measure([%s])] CONSONANT [%d] -> [%s]", string(s), i, string(s[i]))
- i++
- if i >= lenS {
- result++
- /////////// BREAK
- break Outer
- }
- }
- result++
- }
- // Return
- return result
- }
// hasSuffix reports whether s ends with suffix and is strictly longer than it,
// i.e. at least one rune of s precedes the suffix.
//
// A slice that merely equals suffix therefore does not match. An empty suffix
// matches any non-empty s; earlier this case indexed suffix out of range and
// panicked.
func hasSuffix(s, suffix []rune) bool {
	offset := len(s) - len(suffix)
	if offset <= 0 {
		return false
	}
	// Compare from the end: the final rune is the most likely to differ, so
	// mismatches are usually detected on the first comparison.
	for i := len(suffix) - 1; i >= 0; i-- {
		if s[offset+i] != suffix[i] {
			return false
		}
	}
	return true
}
- func containsVowel(s []rune) bool {
- lenS := len(s)
- for i := 0; i < lenS; i++ {
- if !isConsonant(s, i) {
- /////////// RETURN
- return true
- }
- }
- return false
- }
- func hasRepeatDoubleConsonantSuffix(s []rune) bool {
- // Initialize.
- lenS := len(s)
- result := false
- // Do it!
- if 2 > lenS {
- result = false
- } else if s[lenS-1] == s[lenS-2] && isConsonant(s, lenS-1) { // Will using isConsonant() cause a problem with "YY"?
- result = true
- } else {
- result = false
- }
- // Return,
- return result
- }
- func hasConsonantVowelConsonantSuffix(s []rune) bool {
- // Initialize.
- lenS := len(s)
- result := false
- // Do it!
- if 3 > lenS {
- result = false
- } else if isConsonant(s, lenS-3) && !isConsonant(s, lenS-2) && isConsonant(s, lenS-1) {
- result = true
- } else {
- result = false
- }
- // Return
- return result
- }
- func step1a(s []rune) []rune {
- // Initialize.
- var result []rune = s
- lenS := len(s)
- // Do it!
- if suffix := []rune("sses"); hasSuffix(s, suffix) {
- lenTrim := 2
- subSlice := s[:lenS-lenTrim]
- result = subSlice
- } else if suffix := []rune("ies"); hasSuffix(s, suffix) {
- lenTrim := 2
- subSlice := s[:lenS-lenTrim]
- result = subSlice
- } else if suffix := []rune("ss"); hasSuffix(s, suffix) {
- result = s
- } else if suffix := []rune("s"); hasSuffix(s, suffix) {
- lenSuffix := 1
- subSlice := s[:lenS-lenSuffix]
- result = subSlice
- }
- // Return.
- return result
- }
- func step1b(s []rune) []rune {
- // Initialize.
- var result []rune = s
- lenS := len(s)
- // Do it!
- if suffix := []rune("eed"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 0 < m {
- lenTrim := 1
- result = s[:lenS-lenTrim]
- }
- } else if suffix := []rune("ed"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- if containsVowel(subSlice) {
- if suffix2 := []rune("at"); hasSuffix(subSlice, suffix2) {
- lenTrim := -1
- result = s[:lenS-lenSuffix-lenTrim]
- } else if suffix2 := []rune("bl"); hasSuffix(subSlice, suffix2) {
- lenTrim := -1
- result = s[:lenS-lenSuffix-lenTrim]
- } else if suffix2 := []rune("iz"); hasSuffix(subSlice, suffix2) {
- lenTrim := -1
- result = s[:lenS-lenSuffix-lenTrim]
- } else if c := subSlice[len(subSlice)-1]; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) {
- lenTrim := 1
- lenSubSlice := len(subSlice)
- result = subSlice[:lenSubSlice-lenTrim]
- } else if c := subSlice[len(subSlice)-1]; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c {
- lenTrim := -1
- result = s[:lenS-lenSuffix-lenTrim]
- result[len(result)-1] = 'e'
- } else {
- result = subSlice
- }
- }
- } else if suffix := []rune("ing"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- if containsVowel(subSlice) {
- if suffix2 := []rune("at"); hasSuffix(subSlice, suffix2) {
- lenTrim := -1
- result = s[:lenS-lenSuffix-lenTrim]
- result[len(result)-1] = 'e'
- } else if suffix2 := []rune("bl"); hasSuffix(subSlice, suffix2) {
- lenTrim := -1
- result = s[:lenS-lenSuffix-lenTrim]
- result[len(result)-1] = 'e'
- } else if suffix2 := []rune("iz"); hasSuffix(subSlice, suffix2) {
- lenTrim := -1
- result = s[:lenS-lenSuffix-lenTrim]
- result[len(result)-1] = 'e'
- } else if c := subSlice[len(subSlice)-1]; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) {
- lenTrim := 1
- lenSubSlice := len(subSlice)
- result = subSlice[:lenSubSlice-lenTrim]
- } else if c := subSlice[len(subSlice)-1]; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c {
- lenTrim := -1
- result = s[:lenS-lenSuffix-lenTrim]
- result[len(result)-1] = 'e'
- } else {
- result = subSlice
- }
- }
- }
- // Return.
- return result
- }
- func step1c(s []rune) []rune {
- // Initialize.
- lenS := len(s)
- result := s
- // Do it!
- if 2 > lenS {
- /////////// RETURN
- return result
- }
- if 'y' == s[lenS-1] && containsVowel(s[:lenS-1]) {
- result[lenS-1] = 'i'
- } else if 'Y' == s[lenS-1] && containsVowel(s[:lenS-1]) {
- result[lenS-1] = 'I'
- }
- // Return.
- return result
- }
- func step2(s []rune) []rune {
- // Initialize.
- lenS := len(s)
- result := s
- // Do it!
- if suffix := []rune("ational"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result[lenS-5] = 'e'
- result = result[:lenS-4]
- }
- } else if suffix := []rune("tional"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = result[:lenS-2]
- }
- } else if suffix := []rune("enci"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result[lenS-1] = 'e'
- }
- } else if suffix := []rune("anci"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result[lenS-1] = 'e'
- }
- } else if suffix := []rune("izer"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = s[:lenS-1]
- }
- } else if suffix := []rune("bli"); hasSuffix(s, suffix) { // --DEPARTURE--
- // } else if suffix := []rune("abli") ; hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result[lenS-1] = 'e'
- }
- } else if suffix := []rune("alli"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = s[:lenS-2]
- }
- } else if suffix := []rune("entli"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = s[:lenS-2]
- }
- } else if suffix := []rune("eli"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = s[:lenS-2]
- }
- } else if suffix := []rune("ousli"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = s[:lenS-2]
- }
- } else if suffix := []rune("ization"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result[lenS-5] = 'e'
- result = s[:lenS-4]
- }
- } else if suffix := []rune("ation"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result[lenS-3] = 'e'
- result = s[:lenS-2]
- }
- } else if suffix := []rune("ator"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result[lenS-2] = 'e'
- result = s[:lenS-1]
- }
- } else if suffix := []rune("alism"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = s[:lenS-3]
- }
- } else if suffix := []rune("iveness"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = s[:lenS-4]
- }
- } else if suffix := []rune("fulness"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = s[:lenS-4]
- }
- } else if suffix := []rune("ousness"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = s[:lenS-4]
- }
- } else if suffix := []rune("aliti"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result = s[:lenS-3]
- }
- } else if suffix := []rune("iviti"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result[lenS-3] = 'e'
- result = result[:lenS-2]
- }
- } else if suffix := []rune("biliti"); hasSuffix(s, suffix) {
- if 0 < measure(s[:lenS-len(suffix)]) {
- result[lenS-5] = 'l'
- result[lenS-4] = 'e'
- result = result[:lenS-3]
- }
- } else if suffix := []rune("logi"); hasSuffix(s, suffix) { // --DEPARTURE--
- if 0 < measure(s[:lenS-len(suffix)]) {
- lenTrim := 1
- result = s[:lenS-lenTrim]
- }
- }
- // Return.
- return result
- }
- func step3(s []rune) []rune {
- // Initialize.
- lenS := len(s)
- result := s
- // Do it!
- if suffix := []rune("icate"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- if 0 < measure(s[:lenS-lenSuffix]) {
- result = result[:lenS-3]
- }
- } else if suffix := []rune("ative"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 0 < m {
- result = subSlice
- }
- } else if suffix := []rune("alize"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- if 0 < measure(s[:lenS-lenSuffix]) {
- result = result[:lenS-3]
- }
- } else if suffix := []rune("iciti"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- if 0 < measure(s[:lenS-lenSuffix]) {
- result = result[:lenS-3]
- }
- } else if suffix := []rune("ical"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- if 0 < measure(s[:lenS-lenSuffix]) {
- result = result[:lenS-2]
- }
- } else if suffix := []rune("ful"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 0 < m {
- result = subSlice
- }
- } else if suffix := []rune("ness"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 0 < m {
- result = subSlice
- }
- }
- // Return.
- return result
- }
- func step4(s []rune) []rune {
- // Initialize.
- lenS := len(s)
- result := s
- // Do it!
- if suffix := []rune("al"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = result[:lenS-lenSuffix]
- }
- } else if suffix := []rune("ance"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = result[:lenS-lenSuffix]
- }
- } else if suffix := []rune("ence"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = result[:lenS-lenSuffix]
- }
- } else if suffix := []rune("er"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ic"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("able"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ible"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ant"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ement"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ment"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ent"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ion"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- c := subSlice[len(subSlice)-1]
- if 1 < m && ('s' == c || 't' == c) {
- result = subSlice
- }
- } else if suffix := []rune("ou"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ism"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ate"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("iti"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ous"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ive"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- } else if suffix := []rune("ize"); hasSuffix(s, suffix) {
- lenSuffix := len(suffix)
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- }
- // Return.
- return result
- }
- func step5a(s []rune) []rune {
- // Initialize.
- lenS := len(s)
- result := s
- // Do it!
- if 'e' == s[lenS-1] {
- lenSuffix := 1
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- } else if 1 == m {
- if c := subSlice[len(subSlice)-1]; !(hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c) {
- result = subSlice
- }
- }
- }
- // Return.
- return result
- }
- func step5b(s []rune) []rune {
- // Initialize.
- lenS := len(s)
- result := s
- // Do it!
- if 2 < lenS && 'l' == s[lenS-2] && 'l' == s[lenS-1] {
- lenSuffix := 1
- subSlice := s[:lenS-lenSuffix]
- m := measure(subSlice)
- if 1 < m {
- result = subSlice
- }
- }
- // Return.
- return result
- }
- func StemString(s string) string {
- // Convert string to []rune
- runeArr := []rune(s)
- // Stem.
- runeArr = Stem(runeArr)
- // Convert []rune to string
- str := string(runeArr)
- // Return.
- return str
- }
- func Stem(s []rune) []rune {
- // Initialize.
- lenS := len(s)
- // Short circuit.
- if 0 == lenS {
- /////////// RETURN
- return s
- }
- // Make all runes lowercase.
- for i := 0; i < lenS; i++ {
- s[i] = unicode.ToLower(s[i])
- }
- // Stem
- result := StemWithoutLowerCasing(s)
- // Return.
- return result
- }
- func StemWithoutLowerCasing(s []rune) []rune {
- // Initialize.
- lenS := len(s)
- // Words that are of length 2 or less is already stemmed.
- // Don't do anything.
- if 2 >= lenS {
- /////////// RETURN
- return s
- }
- // Stem
- s = step1a(s)
- s = step1b(s)
- s = step1c(s)
- s = step2(s)
- s = step3(s)
- s = step4(s)
- s = step5a(s)
- s = step5b(s)
- // Return.
- return s
- }
|