commit be5dd0f478f3c59d08df34c8e2eda1bc4e895a45
parent 9f030f04661d3469a6fd50c14c377aa8ebf130eb
Author: Andrew Laack <andrew@laack.co>
Date: Wed, 6 May 2026 16:57:35 -0500
Added stemmer for indexing
Diffstat:
6 files changed, 969 insertions(+), 7 deletions(-)
diff --git a/crawl/main.go b/crawl/main.go
@@ -4,6 +4,7 @@ import (
"database/sql"
"errors"
"fmt"
+ "geminisearch/gemtextparser"
"github.com/makeworld-the-better-one/go-gemini"
_ "github.com/mattn/go-sqlite3"
"io"
@@ -12,7 +13,6 @@ import (
"strings"
"sync"
"time"
- "geminisearch/gemtextparser"
)
var OUTPUT_DIR = "outputs/crawled"
@@ -178,7 +178,7 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string,
return "", errors.New("Must use absolute paths.")
}
- // This sometimes seems to include a leading '/' so drop that, no need, directory
+ // This sometimes seems to include a leading '/' so drop that, no need, directory
// hierarchy gives this information to us on the fs.
filename := u.EscapedPath()
diff --git a/gemtextparser/parse.go b/gemtextparser/parse.go
@@ -24,6 +24,85 @@ func stripLeadingWhiteSpace(text string) string {
return text
}
+// strip reduces s to ASCII letters, ASCII digits and space separators so the
+// result can be split into words with strings.Fields.
+//
+// ASCII whitespace other than ' ' (tab, newline, carriage return, vertical
+// tab, form feed) is rewritten to ' ' rather than dropped; dropping it would
+// fuse neighbouring words ("foo\tbar" would become "foobar"). Every other
+// byte, including punctuation and all bytes of non-ASCII characters, is
+// removed.
+func strip(s string) string {
+	var result strings.Builder
+	result.Grow(len(s))
+	for i := 0; i < len(s); i++ {
+		switch b := s[i]; {
+		case 'a' <= b && b <= 'z', 'A' <= b && b <= 'Z', '0' <= b && b <= '9':
+			result.WriteByte(b)
+		case b == ' ', b == '\t', b == '\n', b == '\r', b == '\v', b == '\f':
+			// Keep a word boundary wherever the input had one.
+			result.WriteByte(' ')
+		}
+	}
+	return result.String()
+}
+
+// StemmedDocument extracts the words of the gemtext document body with
+// ParseWords and returns the stem of each one, in document order.
+// Stem lower-cases its input, so the returned stems are lower case.
+// The returned slice is never nil.
+func StemmedDocument(body string) []string {
+	words := ParseWords(body)
+	stems := make([]string, len(words))
+	for i := range words {
+		stems[i] = string(Stem([]rune(words[i])))
+	}
+	return stems
+}
+
+// ParseWords returns the indexable words of a gemtext document, in document
+// order. The returned slice is never nil.
+//
+// It is gemtext-specific in three ways:
+//   - Lines inside preformatted blocks are skipped, because they are
+//     frequently ASCII art and similar non-prose content.
+//   - Preformat toggle lines (lines starting with "```") contribute no words:
+//     text after an opening fence is alt text, and text after a closing
+//     fence must be ignored.
+//   - For link lines ("=>") only the human-readable label is kept; the URL
+//     is dropped.
+//
+// Each remaining line is passed through strip and split on whitespace.
+func ParseWords(body string) []string {
+	const fence = "```"
+
+	results := []string{}
+	preformatted := false
+
+	for _, line := range strings.Split(body, "\n") {
+		if strings.HasPrefix(line, fence) {
+			preformatted = !preformatted
+			continue
+		}
+		if preformatted {
+			continue
+		}
+
+		if strings.HasPrefix(line, "=>") {
+			// The URL runs up to the first space or tab; whatever follows
+			// is the label. A link without a label yields no words.
+			link := stripLeadingWhiteSpace(line[2:])
+			labelStart := strings.IndexAny(link, " \t")
+			if labelStart == -1 {
+				labelStart = len(link)
+			}
+			line = link[labelStart:]
+		}
+
+		results = append(results, strings.Fields(strip(line))...)
+	}
+
+	return results
+}
+
func ParseLinks(body string, currentUrl string) []string {
base, err := url.Parse(currentUrl)
diff --git a/gemtextparser/parse_test.go b/gemtextparser/parse_test.go
@@ -79,3 +79,13 @@ func TestParsePreformattedLinks(t *testing.T) {
}
}
+
+// TestParseWords checks that ParseWords keeps every word outside preformatted
+// blocks and link URLs, and nothing else. In the wordParsing.gmi fixture
+// every word that should survive is "FINE" and every word that should be
+// dropped is "BAD".
+func TestParseWords(t *testing.T) {
+	// "FINE" appears 11 times in indexable positions of the fixture: seven
+	// single-word lines plus four words in the link label. Checking the
+	// count stops the test from passing when nothing is parsed at all.
+	const wantWords = 11
+
+	body := readGemtext(DATA_DIR+"wordParsing.gmi", t)
+	results := ParseWords(body)
+
+	if len(results) != wantWords {
+		t.Errorf("Parsed %d words, want %d: %v", len(results), wantWords, results)
+	}
+	for _, res := range results {
+		if res != "FINE" {
+			t.Errorf("Unexpected words parsed: %v", results)
+			break // One report of the full slice is enough.
+		}
+	}
+}
diff --git a/gemtextparser/stemmer.go b/gemtextparser/stemmer.go
@@ -0,0 +1,844 @@
+// SOURCE: https://github.com/reiver/go-porterstemmer/blob/master/porterstemmer.go
+// ATTRIBUTION: Created by Charles Iliya Krempeaux - reiver on GitHub
+// porter stemmer
+
+package gemtextparser
+
+import (
+ "unicode"
+)
+
+// isConsonant reports whether the rune at index i of s counts as a consonant
+// for the stemmer.
+//
+// 'a', 'e', 'i', 'o' and 'u' are vowels. 'y' is a consonant at index 0;
+// elsewhere it is a consonant exactly when the rune before it is not one.
+// Every other rune, including upper-case letters, is a consonant.
+// i must be a valid index into s.
+func isConsonant(s []rune, i int) bool {
+	switch s[i] {
+	case 'a', 'e', 'i', 'o', 'u':
+		return false
+	case 'y':
+		return i == 0 || !isConsonant(s, i-1)
+	}
+	return true
+}
+
+// measure returns the Porter measure of s: the number of times a run of
+// vowels is immediately followed by a run of consonants. A leading consonant
+// run and a trailing vowel run do not count, so the result equals the number
+// of positions where a consonant directly follows a vowel.
+// An empty slice has measure 0.
+func measure(s []rune) uint {
+	var m uint
+	prevVowel := false
+	for i := range s {
+		consonant := isConsonant(s, i)
+		if consonant && prevVowel {
+			m++
+		}
+		prevVowel = !consonant
+	}
+	return m
+}
+
+// hasSuffix reports whether s ends with suffix AND is strictly longer than
+// it, i.e. whether removing the suffix would leave a non-empty stem. A word
+// that consists of the suffix alone therefore does not match.
+//
+// An empty suffix matches every non-empty s (previously it indexed
+// suffix[-1] and panicked).
+func hasSuffix(s, suffix []rune) bool {
+	stemLen := len(s) - len(suffix)
+	if stemLen <= 0 {
+		return false
+	}
+	for i, r := range suffix {
+		if s[stemLen+i] != r {
+			return false
+		}
+	}
+	return true
+}
+
+// containsVowel reports whether at least one position of s is a vowel
+// according to isConsonant. It returns false for an empty slice.
+func containsVowel(s []rune) bool {
+	for i := range s {
+		if !isConsonant(s, i) {
+			return true
+		}
+	}
+	return false
+}
+
+// hasRepeatDoubleConsonantSuffix reports whether s ends in the same rune
+// twice and that final rune is a consonant according to isConsonant
+// (e.g. "hopp", "fall"). Slices shorter than two runes never match.
+//
+// NOTE(review): the original author questioned whether relying on
+// isConsonant misbehaves for a "yy" ending; that is still unverified.
+func hasRepeatDoubleConsonantSuffix(s []rune) bool {
+	n := len(s)
+	if n < 2 {
+		return false
+	}
+	return s[n-1] == s[n-2] && isConsonant(s, n-1)
+}
+
+// hasConsonantVowelConsonantSuffix reports whether the last three runes of s
+// are consonant, vowel, consonant in that order, as classified by
+// isConsonant. Slices shorter than three runes never match. Callers apply
+// the additional "last rune is not w, x or y" restriction themselves.
+func hasConsonantVowelConsonantSuffix(s []rune) bool {
+	n := len(s)
+	return n >= 3 &&
+		isConsonant(s, n-3) &&
+		!isConsonant(s, n-2) &&
+		isConsonant(s, n-1)
+}
+
+// step1a handles plural endings. The first matching rule wins:
+//
+//	sses -> ss
+//	ies  -> i
+//	ss   -> ss (unchanged)
+//	s    -> (removed)
+//
+// The result is s itself or a prefix of s sharing its backing array.
+func step1a(s []rune) []rune {
+	n := len(s)
+	switch {
+	case hasSuffix(s, []rune("sses")), hasSuffix(s, []rune("ies")):
+		// Both rules drop the final two runes.
+		return s[:n-2]
+	case hasSuffix(s, []rune("ss")):
+		return s
+	case hasSuffix(s, []rune("s")):
+		return s[:n-1]
+	}
+	return s
+}
+
+// step1b handles the "eed", "ed" and "ing" endings. The first matching
+// suffix decides the outcome; later suffixes are not tried even when the
+// matching rule leaves the word unchanged.
+//
+//	eed -> ee   when the stem before "eed" has measure > 0
+//	ed  -> see step1bTidy, applied when the stem contains a vowel
+//	ing -> see step1bTidy, applied when the stem contains a vowel
+//
+// The result shares s's backing array, and one rune of s may be overwritten
+// with 'e' in place.
+func step1b(s []rune) []rune {
+	n := len(s)
+	switch {
+	case hasSuffix(s, []rune("eed")):
+		if measure(s[:n-3]) > 0 {
+			return s[:n-1]
+		}
+		return s
+	case hasSuffix(s, []rune("ed")):
+		return step1bTidy(s, n-2)
+	case hasSuffix(s, []rune("ing")):
+		return step1bTidy(s, n-3)
+	}
+	return s
+}
+
+// step1bTidy removes the "ed"/"ing" suffix that starts at index stemLen of s
+// and repairs the remaining stem. stemLen must be at least 1 and less than
+// len(s). If the stem contains no vowel, s is returned untouched. Otherwise
+// the first matching rule applies:
+//
+//	stem ends in at, bl or iz                    -> stem + "e"
+//	stem ends in a double consonant not l, s, z  -> stem minus its last rune
+//	measure(stem) == 1 and stem ends
+//	consonant-vowel-consonant, last not w, x, y  -> stem + "e"
+//	otherwise                                    -> stem
+func step1bTidy(s []rune, stemLen int) []rune {
+	stem := s[:stemLen]
+	if !containsVowel(stem) {
+		return s
+	}
+
+	// withE returns stem + "e", reusing the first rune of the removed suffix
+	// as storage for the 'e'.
+	withE := func() []rune {
+		r := s[:stemLen+1]
+		r[stemLen] = 'e'
+		return r
+	}
+
+	last := stem[stemLen-1]
+	switch {
+	case hasSuffix(stem, []rune("at")),
+		hasSuffix(stem, []rune("bl")),
+		hasSuffix(stem, []rune("iz")):
+		return withE()
+	case last != 'l' && last != 's' && last != 'z' && hasRepeatDoubleConsonantSuffix(stem):
+		return stem[:stemLen-1]
+	case measure(stem) == 1 && hasConsonantVowelConsonantSuffix(stem) &&
+		last != 'w' && last != 'x' && last != 'y':
+		return withE()
+	}
+	return stem
+}
+
+// step1c turns a final 'y' into 'i' (and a final 'Y' into 'I') when the rest
+// of the word contains a vowel. The replacement is written into s in place
+// and s is returned. Words shorter than two runes are returned unchanged.
+func step1c(s []rune) []rune {
+	n := len(s)
+	if n < 2 {
+		return s
+	}
+
+	last := n - 1
+	switch s[last] {
+	case 'y':
+		if containsVowel(s[:last]) {
+			s[last] = 'i'
+		}
+	case 'Y':
+		if containsVowel(s[:last]) {
+			s[last] = 'I'
+		}
+	}
+	return s
+}
+
+// step2Rules lists the step 2 suffix rewrites in the order they are tried.
+// Every replacement is no longer than its suffix, so a rewrite always fits
+// in place.
+var step2Rules = []struct {
+	suffix, replacement []rune
+}{
+	{[]rune("ational"), []rune("ate")},
+	{[]rune("tional"), []rune("tion")},
+	{[]rune("enci"), []rune("ence")},
+	{[]rune("anci"), []rune("ance")},
+	{[]rune("izer"), []rune("ize")},
+	{[]rune("bli"), []rune("ble")}, // --DEPARTURE-- (instead of "abli")
+	{[]rune("alli"), []rune("al")},
+	{[]rune("entli"), []rune("ent")},
+	{[]rune("eli"), []rune("e")},
+	{[]rune("ousli"), []rune("ous")},
+	{[]rune("ization"), []rune("ize")},
+	{[]rune("ation"), []rune("ate")},
+	{[]rune("ator"), []rune("ate")},
+	{[]rune("alism"), []rune("al")},
+	{[]rune("iveness"), []rune("ive")},
+	{[]rune("fulness"), []rune("ful")},
+	{[]rune("ousness"), []rune("ous")},
+	{[]rune("aliti"), []rune("al")},
+	{[]rune("iviti"), []rune("ive")},
+	{[]rune("biliti"), []rune("ble")},
+	{[]rune("logi"), []rune("log")}, // --DEPARTURE--
+}
+
+// step2 rewrites the first suffix from step2Rules that s ends with, provided
+// the stem before that suffix has measure > 0. Only the first matching
+// suffix is considered: if its stem has measure 0 the word is returned
+// unchanged and no further rule is tried.
+//
+// The rewrite is done in place, so the result shares s's backing array.
+func step2(s []rune) []rune {
+	for _, rule := range step2Rules {
+		if !hasSuffix(s, rule.suffix) {
+			continue
+		}
+		stemLen := len(s) - len(rule.suffix)
+		if measure(s[:stemLen]) == 0 {
+			return s
+		}
+		end := stemLen + copy(s[stemLen:], rule.replacement)
+		return s[:end]
+	}
+	return s
+}
+
+// step3Rules lists the step 3 suffixes in the order they are tried, with the
+// number of trailing runes each rule removes:
+//
+//	icate -> ic, ative -> (removed), alize -> al, iciti -> ic,
+//	ical -> ic, ful -> (removed), ness -> (removed)
+var step3Rules = []struct {
+	suffix []rune
+	trim   int
+}{
+	{[]rune("icate"), 3},
+	{[]rune("ative"), 5},
+	{[]rune("alize"), 3},
+	{[]rune("iciti"), 3},
+	{[]rune("ical"), 2},
+	{[]rune("ful"), 3},
+	{[]rune("ness"), 4},
+}
+
+// step3 shortens the first suffix from step3Rules that s ends with, provided
+// the stem before that suffix has measure > 0. Only the first matching
+// suffix is considered; if its stem has measure 0 the word is returned
+// unchanged. The result is s or a prefix of s; nothing is written to s.
+func step3(s []rune) []rune {
+	for _, rule := range step3Rules {
+		if !hasSuffix(s, rule.suffix) {
+			continue
+		}
+		if measure(s[:len(s)-len(rule.suffix)]) == 0 {
+			return s
+		}
+		return s[:len(s)-rule.trim]
+	}
+	return s
+}
+
+// step4Rules lists the suffixes step4 removes, in the order they are tried.
+// afterST marks the rule that additionally requires the stem to end in 's'
+// or 't' (only "ion").
+var step4Rules = []struct {
+	suffix  []rune
+	afterST bool
+}{
+	{suffix: []rune("al")},
+	{suffix: []rune("ance")},
+	{suffix: []rune("ence")},
+	{suffix: []rune("er")},
+	{suffix: []rune("ic")},
+	{suffix: []rune("able")},
+	{suffix: []rune("ible")},
+	{suffix: []rune("ant")},
+	{suffix: []rune("ement")},
+	{suffix: []rune("ment")},
+	{suffix: []rune("ent")},
+	{suffix: []rune("ion"), afterST: true},
+	{suffix: []rune("ou")},
+	{suffix: []rune("ism")},
+	{suffix: []rune("ate")},
+	{suffix: []rune("iti")},
+	{suffix: []rune("ous")},
+	{suffix: []rune("ive")},
+	{suffix: []rune("ize")},
+}
+
+// step4 removes the first suffix from step4Rules that s ends with, provided
+// the stem before it has measure > 1 (and, for "ion", the stem ends in 's'
+// or 't'). Only the first matching suffix is considered; when its condition
+// fails the word is returned unchanged. The result is s or a prefix of s;
+// nothing is written to s.
+func step4(s []rune) []rune {
+	for _, rule := range step4Rules {
+		if !hasSuffix(s, rule.suffix) {
+			continue
+		}
+		stem := s[:len(s)-len(rule.suffix)]
+		if measure(stem) <= 1 {
+			return s
+		}
+		if rule.afterST {
+			// hasSuffix guarantees the stem is non-empty.
+			if c := stem[len(stem)-1]; c != 's' && c != 't' {
+				return s
+			}
+		}
+		return stem
+	}
+	return s
+}
+
+// step5a removes a final 'e' when the stem before it either has measure > 1,
+// or has measure 1 and does NOT end consonant-vowel-consonant with a last
+// rune other than w, x or y.
+//
+// Words shorter than two runes are returned unchanged; this covers the empty
+// slice (which previously panicked on s[lenS-1]) and a lone "e". The result
+// is s or a prefix of s.
+func step5a(s []rune) []rune {
+	n := len(s)
+	if n < 2 || s[n-1] != 'e' {
+		return s
+	}
+
+	stem := s[:n-1]
+	switch m := measure(stem); {
+	case m > 1:
+		return stem
+	case m == 1:
+		last := stem[len(stem)-1]
+		if !(hasConsonantVowelConsonantSuffix(stem) && last != 'w' && last != 'x' && last != 'y') {
+			return stem
+		}
+	}
+	return s
+}
+
+// step5b reduces a final "ll" to "l" when the word is longer than two runes
+// and the word without its last rune has measure > 1. The result is s or a
+// prefix of s.
+func step5b(s []rune) []rune {
+	n := len(s)
+	if n <= 2 || s[n-1] != 'l' || s[n-2] != 'l' {
+		return s
+	}
+	if stem := s[:n-1]; measure(stem) > 1 {
+		return stem
+	}
+	return s
+}
+
+// StemString is the string convenience wrapper around Stem: it converts s to
+// runes, stems them (which lower-cases the word first) and returns the
+// result as a string.
+func StemString(s string) string {
+	return string(Stem([]rune(s)))
+}
+
+// Stem lower-cases s in place and returns its stem as computed by
+// StemWithoutLowerCasing. An empty slice is returned as is.
+//
+// The caller's slice is modified: every rune is overwritten with its
+// lower-case form, and the result shares s's backing array.
+func Stem(s []rune) []rune {
+	if len(s) == 0 {
+		return s
+	}
+	for i, r := range s {
+		s[i] = unicode.ToLower(r)
+	}
+	return StemWithoutLowerCasing(s)
+}
+
+// StemWithoutLowerCasing runs the stemming steps (1a, 1b, 1c, 2, 3, 4, 5a,
+// 5b) over s in that order and returns the result. Words of two runes or
+// fewer are considered already stemmed and are returned unchanged.
+//
+// The steps only recognise lower-case suffixes and vowels, so callers are
+// expected to pass lower-case input (Stem does this). The steps may modify
+// s in place.
+func StemWithoutLowerCasing(s []rune) []rune {
+	if len(s) <= 2 {
+		return s
+	}
+
+	steps := []func([]rune) []rune{
+		step1a, step1b, step1c,
+		step2, step3, step4,
+		step5a, step5b,
+	}
+	for _, step := range steps {
+		s = step(s)
+	}
+	return s
+}
diff --git a/gemtextparser/test_data/wordParsing.gmi b/gemtextparser/test_data/wordParsing.gmi
@@ -0,0 +1,21 @@
+FINE
+
+> FINE
+
+# FINE
+
+## FINE
+
+### FINE
+
+```BAD
+BAD BAD
+```
+
+"FINE"
+`FINE`
+
+=> https://BADBAD.com FINE FINE FINE FINE
+
+```
+BAD
diff --git a/index/main.go b/index/main.go
@@ -2,11 +2,11 @@ package main
import (
"database/sql"
- "os"
"errors"
"fmt"
"geminisearch/gemtextparser"
_ "github.com/mattn/go-sqlite3"
+ "os"
)
// TODO: centralize this?
@@ -18,7 +18,7 @@ var INDEXED_DB_NAME = "main.db"
// TODO: Probably return error instead of panic...
func connectToDB(directory string, name string) *sql.DB {
- if _, err := os.Stat(directory+"/"+name); errors.Is(err, os.ErrNotExist) {
+ if _, err := os.Stat(directory + "/" + name); errors.Is(err, os.ErrNotExist) {
panic(err)
}
db, err := sql.Open("sqlite3", directory+"/"+name)
@@ -34,12 +34,13 @@ func ensureIndexDB(dir string, filename string) {
os.MkdirAll(dir, 0755)
db, err := sql.Open("sqlite3", "./"+dir+"/"+filename)
- defer db.Close()
if err != nil {
panic(err)
}
+ defer db.Close()
+
// There could be multiple links from the same source to the same destination, though that'd be kinda stupid
createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
@@ -57,7 +58,7 @@ func ensureIndexDB(dir string, filename string) {
func getNextPath(dbCrawl *sql.DB) (string, string) {
- // It's not guaranteed this site will be indexed because it's possible
+ // It's not guaranteed this site will be indexed because it's possible
// the process fails during that but is still removed from the db
// We could fix this with a locking mechanism, but I don't know how necessary that'd be
@@ -81,7 +82,7 @@ func insertLinks(db *sql.DB, source string, destinations []string) error {
// createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
insertLinkQuery := "INSERT INTO link (source, destination, added_timestamp) VALUES (?, ?, CURRENT_TIMESTAMP)"
-
+
// TODO: Why is it not parsing error return?
tx, _ := db.Begin()
@@ -126,4 +127,11 @@ func main() {
panic(err)
}
+ words := gemtextparser.StemmedDocument(body)
+
+ for _, word := range words {
+ // TODO: Setup inverted index
+ fmt.Printf("%s,", word)
+ }
+
}