gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

commit be5dd0f478f3c59d08df34c8e2eda1bc4e895a45
parent 9f030f04661d3469a6fd50c14c377aa8ebf130eb
Author: Andrew Laack <andrew@laack.co>
Date:   Wed,  6 May 2026 16:57:35 -0500

Added stemmer for indexing

Diffstat:
M crawl/main.go | 4 ++--
M gemtextparser/parse.go | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M gemtextparser/parse_test.go | 10 ++++++++++
A gemtextparser/stemmer.go | 844 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A gemtextparser/test_data/wordParsing.gmi | 21 +++++++++++++++++++++
M index/main.go | 18 +++++++++++++-----
6 files changed, 969 insertions(+), 7 deletions(-)

diff --git a/crawl/main.go b/crawl/main.go @@ -4,6 +4,7 @@ import ( "database/sql" "errors" "fmt" + "geminisearch/gemtextparser" "github.com/makeworld-the-better-one/go-gemini" _ "github.com/mattn/go-sqlite3" "io" @@ -12,7 +13,6 @@ import ( "strings" "sync" "time" - "geminisearch/gemtextparser" ) var OUTPUT_DIR = "outputs/crawled" @@ -178,7 +178,7 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string, return "", errors.New("Must use absolute paths.") } - // This sometimes seems to include a leading '/' so drop that, no need, directory + // This sometimes seems to include a leading '/' so drop that, no need, directory // hierarchy gives this information to us on the fs. filename := u.EscapedPath() diff --git a/gemtextparser/parse.go b/gemtextparser/parse.go @@ -24,6 +24,85 @@ func stripLeadingWhiteSpace(text string) string { return text } +func strip(s string) string { + var result strings.Builder + for i := 0; i < len(s); i++ { + b := s[i] + if ('a' <= b && b <= 'z') || + ('A' <= b && b <= 'Z') || + ('0' <= b && b <= '9') || + b == ' ' { + result.WriteByte(b) + } + } + return result.String() +} + +func StemmedDocument(body string) []string { + words := ParseWords(body) + results := []string{} + for _, word := range words { + rl := []rune(word) + word := string(Stem(rl)) + results = append(results, word) + } + + return results +} + +// This is only gemtext because it doesn't parse "words" that exist in pre-formatted sections +// because that's frequently ascii art and such. 
+ +func ParseWords(body string) []string { + + lines := strings.Split(body, "\n") + + escaped := false + escape := "```" + results := []string{} + + for _, item := range lines { + + if len(item) >= 3 && strings.Compare(escape, item[:3]) == 0 { + escaped = !escaped + } + + // Keep link name, remove gemini:// stuff from it + if len(item) > 3 { + if item[0] == '=' && item[1] == '>' { + + link := stripLeadingWhiteSpace(item[2:]) + + indexOfSpace := strings.Index(link, " ") + indexOfTab := strings.Index(link, "\t") + + indexOfSpaceOrTab := len(link) + + if indexOfSpace != -1 { + indexOfSpaceOrTab = indexOfSpace + } + if indexOfTab != -1 { + if indexOfTab < indexOfSpace || indexOfSpace == -1 { + indexOfSpaceOrTab = indexOfTab + } + } + + item = link[indexOfSpaceOrTab:] + + } + } + + if !escaped { + item = strip(item) + results = append(results, strings.Fields(item)...) + continue + } + + } + + return results +} + func ParseLinks(body string, currentUrl string) []string { base, err := url.Parse(currentUrl) diff --git a/gemtextparser/parse_test.go b/gemtextparser/parse_test.go @@ -79,3 +79,13 @@ func TestParsePreformattedLinks(t *testing.T) { } } + +func TestParseWords(t *testing.T) { + body := readGemtext(DATA_DIR+"wordParsing.gmi", t) + results := ParseWords(body) + for _, res := range results { + if strings.Compare(res, "FINE") != 0 { + t.Errorf("Unexpected words parsed: %v", results) + } + } +} diff --git a/gemtextparser/stemmer.go b/gemtextparser/stemmer.go @@ -0,0 +1,844 @@ +// SOURCE: https://github.com/reiver/go-porterstemmer/blob/master/porterstemmer.go +// ATTRIBUTION: Created by Charles Iliya Krempeaux - reiver on GitHub +// porter stemmer + +package gemtextparser + +import ( + "unicode" +) + +func isConsonant(s []rune, i int) bool { + + //DEBUG + //log.Printf("isConsonant: [%+v]", string(s[i])) + + result := true + + switch s[i] { + case 'a', 'e', 'i', 'o', 'u': + result = false + case 'y': + if 0 == i { + result = true + } else { + result = !isConsonant(s, 
i-1) + } + default: + result = true + } + + return result +} + +func measure(s []rune) uint { + + // Initialize. + lenS := len(s) + result := uint(0) + i := 0 + + // Short Circuit. + if 0 == lenS { + /////////// RETURN + return result + } + + // Ignore (potential) consonant sequence at the beginning of word. + for isConsonant(s, i) { + + //DEBUG + //log.Printf("[measure([%s])] Eat Consonant [%d] -> [%s]", string(s), i, string(s[i])) + + i++ + if i >= lenS { + /////////////// RETURN + return result + } + } + + // For each pair of a vowel sequence followed by a consonant sequence, increment result. +Outer: + for i < lenS { + + for !isConsonant(s, i) { + + //DEBUG + //log.Printf("[measure([%s])] VOWEL [%d] -> [%s]", string(s), i, string(s[i])) + + i++ + if i >= lenS { + /////////// BREAK + break Outer + } + } + for isConsonant(s, i) { + + //DEBUG + //log.Printf("[measure([%s])] CONSONANT [%d] -> [%s]", string(s), i, string(s[i])) + + i++ + if i >= lenS { + result++ + /////////// BREAK + break Outer + } + } + result++ + } + + // Return + return result +} + +func hasSuffix(s, suffix []rune) bool { + + lenSMinusOne := len(s) - 1 + lenSuffixMinusOne := len(suffix) - 1 + + if lenSMinusOne <= lenSuffixMinusOne { + return false + } else if s[lenSMinusOne] != suffix[lenSuffixMinusOne] { // I suspect checking this first should speed this function up in practice. + /////// RETURN + return false + } else { + + for i := 0; i < lenSuffixMinusOne; i++ { + + if suffix[i] != s[lenSMinusOne-lenSuffixMinusOne+i] { + /////////////// RETURN + return false + } + + } + + } + + return true +} + +func containsVowel(s []rune) bool { + + lenS := len(s) + + for i := 0; i < lenS; i++ { + + if !isConsonant(s, i) { + /////////// RETURN + return true + } + + } + + return false +} + +func hasRepeatDoubleConsonantSuffix(s []rune) bool { + + // Initialize. + lenS := len(s) + + result := false + + // Do it! 
+ if 2 > lenS { + result = false + } else if s[lenS-1] == s[lenS-2] && isConsonant(s, lenS-1) { // Will using isConsonant() cause a problem with "YY"? + result = true + } else { + result = false + } + + // Return, + return result +} + +func hasConsonantVowelConsonantSuffix(s []rune) bool { + + // Initialize. + lenS := len(s) + + result := false + + // Do it! + if 3 > lenS { + result = false + } else if isConsonant(s, lenS-3) && !isConsonant(s, lenS-2) && isConsonant(s, lenS-1) { + result = true + } else { + result = false + } + + // Return + return result +} + +func step1a(s []rune) []rune { + + // Initialize. + var result []rune = s + + lenS := len(s) + + // Do it! + if suffix := []rune("sses"); hasSuffix(s, suffix) { + + lenTrim := 2 + + subSlice := s[:lenS-lenTrim] + + result = subSlice + } else if suffix := []rune("ies"); hasSuffix(s, suffix) { + lenTrim := 2 + + subSlice := s[:lenS-lenTrim] + + result = subSlice + } else if suffix := []rune("ss"); hasSuffix(s, suffix) { + + result = s + } else if suffix := []rune("s"); hasSuffix(s, suffix) { + + lenSuffix := 1 + + subSlice := s[:lenS-lenSuffix] + + result = subSlice + } + + // Return. + return result +} + +func step1b(s []rune) []rune { + + // Initialize. + var result []rune = s + + lenS := len(s) + + // Do it! 
+ if suffix := []rune("eed"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 0 < m { + lenTrim := 1 + + result = s[:lenS-lenTrim] + } + } else if suffix := []rune("ed"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + if containsVowel(subSlice) { + + if suffix2 := []rune("at"); hasSuffix(subSlice, suffix2) { + lenTrim := -1 + + result = s[:lenS-lenSuffix-lenTrim] + } else if suffix2 := []rune("bl"); hasSuffix(subSlice, suffix2) { + lenTrim := -1 + + result = s[:lenS-lenSuffix-lenTrim] + } else if suffix2 := []rune("iz"); hasSuffix(subSlice, suffix2) { + lenTrim := -1 + + result = s[:lenS-lenSuffix-lenTrim] + } else if c := subSlice[len(subSlice)-1]; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) { + lenTrim := 1 + + lenSubSlice := len(subSlice) + + result = subSlice[:lenSubSlice-lenTrim] + } else if c := subSlice[len(subSlice)-1]; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c { + lenTrim := -1 + + result = s[:lenS-lenSuffix-lenTrim] + + result[len(result)-1] = 'e' + } else { + result = subSlice + } + + } + } else if suffix := []rune("ing"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + if containsVowel(subSlice) { + + if suffix2 := []rune("at"); hasSuffix(subSlice, suffix2) { + lenTrim := -1 + + result = s[:lenS-lenSuffix-lenTrim] + + result[len(result)-1] = 'e' + } else if suffix2 := []rune("bl"); hasSuffix(subSlice, suffix2) { + lenTrim := -1 + + result = s[:lenS-lenSuffix-lenTrim] + + result[len(result)-1] = 'e' + } else if suffix2 := []rune("iz"); hasSuffix(subSlice, suffix2) { + lenTrim := -1 + + result = s[:lenS-lenSuffix-lenTrim] + + result[len(result)-1] = 'e' + } else if c := subSlice[len(subSlice)-1]; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) { + lenTrim := 1 + + lenSubSlice := 
len(subSlice) + + result = subSlice[:lenSubSlice-lenTrim] + } else if c := subSlice[len(subSlice)-1]; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c { + lenTrim := -1 + + result = s[:lenS-lenSuffix-lenTrim] + + result[len(result)-1] = 'e' + } else { + result = subSlice + } + + } + } + + // Return. + return result +} + +func step1c(s []rune) []rune { + + // Initialize. + lenS := len(s) + + result := s + + // Do it! + if 2 > lenS { + /////////// RETURN + return result + } + + if 'y' == s[lenS-1] && containsVowel(s[:lenS-1]) { + + result[lenS-1] = 'i' + + } else if 'Y' == s[lenS-1] && containsVowel(s[:lenS-1]) { + + result[lenS-1] = 'I' + + } + + // Return. + return result +} + +func step2(s []rune) []rune { + + // Initialize. + lenS := len(s) + + result := s + + // Do it! + if suffix := []rune("ational"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result[lenS-5] = 'e' + result = result[:lenS-4] + } + } else if suffix := []rune("tional"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = result[:lenS-2] + } + } else if suffix := []rune("enci"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result[lenS-1] = 'e' + } + } else if suffix := []rune("anci"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result[lenS-1] = 'e' + } + } else if suffix := []rune("izer"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = s[:lenS-1] + } + } else if suffix := []rune("bli"); hasSuffix(s, suffix) { // --DEPARTURE-- + // } else if suffix := []rune("abli") ; hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result[lenS-1] = 'e' + } + } else if suffix := []rune("alli"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = s[:lenS-2] + } + } else if suffix := []rune("entli"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = s[:lenS-2] + } + } else if suffix 
:= []rune("eli"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = s[:lenS-2] + } + } else if suffix := []rune("ousli"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = s[:lenS-2] + } + } else if suffix := []rune("ization"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result[lenS-5] = 'e' + + result = s[:lenS-4] + } + } else if suffix := []rune("ation"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result[lenS-3] = 'e' + + result = s[:lenS-2] + } + } else if suffix := []rune("ator"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result[lenS-2] = 'e' + + result = s[:lenS-1] + } + } else if suffix := []rune("alism"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = s[:lenS-3] + } + } else if suffix := []rune("iveness"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = s[:lenS-4] + } + } else if suffix := []rune("fulness"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = s[:lenS-4] + } + } else if suffix := []rune("ousness"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = s[:lenS-4] + } + } else if suffix := []rune("aliti"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result = s[:lenS-3] + } + } else if suffix := []rune("iviti"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result[lenS-3] = 'e' + + result = result[:lenS-2] + } + } else if suffix := []rune("biliti"); hasSuffix(s, suffix) { + if 0 < measure(s[:lenS-len(suffix)]) { + result[lenS-5] = 'l' + result[lenS-4] = 'e' + + result = result[:lenS-3] + } + } else if suffix := []rune("logi"); hasSuffix(s, suffix) { // --DEPARTURE-- + if 0 < measure(s[:lenS-len(suffix)]) { + lenTrim := 1 + + result = s[:lenS-lenTrim] + } + } + + // Return. + return result +} + +func step3(s []rune) []rune { + + // Initialize. + lenS := len(s) + result := s + + // Do it! 
+ if suffix := []rune("icate"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + if 0 < measure(s[:lenS-lenSuffix]) { + result = result[:lenS-3] + } + } else if suffix := []rune("ative"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 0 < m { + result = subSlice + } + } else if suffix := []rune("alize"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + if 0 < measure(s[:lenS-lenSuffix]) { + result = result[:lenS-3] + } + } else if suffix := []rune("iciti"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + if 0 < measure(s[:lenS-lenSuffix]) { + result = result[:lenS-3] + } + } else if suffix := []rune("ical"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + if 0 < measure(s[:lenS-lenSuffix]) { + result = result[:lenS-2] + } + } else if suffix := []rune("ful"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 0 < m { + result = subSlice + } + } else if suffix := []rune("ness"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 0 < m { + result = subSlice + } + } + + // Return. + return result +} + +func step4(s []rune) []rune { + + // Initialize. + lenS := len(s) + result := s + + // Do it! 
+ if suffix := []rune("al"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = result[:lenS-lenSuffix] + } + } else if suffix := []rune("ance"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = result[:lenS-lenSuffix] + } + } else if suffix := []rune("ence"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = result[:lenS-lenSuffix] + } + } else if suffix := []rune("er"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ic"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("able"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ible"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ant"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ement"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ment"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ent"); hasSuffix(s, suffix) { + lenSuffix := 
len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ion"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + c := subSlice[len(subSlice)-1] + + if 1 < m && ('s' == c || 't' == c) { + result = subSlice + } + } else if suffix := []rune("ou"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ism"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ate"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("iti"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ous"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ive"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } else if suffix := []rune("ize"); hasSuffix(s, suffix) { + lenSuffix := len(suffix) + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } + + // Return. + return result +} + +func step5a(s []rune) []rune { + + // Initialize. + lenS := len(s) + result := s + + // Do it! 
+ if 'e' == s[lenS-1] { + lenSuffix := 1 + + subSlice := s[:lenS-lenSuffix] + if len(subSlice) == 0 { + return result + } + m := measure(subSlice) + + if 1 < m { + result = subSlice + } else if 1 == m { + if c := subSlice[len(subSlice)-1]; !(hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c) { + result = subSlice + } + } + } + + // Return. + return result +} + +func step5b(s []rune) []rune { + + // Initialize. + lenS := len(s) + result := s + + // Do it! + if 2 < lenS && 'l' == s[lenS-2] && 'l' == s[lenS-1] { + + lenSuffix := 1 + + subSlice := s[:lenS-lenSuffix] + + m := measure(subSlice) + + if 1 < m { + result = subSlice + } + } + + // Return. + return result +} + +func StemString(s string) string { + + // Convert string to []rune + runeArr := []rune(s) + + // Stem. + runeArr = Stem(runeArr) + + // Convert []rune to string + str := string(runeArr) + + // Return. + return str +} + +func Stem(s []rune) []rune { + + // Initialize. + lenS := len(s) + + // Short circuit. + if 0 == lenS { + /////////// RETURN + return s + } + + // Make all runes lowercase. + for i := 0; i < lenS; i++ { + s[i] = unicode.ToLower(s[i]) + } + + // Stem + result := StemWithoutLowerCasing(s) + + // Return. + return result +} + +func StemWithoutLowerCasing(s []rune) []rune { + + // Initialize. + lenS := len(s) + + // Words that are of length 2 or less is already stemmed. + // Don't do anything. + if 2 >= lenS { + /////////// RETURN + return s + } + + // Stem + s = step1a(s) + s = step1b(s) + s = step1c(s) + s = step2(s) + s = step3(s) + s = step4(s) + s = step5a(s) + s = step5b(s) + + // Return. 
+ return s +} diff --git a/gemtextparser/test_data/wordParsing.gmi b/gemtextparser/test_data/wordParsing.gmi @@ -0,0 +1,21 @@ +FINE + +> FINE + +# FINE + +## FINE + +### FINE + +```BAD +BAD BAD +``` + +"FINE" +`FINE` + +=> https://BADBAD.com FINE FINE FINE FINE + +``` +BAD diff --git a/index/main.go b/index/main.go @@ -2,11 +2,11 @@ package main import ( "database/sql" - "os" "errors" "fmt" "geminisearch/gemtextparser" _ "github.com/mattn/go-sqlite3" + "os" ) // TODO: centralize this? @@ -18,7 +18,7 @@ var INDEXED_DB_NAME = "main.db" // TODO: Probably return error instead of panic... func connectToDB(directory string, name string) *sql.DB { - if _, err := os.Stat(directory+"/"+name); errors.Is(err, os.ErrNotExist) { + if _, err := os.Stat(directory + "/" + name); errors.Is(err, os.ErrNotExist) { panic(err) } db, err := sql.Open("sqlite3", directory+"/"+name) @@ -34,12 +34,13 @@ func ensureIndexDB(dir string, filename string) { os.MkdirAll(dir, 0755) db, err := sql.Open("sqlite3", "./"+dir+"/"+filename) - defer db.Close() if err != nil { panic(err) } + defer db.Close() + // There could be multiple links from the same source to the same destination, though that'd be kinda stupid createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);" @@ -57,7 +58,7 @@ func ensureIndexDB(dir string, filename string) { func getNextPath(dbCrawl *sql.DB) (string, string) { - // It's not guaranteed this site will be indexed because it's possible + // It's not guaranteed this site will be indexed because it's possible // the process fails during that but is still removed from the db // We could fix this with a locking mechanism, but I don't know how necessary that'd be @@ -81,7 +82,7 @@ func insertLinks(db *sql.DB, source string, destinations []string) error { // createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);" insertLinkQuery := "INSERT INTO link (source, destination, added_timestamp) VALUES (?, ?, 
CURRENT_TIMESTAMP)" - + // TODO: Why is it not parsing error return? tx, _ := db.Begin() @@ -126,4 +127,11 @@ func main() { panic(err) } + words := gemtextparser.StemmedDocument(body) + + for _, word := range words { + // TODO: Setup inverted index + fmt.Printf("%s,", word) + } + }