gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

commit 5f6c510a5e5bc484f72e855e2d51ae0ff3cf2fb2
parent 5676aad6afb2448cc5185cc809feeedaef0fd184
Author: Andrew Laack <andrew@laack.co>
Date:   Wed,  6 May 2026 14:38:03 -0500

Renaming modules, setting up indexing

Diffstat:
Mcrawl/main.go | 4++--
Dgemtext_parser/parse.go | 100-------------------------------------------------------------------------------
Dgemtext_parser/parse_test.go | 81-------------------------------------------------------------------------------
Agemtextparser/parse.go | 100+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Agemtextparser/parse_test.go | 81+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rgemtext_parser/test_data/absoluteLinks.gmi -> gemtextparser/test_data/absoluteLinks.gmi | 0
Rgemtext_parser/test_data/nonStandard.gmi -> gemtextparser/test_data/nonStandard.gmi | 0
Rgemtext_parser/test_data/preformatedLinks.gmi -> gemtextparser/test_data/preformatedLinks.gmi | 0
Rgemtext_parser/test_data/relativeLinks.gmi -> gemtextparser/test_data/relativeLinks.gmi | 0
Mgo.mod | 2+-
Dindex/go.mod | 5-----
Dindex/go.sum | 2--
Mindex/main.go | 2+-
13 files changed, 185 insertions(+), 192 deletions(-)

diff --git a/crawl/main.go b/crawl/main.go @@ -12,7 +12,7 @@ import ( "strings" "sync" "time" - "gemini-search/gemtext_parser" + "geminisearch/gemtextparser" ) var OUTPUT_DIR = "outputs/crawled" @@ -272,7 +272,7 @@ func worker(db *sql.DB, wg *sync.WaitGroup) { fmt.Printf("Fetched %s\n", currentUrl) - forwardGeminiLinks := gemtext_parser.ParseLinks(body, currentUrl) + forwardGeminiLinks := gemtextparser.ParseLinks(body, currentUrl) outputLocation, err := writeSiteOutput(OUTPUT_DIR, body, currentUrl) diff --git a/gemtext_parser/parse.go b/gemtext_parser/parse.go @@ -1,100 +0,0 @@ -package gemtext_parser - -import ( - "fmt" - "net/url" - "strings" -) - -func stripLeadingWhiteSpace(text string) string { - - for len(text) > 0 { - if text[0] == ' ' || text[0] == '\t' { - if len(text) > 1 { - text = text[1:] - } else { - text = "" - return text - } - } else { - return text - } - } - - return text -} - -func ParseLinks(body string, currentUrl string) []string { - - base, err := url.Parse(currentUrl) - - if err != nil { - panic(err) - } - - lines := strings.Split(body, "\n") - - links := []string{} - - escaped := false - escape := "```" - - for _, item := range lines { - - // must start with escape characters, the rest doesn't matter to us - // > Any line whose first three characters are "```" (...) are preformatted toggle lines - if len(item) >= 3 && strings.Compare(escape, item[:3]) == 0 { - escaped = !escaped - } - - if len(item) > 3 && !escaped { - if item[0] == '=' && item[1] == '>' { - - // sometimes links end with a \r, but that isn't valid so we won't allow it - links = append(links, stripLeadingWhiteSpace(item[2:])) - } - } - - } - - geminiLinks := []string{} - - for _, item := range links { - - // this is for finding the text associated with the link - - indexOfSpace := strings.Index(item, " ") - indexOfTab := strings.Index(item, "\t") - - // default if there aren't any - indexOfSpaceOrTab := len(item) - - if indexOfSpace != -1 { - indexOfSpaceOrTab = indexOfSpace - } - - if indexOfTab != -1 { - if indexOfTab < indexOfSpace || indexOfSpace == -1 { - indexOfSpaceOrTab = indexOfTab - } - } - - if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 { - geminiLinks = append(geminiLinks, item[0:indexOfSpaceOrTab]) - } - if strings.Contains(item, "://") == false { - // relative link - - u, err := url.Parse(item[0:indexOfSpaceOrTab]) - - if err != nil { - fmt.Printf("Unable to parse link: %s, %s\n", item, err) - continue - } - - geminiLinks = append(geminiLinks, base.ResolveReference(u).String()) - } - } - - return geminiLinks -} diff --git a/gemtext_parser/parse_test.go b/gemtext_parser/parse_test.go @@ -1,81 +0,0 @@ -package gemtext_parser - -import ( - "os" - "strings" - "testing" -) - -var DATA_DIR = "test_data/" -var URL = "gemini://laack.co" - -func hasSpaceOrTab(text string) bool { - return strings.Contains(text, " ") || strings.Contains(text, "\t") -} - -func readGemtext(path string, t *testing.T) string { - - file, err := os.ReadFile(path) - - if err != nil { - t.Errorf("Unable to read %s", path) - return "" - } - - return string(file) -} - -func TestAbsoluteRelativeParsingMatch(t *testing.T) { - - absBody := readGemtext(DATA_DIR+"absoluteLinks.gmi", t) - absLinks := ParseLinks(absBody, URL) - - if len(absLinks) != 5 { - t.Errorf("Unexpected number of absolute links") - } - - relBody := readGemtext(DATA_DIR+"relativeLinks.gmi", t) - relLinks := ParseLinks(relBody, URL) - - if len(relLinks) != 5 { - t.Errorf("Unexpected number of relative links") - } - - for index, link := range relLinks { - if strings.Compare(link, absLinks[index]) != 0 { - t.Errorf("Links don't match: %s => %s", link, absLinks[index]) - } - } -} - -func TestParseNonStandardLinks(t *testing.T) { - - body := readGemtext(DATA_DIR+"nonStandard.gmi", t) - links := ParseLinks(body, URL) - - if len(links) != 12 { - t.Errorf("Unexpected link count: %v", links) - } - - for _, link := range links { - if hasSpaceOrTab(link) { - - // this seeems like the best place to do this check because these nonstandard links - // test all variations of whitespace that can exist - - t.Errorf("Link contains a space or tab, improper whitespace handling: %s", link) - } - } - -} - -func TestParsePreformattedLinks(t *testing.T) { - - body := readGemtext(DATA_DIR+"preformatedLinks.gmi", t) - links := ParseLinks(body, URL) - - if len(links) != 2 { - t.Errorf("Unexpected link count: %v", links) - } - -} diff --git a/gemtextparser/parse.go b/gemtextparser/parse.go @@ -0,0 +1,100 @@ +package gemtextparser + +import ( + "fmt" + "net/url" + "strings" +) + +func stripLeadingWhiteSpace(text string) string { + + for len(text) > 0 { + if text[0] == ' ' || text[0] == '\t' { + if len(text) > 1 { + text = text[1:] + } else { + text = "" + return text + } + } else { + return text + } + } + + return text +} + +func ParseLinks(body string, currentUrl string) []string { + + base, err := url.Parse(currentUrl) + + if err != nil { + panic(err) + } + + lines := strings.Split(body, "\n") + + links := []string{} + + escaped := false + escape := "```" + + for _, item := range lines { + + // must start with escape characters, the rest doesn't matter to us + // > Any line whose first three characters are "```" (...) are preformatted toggle lines + if len(item) >= 3 && strings.Compare(escape, item[:3]) == 0 { + escaped = !escaped + } + + if len(item) > 3 && !escaped { + if item[0] == '=' && item[1] == '>' { + + // sometimes links end with a \r, but that isn't valid so we won't allow it + links = append(links, stripLeadingWhiteSpace(item[2:])) + } + } + + } + + geminiLinks := []string{} + + for _, item := range links { + + // this is for finding the text associated with the link + + indexOfSpace := strings.Index(item, " ") + indexOfTab := strings.Index(item, "\t") + + // default if there aren't any + indexOfSpaceOrTab := len(item) + + if indexOfSpace != -1 { + indexOfSpaceOrTab = indexOfSpace + } + + if indexOfTab != -1 { + if indexOfTab < indexOfSpace || indexOfSpace == -1 { + indexOfSpaceOrTab = indexOfTab + } + } + + if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 { + geminiLinks = append(geminiLinks, item[0:indexOfSpaceOrTab]) + } + if strings.Contains(item, "://") == false { + // relative link + + u, err := url.Parse(item[0:indexOfSpaceOrTab]) + + if err != nil { + fmt.Printf("Unable to parse link: %s, %s\n", item, err) + continue + } + + geminiLinks = append(geminiLinks, base.ResolveReference(u).String()) + } + } + + return geminiLinks +} diff --git a/gemtextparser/parse_test.go b/gemtextparser/parse_test.go @@ -0,0 +1,81 @@ +package gemtextparser + +import ( + "os" + "strings" + "testing" +) + +var DATA_DIR = "test_data/" +var URL = "gemini://laack.co" + +func hasSpaceOrTab(text string) bool { + return strings.Contains(text, " ") || strings.Contains(text, "\t") +} + +func readGemtext(path string, t *testing.T) string { + + file, err := os.ReadFile(path) + + if err != nil { + t.Errorf("Unable to read %s", path) + return "" + } + + return string(file) +} + +func TestAbsoluteRelativeParsingMatch(t *testing.T) { + + absBody := readGemtext(DATA_DIR+"absoluteLinks.gmi", t) + absLinks := ParseLinks(absBody, URL) + + if len(absLinks) != 5 { + t.Errorf("Unexpected number of absolute links") + } + + relBody := readGemtext(DATA_DIR+"relativeLinks.gmi", t) + relLinks := ParseLinks(relBody, URL) + + if len(relLinks) != 5 { + t.Errorf("Unexpected number of relative links") + } + + for index, link := range relLinks { + if strings.Compare(link, absLinks[index]) != 0 { + t.Errorf("Links don't match: %s => %s", link, absLinks[index]) + } + } +} + +func TestParseNonStandardLinks(t *testing.T) { + + body := readGemtext(DATA_DIR+"nonStandard.gmi", t) + links := ParseLinks(body, URL) + + if len(links) != 12 { + t.Errorf("Unexpected link count: %v", links) + } + + for _, link := range links { + if hasSpaceOrTab(link) { + + // this seeems like the best place to do this check because these nonstandard links + // test all variations of whitespace that can exist + + t.Errorf("Link contains a space or tab, improper whitespace handling: %s", link) + } + } + +} + +func TestParsePreformattedLinks(t *testing.T) { + + body := readGemtext(DATA_DIR+"preformatedLinks.gmi", t) + links := ParseLinks(body, URL) + + if len(links) != 2 { + t.Errorf("Unexpected link count: %v", links) + } + +} diff --git a/gemtext_parser/test_data/absoluteLinks.gmi b/gemtextparser/test_data/absoluteLinks.gmi diff --git a/gemtext_parser/test_data/nonStandard.gmi b/gemtextparser/test_data/nonStandard.gmi diff --git a/gemtext_parser/test_data/preformatedLinks.gmi b/gemtextparser/test_data/preformatedLinks.gmi diff --git a/gemtext_parser/test_data/relativeLinks.gmi b/gemtextparser/test_data/relativeLinks.gmi diff --git a/go.mod b/go.mod @@ -1,4 +1,4 @@ -module gemini-search +module geminisearch go 1.26.2 diff --git a/index/go.mod b/index/go.mod @@ -1,5 +0,0 @@ -module main - -go 1.26.2 - -require github.com/mattn/go-sqlite3 v1.14.44 diff --git a/index/go.sum b/index/go.sum @@ -1,2 +0,0 @@ -github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8= -github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ= diff --git a/index/main.go b/index/main.go @@ -6,7 +6,7 @@ import ( _ "github.com/mattn/go-sqlite3" ) -var CRAWLED_DIR = "../outputs/crawled" +var CRAWLED_DIR = "outputs/crawled" var CRAWLED_DB_NAME = "main.db" func connectToCrawlDB() *sql.DB {