gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

commit 47b532d1127bf3f977a7bcd52f6dfcf9f4d13dbb
parent 711491341a21a9af94a17fc8782eacf3867e07a1
Author: Andrew Laack <andrew@laack.co>
Date:   Tue,  5 May 2026 22:24:48 -0500

Refactoring + tests + parsing fixes

Diffstat:
M crawl/main.go | 64 ----------------------------------------------------------------
A crawl/parse.go | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A crawl/parse_test.go | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A crawl/test_data/absoluteLinks.gmi | 15 +++++++++++++++
A crawl/test_data/nonStandard.gmi | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A crawl/test_data/preformatedLinks.gmi | 17 +++++++++++++++++
A crawl/test_data/relativeLinks.gmi | 15 +++++++++++++++
7 files changed, 296 insertions(+), 64 deletions(-)

diff --git a/crawl/main.go b/crawl/main.go @@ -34,70 +34,6 @@ func fetchSite(currentUrl string) (string, error){ } -func parseLinks(body string, currentUrl string) []string { - - base, err := url.Parse(currentUrl) - - if err != nil { - panic(err) - } - - lines := strings.Split(body, "\n") - - links := []string{} - - escaped := false - escape := "```" - - for _, item := range lines { - - if strings.Compare(escape, item) == 0 { - escaped = !escaped - } - - if len(item) > 3 && !escaped { - if item[0] == '=' && item[1] == '>' { - // TODO: is =>link valid? - - // TODO: I've seen =>{tab}link{tab} - // and stuff like that from tuxmachines. is that reasonable to support? - // is tab == space? - - links = append(links, item[3:]) - } - } - - } - - geminiLinks := []string{} - - for _, item := range links { - - indexOfSpace := strings.Index(item, " ") - - if indexOfSpace == -1 { - indexOfSpace = len(item) - } - if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 { - geminiLinks = append(geminiLinks, item[0:indexOfSpace]) - } - if strings.Contains(item, "://") == false { - // relative link - - u, err := url.Parse(item[0:indexOfSpace]) - - if err != nil { - fmt.Printf("Unable to parse link: %s, %s\n", item, err) - continue - } - - geminiLinks = append(geminiLinks, base.ResolveReference(u).String()) - } - } - - return geminiLinks -} - func setupDB(dir string, filename string) *sql.DB { os.MkdirAll(dir, 0755) diff --git a/crawl/parse.go b/crawl/parse.go @@ -0,0 +1,102 @@ +package main + +import ( + "fmt" + "net/url" + "strings" +) + +func stripLeadingWhiteSpace(text string) string { + + for len(text) > 0 { + if text[0] == ' ' || text[0] == '\t' { + if len(text) > 1 { + text = text[1:] + } else { + text = "" + return text + } + } else { + return text + } + } + + return text +} + +func parseLinks(body string, currentUrl string) []string { + + base, err := url.Parse(currentUrl) + + if err != nil { + panic(err) + } + + lines := strings.Split(body, "\n") + + links := 
[]string{} + + escaped := false + escape := "```" + + for _, item := range lines { + + // must start with escape characters, the rest doesn't matter to us + // > Any line whose first three characters are "```" (...) are preformatted toggle lines + if len(item) >= 3 && strings.Compare(escape, item[:3]) == 0 { + escaped = !escaped + } + + if len(item) > 3 && !escaped { + if item[0] == '=' && item[1] == '>' { + + // sometimes links end with a \r, but that isn't valid so we won't allow it + links = append(links, stripLeadingWhiteSpace(item[2:])) + } + } + + } + + geminiLinks := []string{} + + for _, item := range links { + + // this is for finding the text associated with the link + + indexOfSpace := strings.Index(item, " ") + indexOfTab := strings.Index(item, "\t") + + // default if there aren't any + indexOfSpaceOrTab := len(item) + + if indexOfSpace != -1 { + indexOfSpaceOrTab = indexOfSpace + } + + if indexOfTab != -1 { + if indexOfTab < indexOfSpace || indexOfSpace == -1{ + indexOfSpaceOrTab = indexOfTab + } + } + + + if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 { + geminiLinks = append(geminiLinks, item[0:indexOfSpaceOrTab]) + } + if strings.Contains(item, "://") == false { + // relative link + + u, err := url.Parse(item[0:indexOfSpaceOrTab]) + + if err != nil { + fmt.Printf("Unable to parse link: %s, %s\n", item, err) + continue + } + + geminiLinks = append(geminiLinks, base.ResolveReference(u).String()) + } + } + + return geminiLinks +} + diff --git a/crawl/parse_test.go b/crawl/parse_test.go @@ -0,0 +1,84 @@ +package main + +import ( + "testing" + "strings" + "os" +) + +var DATA_DIR = "test_data/" +var URL = "gemini://laack.co" + +func hasSpaceOrTab(text string) bool { + return strings.Contains(text," ") || strings.Contains(text,"\t") +} + + +func readGemtext(path string , t *testing.T) string { + + file, err := os.ReadFile(path) + + if err != nil { + t.Errorf("Unable to read %s", path) + return "" + } + + return string(file) +} + +func 
TestAbsoluteRelativeParsingMatch(t *testing.T) { + + absBody := readGemtext(DATA_DIR+"absoluteLinks.gmi", t) + absLinks := parseLinks(absBody, URL) + + if len(absLinks) != 5 { + t.Errorf("Unexpected number of absolute links") + } + + relBody := readGemtext(DATA_DIR+"relativeLinks.gmi", t) + relLinks := parseLinks(relBody, URL) + + if len(relLinks) != 5 { + t.Errorf("Unexpected number of relative links") + } + + + for index, link := range relLinks { + if strings.Compare(link, absLinks[index]) != 0 { + t.Errorf("Links don't match: %s => %s", link, absLinks[index]) + } + } +} + +func TestParseNonStandardLinks(t *testing.T) { + + body := readGemtext(DATA_DIR+"nonStandard.gmi", t) + links := parseLinks(body, URL) + + if len(links) != 12 { + t.Errorf("Unexpected link count: %v", links) + } + + + for _, link := range links { + if hasSpaceOrTab(link) { + + // this seeems like the best place to do this check because these nonstandard links + // test all variations of whitespace that can exist + + t.Errorf("Link contains a space or tab, improper whitespace handling: %s", link) + } + } + +} + +func TestParsePreformattedLinks(t *testing.T) { + + body := readGemtext(DATA_DIR+"preformatedLinks.gmi", t) + links := parseLinks(body, URL) + + if len(links) != 2 { + t.Errorf("Unexpected link count: %v", links) + } + +} diff --git a/crawl/test_data/absoluteLinks.gmi b/crawl/test_data/absoluteLinks.gmi @@ -0,0 +1,15 @@ +# Example absolute links + +Some text + +## Example absolute links + +> A quote + +### Example absolute links + +=> gemini://laack.co/blog +=> gemini://laack.co/blog/index.gmi +=> gemini://laack.co/blogging +=> gemini://laack.co/blogging/ +=> gemini://laack.co/who_do_you/think/you/are.gmi diff --git a/crawl/test_data/nonStandard.gmi b/crawl/test_data/nonStandard.gmi @@ -0,0 +1,63 @@ +# These are some non-standard but gemtext compliant links + +https://geminiprotocol.net/docs/gemtext-specification.gmi (no this is not a link) + +> All lines beginning with the two 
characters "=>" are link lines +> =>[<whitespace>]<URL>[<whitespace><USER-FRIENDLY LINK NAME>] + +## These are valid links as <whitespace> is optional (square bracketed) + +=>gemini://blog.laack.co/test testing123 +=>testing/ test +=>/testing/ test + +## These are valid links with tabs + +=> gemini://blog.laack.co a link +=> gemini://blog.laack.co/testing123 another link with a tab before the link text +=> gemini://blog.laack.co/testing1234 another link with two tabs before the link text + +## These are valid links with spaces + +=> gemini://blog.laack.co/test there is a tab after this link +=> testing/ there are two tabs after this link +=> /testing/ there is a space then a tab after this link + + +## These are valid links with spaces and tabs + +=> gemini://blog.laack.co/test aorsienat iersntier nto iersntories otire +=> testing/ rasotieanrst aorsienat iersntier nto iersntories otire +=> /testing/ aorsienat iersntier nto iersntories otireaorsienat iersntier nto iersntories otire + +> URLs in link lines MUST have reserved characters and spaces percent-encoded as per RFC 3986. + +I'm not doing any validation for that, I'd expect the requests library to handle that when appropriate. 
+ +## Some http links (we don't care about these) + +=> http://blog.laack.co/testing2139 other site +=> https://blog.laack.co/testing213987t some site +=> https://blog.laack.co/testing213991t some site +=> https://blog.laack.co/testing213995t some site +=> https://blog.laack.co/testing213990t some site +=> https://blog.laack.co/testing2139nonstandard87t some site +=> https://blog.laack.co/nonstandardtesting213987t some site +=> https://blog.laack.co/testing213988nonstandardt some site +=> https://blog.laack.co/testing213987t some site +=> https://blog.laack.co/testing213990t some site +=> https://blog.laack.co/testing213987t some site +=> https://blog.laack.co/testing213990t some site +=> https://blog.laack.co/testing213989t some site +=> ssh://blog.laack.co/testing213989t some site +=> sftp://blog.laack.co/testing213989t some site +=> something://blog.laack.co/testing213989t some site +=> gemino://blog.laack.co/testing213989t some site + + => gemini://laack.co/this/is-not-valid + => gemini://laack.co/this/is-not-valid + => gemini://laack.co/this/is-not-valid + => gemini://laack.co/this/is-not-valid + => gemini://laack.co/this/is-not-valid + => gemini://laack.co/this/is-not-valid + => gemini://laack.co/this/is-not-valid diff --git a/crawl/test_data/preformatedLinks.gmi b/crawl/test_data/preformatedLinks.gmi @@ -0,0 +1,17 @@ +> Any line whose first three characters are "```" (i.e. three consecutive back ticks with no leading whitespace) are preformatted toggle lines + +```this is alt text (who knew!) 
+=> gemini://example.org/1 +=> gemini://example.org/2 +=> gemini://example.org/3 +=> gemini://example.org/4 +``` + +=> gemini://laack.co/1 +=> gemini://laack.co/2 + +``` +=> gemini://example.org/1 +=> gemini://example.org/2 +=> gemini://example.org/3 +=> gemini://example.org/4 diff --git a/crawl/test_data/relativeLinks.gmi b/crawl/test_data/relativeLinks.gmi @@ -0,0 +1,15 @@ +# Example relative links + +Some text + +## Example relative links + +> A quote + +### Example relative links + +=> /blog +=> /blog/index.gmi +=> /blogging +=> blogging/ +=> who_do_you/think/you/are.gmi