Refactoring + tests + parsing fixes - gemini-search - A simple search engine for Geminispace

commit 47b532d1127bf3f977a7bcd52f6dfcf9f4d13dbb
parent 711491341a21a9af94a17fc8782eacf3867e07a1
Author: Andrew Laack <andrew@laack.co>
Date:   Tue,  5 May 2026 22:24:48 -0500

Refactoring + tests + parsing fixes

Diffstat:
M crawl/main.go  | 64 ----------------------------------------------------------------
A crawl/parse.go  | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A crawl/parse_test.go  | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A crawl/test_data/absoluteLinks.gmi  | 15 +++++++++++++++
A crawl/test_data/nonStandard.gmi  | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A crawl/test_data/preformatedLinks.gmi  | 17 +++++++++++++++++
A crawl/test_data/relativeLinks.gmi  | 15 +++++++++++++++

7 files changed, 296 insertions(+), 64 deletions(-)
diff --git a/crawl/main.go b/crawl/main.go
@@ -34,70 +34,6 @@ func fetchSite(currentUrl string) (string, error){
 
 }
 
-func parseLinks(body string, currentUrl string) []string {
-
-	base, err := url.Parse(currentUrl)
-
-	if err != nil {
-		panic(err)
-	}
-
-	lines := strings.Split(body, "\n")
-
-	links := []string{}
-
-	escaped := false
-	escape := "```"
-
-	for _, item := range lines {
-
-		if strings.Compare(escape, item) == 0 {
-			escaped = !escaped
-		}
-
-		if len(item) > 3 && !escaped {
-			if item[0] == '=' && item[1] == '>' {
-				// TODO: is =>link valid?
-
-				// TODO: I've seen =>{tab}link{tab} 
-				// and stuff like that from tuxmachines. is that reasonable to support?
-				// is tab == space?
-
-				links = append(links, item[3:])
-			}
-		}
-
-	}
-
-	geminiLinks := []string{}
-
-	for _, item := range links {
-
-		indexOfSpace := strings.Index(item, " ")
-
-		if indexOfSpace == -1 {
-			indexOfSpace = len(item)
-		}
-		if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 {
-			geminiLinks = append(geminiLinks, item[0:indexOfSpace])
-		}
-		if strings.Contains(item, "://") == false {
-			// relative link
-
-			u, err := url.Parse(item[0:indexOfSpace])
-
-			if err != nil {
-				fmt.Printf("Unable to parse link: %s, %s\n", item, err)
-				continue
-			}
-
-			geminiLinks = append(geminiLinks, base.ResolveReference(u).String())
-		}
-	}
-
-	return geminiLinks
-}
-
 func setupDB(dir string, filename string) *sql.DB {
 
 	os.MkdirAll(dir, 0755)
diff --git a/crawl/parse.go b/crawl/parse.go
@@ -0,0 +1,102 @@
+package main
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+)
+
+func stripLeadingWhiteSpace(text string) string {
+
+	for len(text) > 0 {
+		if text[0] == ' ' || text[0] == '\t' {
+			if len(text) > 1 {
+				text = text[1:]
+			} else {
+				text = ""
+				return text
+			}
+		} else {
+			return text
+		}
+	}
+
+	return text
+}
+
+func parseLinks(body string, currentUrl string) []string {
+
+	base, err := url.Parse(currentUrl)
+
+	if err != nil {
+		panic(err)
+	}
+
+	lines := strings.Split(body, "\n")
+
+	links := []string{}
+
+	escaped := false
+	escape := "```"
+
+	for _, item := range lines {
+
+		// must start with escape characters, the rest doesn't matter to us
+		// > Any line whose first three characters are "```" (...) are preformatted toggle lines
+		if len(item) >= 3 && strings.Compare(escape, item[:3]) == 0 {
+			escaped = !escaped
+		}
+
+		if len(item) > 3 && !escaped {
+			if item[0] == '=' && item[1] == '>' {
+
+				// sometimes links end with a \r (e.g. CRLF input); that isn't valid, and only leading spaces/tabs are stripped here, so the \r is left on the link
+				links = append(links, stripLeadingWhiteSpace(item[2:]))
+			}
+		}
+
+	}
+
+	geminiLinks := []string{}
+
+	for _, item := range links {
+
+		// find where the URL ends: the first space or tab separates it from the link text
+
+		indexOfSpace := strings.Index(item, " ")
+		indexOfTab := strings.Index(item, "\t")
+
+		// default if there aren't any
+		indexOfSpaceOrTab := len(item)
+
+		if indexOfSpace != -1 {
+			indexOfSpaceOrTab = indexOfSpace
+		}
+
+		if indexOfTab != -1 {
+			if indexOfTab < indexOfSpace || indexOfSpace == -1{
+				indexOfSpaceOrTab = indexOfTab
+			}
+		}
+
+
+		if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 {
+			geminiLinks = append(geminiLinks, item[0:indexOfSpaceOrTab])
+		}
+		if strings.Contains(item, "://") == false {
+			// relative link
+
+			u, err := url.Parse(item[0:indexOfSpaceOrTab])
+
+			if err != nil {
+				fmt.Printf("Unable to parse link: %s, %s\n", item, err)
+				continue
+			}
+
+			geminiLinks = append(geminiLinks, base.ResolveReference(u).String())
+		}
+	}
+
+	return geminiLinks
+}
+
diff --git a/crawl/parse_test.go b/crawl/parse_test.go
@@ -0,0 +1,84 @@
+package main
+
+import (
+	"testing"
+	"strings"
+	"os"
+)
+
+var DATA_DIR = "test_data/"
+var URL = "gemini://laack.co"
+
+func hasSpaceOrTab(text string) bool {
+	return strings.Contains(text," ") || strings.Contains(text,"\t")
+}
+
+
+func readGemtext(path string , t *testing.T) string {
+
+	file, err := os.ReadFile(path)
+
+	if err != nil {
+		t.Errorf("Unable to read %s", path)
+		return ""
+	}
+
+	return string(file)
+}
+
+func TestAbsoluteRelativeParsingMatch(t *testing.T) {
+
+	absBody := readGemtext(DATA_DIR+"absoluteLinks.gmi", t)
+	absLinks := parseLinks(absBody, URL)
+
+	if len(absLinks) != 5 {
+		t.Errorf("Unexpected number of absolute links")
+	}
+
+	relBody := readGemtext(DATA_DIR+"relativeLinks.gmi", t)
+	relLinks := parseLinks(relBody, URL)
+
+	if len(relLinks) != 5 {
+		t.Errorf("Unexpected number of relative links")
+	}
+
+
+	for index, link := range relLinks {
+		if strings.Compare(link, absLinks[index]) != 0 {
+			t.Errorf("Links don't match: %s => %s", link, absLinks[index])
+		}
+	}
+}
+
+func TestParseNonStandardLinks(t *testing.T) {
+
+	body := readGemtext(DATA_DIR+"nonStandard.gmi", t)
+	links := parseLinks(body, URL)
+
+	if len(links) != 12 {
+		t.Errorf("Unexpected link count: %v", links)
+	}
+
+
+	for _, link := range links {
+		if hasSpaceOrTab(link) {
+
+			// this seems like the best place to do this check because these nonstandard links
+			// test all variations of whitespace that can exist
+
+			t.Errorf("Link contains a space or tab, improper whitespace handling: %s", link)
+		}
+	}
+
+}
+
+func TestParsePreformattedLinks(t *testing.T) {
+
+	body := readGemtext(DATA_DIR+"preformatedLinks.gmi", t)
+	links := parseLinks(body, URL)
+
+	if len(links) != 2 {
+		t.Errorf("Unexpected link count: %v", links)
+	}
+
+}
diff --git a/crawl/test_data/absoluteLinks.gmi b/crawl/test_data/absoluteLinks.gmi
@@ -0,0 +1,15 @@
+# Example absolute links
+
+Some text
+
+## Example absolute links
+
+> A quote
+
+### Example absolute links
+
+=> gemini://laack.co/blog
+=> gemini://laack.co/blog/index.gmi
+=> gemini://laack.co/blogging
+=> gemini://laack.co/blogging/
+=> gemini://laack.co/who_do_you/think/you/are.gmi
diff --git a/crawl/test_data/nonStandard.gmi b/crawl/test_data/nonStandard.gmi
@@ -0,0 +1,63 @@
+# These are some non-standard but gemtext compliant links
+
+https://geminiprotocol.net/docs/gemtext-specification.gmi (no this is not a link)
+
+> All lines beginning with the two characters "=>" are link lines
+> =>[<whitespace>]<URL>[<whitespace><USER-FRIENDLY LINK NAME>]
+
+## These are valid links as <whitespace> is optional (square bracketed)
+
+=>gemini://blog.laack.co/test		testing123
+=>testing/ test
+=>/testing/	test
+
+## These are valid links with tabs
+
+=> 	gemini://blog.laack.co a link
+=>  gemini://blog.laack.co/testing123	another link with a tab before the link text
+=>  gemini://blog.laack.co/testing1234		another link with two tabs before the link text
+
+## These are valid links with spaces
+
+=> gemini://blog.laack.co/test	there is a tab after this link
+=>  testing/		there are two tabs after this link
+=>   /testing/ 	there is a space then a tab after this link
+
+
+## These are valid links with spaces and tabs
+
+=>  gemini://blog.laack.co/test aorsienat iersntier nto iersntories otire
+=>      testing/ rasotieanrst aorsienat iersntier nto iersntories otire
+=>          /testing/ aorsienat iersntier nto iersntories otireaorsienat iersntier nto iersntories otire
+
+> URLs in link lines MUST have reserved characters and spaces percent-encoded as per RFC 3986.
+
+I'm not doing any validation for that, I'd expect the requests library to handle that when appropriate.
+
+## Some http links (we don't care about these)
+
+=> http://blog.laack.co/testing2139 other site
+=> https://blog.laack.co/testing213987t some site
+=> https://blog.laack.co/testing213991t some site
+=> https://blog.laack.co/testing213995t some site
+=> https://blog.laack.co/testing213990t some site
+=> https://blog.laack.co/testing2139nonstandard87t some site
+=> https://blog.laack.co/nonstandardtesting213987t some site
+=> https://blog.laack.co/testing213988nonstandardt some site
+=> https://blog.laack.co/testing213987t some site
+=> https://blog.laack.co/testing213990t some site
+=> https://blog.laack.co/testing213987t some site
+=> https://blog.laack.co/testing213990t some site
+=> https://blog.laack.co/testing213989t some site
+=> ssh://blog.laack.co/testing213989t some site
+=> sftp://blog.laack.co/testing213989t some site
+=> something://blog.laack.co/testing213989t some site
+=> gemino://blog.laack.co/testing213989t some site
+
+    => gemini://laack.co/this/is-not-valid
+    => gemini://laack.co/this/is-not-valid
+    => gemini://laack.co/this/is-not-valid
+    => gemini://laack.co/this/is-not-valid
+    => gemini://laack.co/this/is-not-valid
+ => gemini://laack.co/this/is-not-valid
+ => gemini://laack.co/this/is-not-valid
diff --git a/crawl/test_data/preformatedLinks.gmi b/crawl/test_data/preformatedLinks.gmi
@@ -0,0 +1,17 @@
+> Any line whose first three characters are "```" (i.e. three consecutive back ticks with no leading whitespace) are preformatted toggle lines
+
+```this is alt text (who knew!)
+=> gemini://example.org/1
+=> gemini://example.org/2
+=> gemini://example.org/3
+=> gemini://example.org/4
+```
+
+=> gemini://laack.co/1
+=> gemini://laack.co/2
+
+```
+=> gemini://example.org/1
+=> gemini://example.org/2
+=> gemini://example.org/3
+=> gemini://example.org/4
diff --git a/crawl/test_data/relativeLinks.gmi b/crawl/test_data/relativeLinks.gmi
@@ -0,0 +1,15 @@
+# Example relative links
+
+Some text
+
+## Example relative links
+
+> A quote
+
+### Example relative links
+
+=> /blog
+=> /blog/index.gmi
+=> /blogging
+=> blogging/
+=> who_do_you/think/you/are.gmi

	gemini-search A simple search engine for Geminispace
	git clone git://git.laack.co/gemini-search.git
	Log \| Files \| Refs \| README

M	crawl/main.go	\|	64	----------------------------------------------------------------
A	crawl/parse.go	\|	102	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	crawl/parse_test.go	\|	84	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	crawl/test_data/absoluteLinks.gmi	\|	15	+++++++++++++++
A	crawl/test_data/nonStandard.gmi	\|	63	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	crawl/test_data/preformatedLinks.gmi	\|	17	+++++++++++++++++
A	crawl/test_data/relativeLinks.gmi	\|	15	+++++++++++++++