commit 47b532d1127bf3f977a7bcd52f6dfcf9f4d13dbb
parent 711491341a21a9af94a17fc8782eacf3867e07a1
Author: Andrew Laack <andrew@laack.co>
Date: Tue, 5 May 2026 22:24:48 -0500
Refactoring + tests + parsing fixes
Diffstat:
7 files changed, 296 insertions(+), 64 deletions(-)
diff --git a/crawl/main.go b/crawl/main.go
@@ -34,70 +34,6 @@ func fetchSite(currentUrl string) (string, error){
}
-func parseLinks(body string, currentUrl string) []string {
-
- base, err := url.Parse(currentUrl)
-
- if err != nil {
- panic(err)
- }
-
- lines := strings.Split(body, "\n")
-
- links := []string{}
-
- escaped := false
- escape := "```"
-
- for _, item := range lines {
-
- if strings.Compare(escape, item) == 0 {
- escaped = !escaped
- }
-
- if len(item) > 3 && !escaped {
- if item[0] == '=' && item[1] == '>' {
- // TODO: is =>link valid?
-
- // TODO: I've seen =>{tab}link{tab}
- // and stuff like that from tuxmachines. is that reasonable to support?
- // is tab == space?
-
- links = append(links, item[3:])
- }
- }
-
- }
-
- geminiLinks := []string{}
-
- for _, item := range links {
-
- indexOfSpace := strings.Index(item, " ")
-
- if indexOfSpace == -1 {
- indexOfSpace = len(item)
- }
- if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 {
- geminiLinks = append(geminiLinks, item[0:indexOfSpace])
- }
- if strings.Contains(item, "://") == false {
- // relative link
-
- u, err := url.Parse(item[0:indexOfSpace])
-
- if err != nil {
- fmt.Printf("Unable to parse link: %s, %s\n", item, err)
- continue
- }
-
- geminiLinks = append(geminiLinks, base.ResolveReference(u).String())
- }
- }
-
- return geminiLinks
-}
-
func setupDB(dir string, filename string) *sql.DB {
os.MkdirAll(dir, 0755)
diff --git a/crawl/parse.go b/crawl/parse.go
@@ -0,0 +1,102 @@
+package main
+
+import (
+ "fmt"
+ "net/url"
+ "strings"
+)
+
+// stripLeadingWhiteSpace returns text with every leading space and tab
+// removed. Only ' ' and '\t' count as whitespace here: other characters
+// (for example "\r") and any whitespace after the first non-blank byte are
+// left untouched. An empty or all-blank input yields "".
+func stripLeadingWhiteSpace(text string) string {
+	return strings.TrimLeft(text, " \t")
+}
+
+// parseLinks extracts the Gemini links found in a gemtext document.
+//
+// body is the raw gemtext and currentUrl is the address of the page; it is
+// the base against which relative targets are resolved. The returned slice
+// is never nil and holds one URL string per accepted link line, in document
+// order, duplicates included.
+//
+// A link line starts with "=>" in column zero and has the shape
+// =>[<whitespace>]<URL>[<whitespace><USER-FRIENDLY LINK NAME>]. Link lines
+// inside a preformatted block (between lines starting with "```") are
+// ignored. Targets that begin with "gemini://" are returned verbatim; any
+// other target is parsed and kept only when it has no scheme (a relative
+// reference) or the gemini scheme. If currentUrl itself cannot be parsed the
+// problem is reported on stdout and no links are returned.
+func parseLinks(body string, currentUrl string) []string {
+	const geminiPrefix = "gemini://"
+
+	geminiLinks := []string{}
+
+	base, err := url.Parse(currentUrl)
+	if err != nil {
+		// A bad page URL should cost us this page's links, not crash the caller.
+		fmt.Printf("Unable to parse base url: %s, %s\n", currentUrl, err)
+		return geminiLinks
+	}
+
+	preformatted := false
+
+	for _, line := range strings.Split(body, "\n") {
+		// Lines may be terminated by CRLF as well as LF; drop the "\r" so it
+		// never ends up glued to the end of a URL.
+		line = strings.TrimSuffix(line, "\r")
+
+		// > Any line whose first three characters are "```" (...) are
+		// > preformatted toggle lines
+		// Whatever follows the back ticks (alt text) doesn't matter to us.
+		if strings.HasPrefix(line, "```") {
+			preformatted = !preformatted
+			continue
+		}
+
+		if preformatted || !strings.HasPrefix(line, "=>") {
+			continue
+		}
+
+		// rest is "<URL>[<whitespace><label>]"; the URL ends at the first
+		// space or tab, whichever comes first.
+		rest := stripLeadingWhiteSpace(line[2:])
+		target := rest
+		if end := strings.IndexAny(rest, " \t"); end != -1 {
+			target = rest[:end]
+		}
+
+		// "=>" followed by nothing usable is not a link; resolving an empty
+		// reference would just hand back the current page.
+		if target == "" {
+			continue
+		}
+
+		// Absolute gemini links are passed through untouched. The length
+		// check rejects a bare "gemini://" with nothing after it.
+		if strings.HasPrefix(target, geminiPrefix) && len(target) > len(geminiPrefix) {
+			geminiLinks = append(geminiLinks, target)
+			continue
+		}
+
+		// Classify by the parsed scheme of the URL alone, so "://" appearing
+		// in the label or in a query string cannot hide a relative link.
+		u, err := url.Parse(target)
+		if err != nil {
+			fmt.Printf("Unable to parse link: %s, %s\n", rest, err)
+			continue
+		}
+
+		// url.Parse lower-cases the scheme. Anything that names another
+		// protocol (http, mailto, ...) is of no interest to the crawler.
+		if u.Scheme != "" && u.Scheme != "gemini" {
+			continue
+		}
+
+		geminiLinks = append(geminiLinks, base.ResolveReference(u).String())
+	}
+
+	return geminiLinks
+}
+
diff --git a/crawl/parse_test.go b/crawl/parse_test.go
@@ -0,0 +1,84 @@
+package main
+
+import (
+ "testing"
+ "strings"
+ "os"
+)
+
+// DATA_DIR is the path prefix the tests put in front of every fixture file name.
+var DATA_DIR = "test_data/"
+// URL is the page address the tests hand to parseLinks as currentUrl.
+var URL = "gemini://laack.co"
+
+// hasSpaceOrTab reports whether text contains at least one space or tab
+// character anywhere in it.
+func hasSpaceOrTab(text string) bool {
+	return strings.ContainsAny(text, " \t")
+}
+
+
+// readGemtext loads the file at path and returns its contents as a string.
+//
+// A read failure stops the calling test immediately (t.Fatalf) and includes
+// the underlying error, so later assertions never run against an empty body.
+// t.Helper makes the failure point at the caller's line.
+func readGemtext(path string, t *testing.T) string {
+	t.Helper()
+
+	data, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("Unable to read %s: %v", path, err)
+	}
+
+	return string(data)
+}
+
+// TestAbsoluteRelativeParsingMatch parses two fixtures, one written with
+// absolute links and one with relative links, and checks that each yields 5
+// links and that the relative links, resolved against URL, equal the
+// absolute ones position by position.
+func TestAbsoluteRelativeParsingMatch(t *testing.T) {
+
+	absBody := readGemtext(DATA_DIR+"absoluteLinks.gmi", t)
+	absLinks := parseLinks(absBody, URL)
+
+	// Fatalf rather than Errorf: the pairwise loop below indexes absLinks by
+	// the position in relLinks, so carrying on with unexpected lengths could
+	// panic with an index out of range instead of reporting a failure.
+	if len(absLinks) != 5 {
+		t.Fatalf("Unexpected number of absolute links")
+	}
+
+	relBody := readGemtext(DATA_DIR+"relativeLinks.gmi", t)
+	relLinks := parseLinks(relBody, URL)
+
+	if len(relLinks) != 5 {
+		t.Fatalf("Unexpected number of relative links")
+	}
+
+	for index, link := range relLinks {
+		if link != absLinks[index] {
+			t.Errorf("Links don't match: %s => %s", link, absLinks[index])
+		}
+	}
+}
+
+// TestParseNonStandardLinks parses the nonStandard.gmi fixture and expects
+// 12 links, none of which may contain a space or a tab.
+func TestParseNonStandardLinks(t *testing.T) {
+	links := parseLinks(readGemtext(DATA_DIR+"nonStandard.gmi", t), URL)
+
+	if count := len(links); count != 12 {
+		t.Errorf("Unexpected link count: %v", links)
+	}
+
+	// The whitespace check lives here because this fixture is the one meant
+	// to exercise the different whitespace layouts a link line can have.
+	for i := range links {
+		if !hasSpaceOrTab(links[i]) {
+			continue
+		}
+		t.Errorf("Link contains a space or tab, improper whitespace handling: %s", links[i])
+	}
+}
+
+// TestParsePreformattedLinks parses a fixture that has link lines both
+// inside and outside "```" blocks and expects exactly 2 links back.
+func TestParsePreformattedLinks(t *testing.T) {
+	fixture := DATA_DIR + "preformatedLinks.gmi"
+	links := parseLinks(readGemtext(fixture, t), URL)
+
+	if len(links) == 2 {
+		return
+	}
+	t.Errorf("Unexpected link count: %v", links)
+}
diff --git a/crawl/test_data/absoluteLinks.gmi b/crawl/test_data/absoluteLinks.gmi
@@ -0,0 +1,15 @@
+# Example absolute links
+
+Some text
+
+## Example absolute links
+
+> A quote
+
+### Example absolute links
+
+=> gemini://laack.co/blog
+=> gemini://laack.co/blog/index.gmi
+=> gemini://laack.co/blogging
+=> gemini://laack.co/blogging/
+=> gemini://laack.co/who_do_you/think/you/are.gmi
diff --git a/crawl/test_data/nonStandard.gmi b/crawl/test_data/nonStandard.gmi
@@ -0,0 +1,63 @@
+# These are some non-standard but gemtext-compliant links
+
+https://geminiprotocol.net/docs/gemtext-specification.gmi (no this is not a link)
+
+> All lines beginning with the two characters "=>" are link lines
+> =>[<whitespace>]<URL>[<whitespace><USER-FRIENDLY LINK NAME>]
+
+## These are valid links as <whitespace> is optional (square bracketed)
+
+=>gemini://blog.laack.co/test testing123
+=>testing/ test
+=>/testing/ test
+
+## These are valid links with tabs
+
+=> gemini://blog.laack.co a link
+=> gemini://blog.laack.co/testing123 another link with a tab before the link text
+=> gemini://blog.laack.co/testing1234 another link with two tabs before the link text
+
+## These are valid links with spaces
+
+=> gemini://blog.laack.co/test there is a tab after this link
+=> testing/ there are two tabs after this link
+=> /testing/ there is a space then a tab after this link
+
+
+## These are valid links with spaces and tabs
+
+=> gemini://blog.laack.co/test aorsienat iersntier nto iersntories otire
+=> testing/ rasotieanrst aorsienat iersntier nto iersntories otire
+=> /testing/ aorsienat iersntier nto iersntories otireaorsienat iersntier nto iersntories otire
+
+> URLs in link lines MUST have reserved characters and spaces percent-encoded as per RFC 3986.
+
+I'm not doing any validation for that; I'd expect the requests library to handle it when appropriate.
+
+## Some http links (we don't care about these)
+
+=> http://blog.laack.co/testing2139 other site
+=> https://blog.laack.co/testing213987t some site
+=> https://blog.laack.co/testing213991t some site
+=> https://blog.laack.co/testing213995t some site
+=> https://blog.laack.co/testing213990t some site
+=> https://blog.laack.co/testing2139nonstandard87t some site
+=> https://blog.laack.co/nonstandardtesting213987t some site
+=> https://blog.laack.co/testing213988nonstandardt some site
+=> https://blog.laack.co/testing213987t some site
+=> https://blog.laack.co/testing213990t some site
+=> https://blog.laack.co/testing213987t some site
+=> https://blog.laack.co/testing213990t some site
+=> https://blog.laack.co/testing213989t some site
+=> ssh://blog.laack.co/testing213989t some site
+=> sftp://blog.laack.co/testing213989t some site
+=> something://blog.laack.co/testing213989t some site
+=> gemino://blog.laack.co/testing213989t some site
+
+ => gemini://laack.co/this/is-not-valid
+ => gemini://laack.co/this/is-not-valid
+ => gemini://laack.co/this/is-not-valid
+ => gemini://laack.co/this/is-not-valid
+ => gemini://laack.co/this/is-not-valid
+ => gemini://laack.co/this/is-not-valid
+ => gemini://laack.co/this/is-not-valid
diff --git a/crawl/test_data/preformatedLinks.gmi b/crawl/test_data/preformatedLinks.gmi
@@ -0,0 +1,17 @@
+> Any line whose first three characters are "```" (i.e. three consecutive back ticks with no leading whitespace) are preformatted toggle lines
+
+```this is alt text (who knew!)
+=> gemini://example.org/1
+=> gemini://example.org/2
+=> gemini://example.org/3
+=> gemini://example.org/4
+```
+
+=> gemini://laack.co/1
+=> gemini://laack.co/2
+
+```
+=> gemini://example.org/1
+=> gemini://example.org/2
+=> gemini://example.org/3
+=> gemini://example.org/4
diff --git a/crawl/test_data/relativeLinks.gmi b/crawl/test_data/relativeLinks.gmi
@@ -0,0 +1,15 @@
+# Example relative links
+
+Some text
+
+## Example relative links
+
+> A quote
+
+### Example relative links
+
+=> /blog
+=> /blog/index.gmi
+=> /blogging
+=> blogging/
+=> who_do_you/think/you/are.gmi