commit 5f6c510a5e5bc484f72e855e2d51ae0ff3cf2fb2
parent 5676aad6afb2448cc5185cc809feeedaef0fd184
Author: Andrew Laack <andrew@laack.co>
Date: Wed, 6 May 2026 14:38:03 -0500
Renaming modules, setting up indexing
Diffstat:
13 files changed, 185 insertions(+), 192 deletions(-)
diff --git a/crawl/main.go b/crawl/main.go
@@ -12,7 +12,7 @@ import (
"strings"
"sync"
"time"
- "gemini-search/gemtext_parser"
+ "geminisearch/gemtextparser"
)
var OUTPUT_DIR = "outputs/crawled"
@@ -272,7 +272,7 @@ func worker(db *sql.DB, wg *sync.WaitGroup) {
fmt.Printf("Fetched %s\n", currentUrl)
- forwardGeminiLinks := gemtext_parser.ParseLinks(body, currentUrl)
+ forwardGeminiLinks := gemtextparser.ParseLinks(body, currentUrl)
outputLocation, err := writeSiteOutput(OUTPUT_DIR, body, currentUrl)
diff --git a/gemtext_parser/parse.go b/gemtext_parser/parse.go
@@ -1,100 +0,0 @@
-package gemtext_parser
-
-import (
- "fmt"
- "net/url"
- "strings"
-)
-
-func stripLeadingWhiteSpace(text string) string {
-
- for len(text) > 0 {
- if text[0] == ' ' || text[0] == '\t' {
- if len(text) > 1 {
- text = text[1:]
- } else {
- text = ""
- return text
- }
- } else {
- return text
- }
- }
-
- return text
-}
-
-func ParseLinks(body string, currentUrl string) []string {
-
- base, err := url.Parse(currentUrl)
-
- if err != nil {
- panic(err)
- }
-
- lines := strings.Split(body, "\n")
-
- links := []string{}
-
- escaped := false
- escape := "```"
-
- for _, item := range lines {
-
- // must start with escape characters, the rest doesn't matter to us
- // > Any line whose first three characters are "```" (...) are preformatted toggle lines
- if len(item) >= 3 && strings.Compare(escape, item[:3]) == 0 {
- escaped = !escaped
- }
-
- if len(item) > 3 && !escaped {
- if item[0] == '=' && item[1] == '>' {
-
- // sometimes links end with a \r, but that isn't valid so we won't allow it
- links = append(links, stripLeadingWhiteSpace(item[2:]))
- }
- }
-
- }
-
- geminiLinks := []string{}
-
- for _, item := range links {
-
- // this is for finding the text associated with the link
-
- indexOfSpace := strings.Index(item, " ")
- indexOfTab := strings.Index(item, "\t")
-
- // default if there aren't any
- indexOfSpaceOrTab := len(item)
-
- if indexOfSpace != -1 {
- indexOfSpaceOrTab = indexOfSpace
- }
-
- if indexOfTab != -1 {
- if indexOfTab < indexOfSpace || indexOfSpace == -1 {
- indexOfSpaceOrTab = indexOfTab
- }
- }
-
- if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 {
- geminiLinks = append(geminiLinks, item[0:indexOfSpaceOrTab])
- }
- if strings.Contains(item, "://") == false {
- // relative link
-
- u, err := url.Parse(item[0:indexOfSpaceOrTab])
-
- if err != nil {
- fmt.Printf("Unable to parse link: %s, %s\n", item, err)
- continue
- }
-
- geminiLinks = append(geminiLinks, base.ResolveReference(u).String())
- }
- }
-
- return geminiLinks
-}
diff --git a/gemtext_parser/parse_test.go b/gemtext_parser/parse_test.go
@@ -1,81 +0,0 @@
-package gemtext_parser
-
-import (
- "os"
- "strings"
- "testing"
-)
-
-var DATA_DIR = "test_data/"
-var URL = "gemini://laack.co"
-
-func hasSpaceOrTab(text string) bool {
- return strings.Contains(text, " ") || strings.Contains(text, "\t")
-}
-
-func readGemtext(path string, t *testing.T) string {
-
- file, err := os.ReadFile(path)
-
- if err != nil {
- t.Errorf("Unable to read %s", path)
- return ""
- }
-
- return string(file)
-}
-
-func TestAbsoluteRelativeParsingMatch(t *testing.T) {
-
- absBody := readGemtext(DATA_DIR+"absoluteLinks.gmi", t)
- absLinks := ParseLinks(absBody, URL)
-
- if len(absLinks) != 5 {
- t.Errorf("Unexpected number of absolute links")
- }
-
- relBody := readGemtext(DATA_DIR+"relativeLinks.gmi", t)
- relLinks := ParseLinks(relBody, URL)
-
- if len(relLinks) != 5 {
- t.Errorf("Unexpected number of relative links")
- }
-
- for index, link := range relLinks {
- if strings.Compare(link, absLinks[index]) != 0 {
- t.Errorf("Links don't match: %s => %s", link, absLinks[index])
- }
- }
-}
-
-func TestParseNonStandardLinks(t *testing.T) {
-
- body := readGemtext(DATA_DIR+"nonStandard.gmi", t)
- links := ParseLinks(body, URL)
-
- if len(links) != 12 {
- t.Errorf("Unexpected link count: %v", links)
- }
-
- for _, link := range links {
- if hasSpaceOrTab(link) {
-
- // this seeems like the best place to do this check because these nonstandard links
- // test all variations of whitespace that can exist
-
- t.Errorf("Link contains a space or tab, improper whitespace handling: %s", link)
- }
- }
-
-}
-
-func TestParsePreformattedLinks(t *testing.T) {
-
- body := readGemtext(DATA_DIR+"preformatedLinks.gmi", t)
- links := ParseLinks(body, URL)
-
- if len(links) != 2 {
- t.Errorf("Unexpected link count: %v", links)
- }
-
-}
diff --git a/gemtextparser/parse.go b/gemtextparser/parse.go
@@ -0,0 +1,100 @@
+package gemtextparser
+
+import (
+ "fmt"
+ "net/url"
+ "strings"
+)
+
+// stripLeadingWhiteSpace returns text with all leading spaces and tabs
+// removed. Only ' ' and '\t' are treated as whitespace; any other byte,
+// including '\r' and '\n', ends the trim and is kept. A string consisting
+// solely of spaces and tabs yields "".
+func stripLeadingWhiteSpace(text string) string {
+	return strings.TrimLeft(text, " \t")
+}
+
+// ParseLinks extracts the crawlable gemini links from a gemtext document.
+//
+// body is the raw text/gemini document and currentUrl is the URL it was
+// fetched from; currentUrl is the base against which relative link targets
+// are resolved. The result holds, in document order and without
+// de-duplication:
+//
+//   - absolute "gemini://" targets, returned exactly as written, and
+//   - relative targets, returned resolved against currentUrl.
+//
+// Link lines inside ``` preformatted blocks, link lines with no target, and
+// targets that name any other scheme (https://, mailto:, ...) are skipped.
+// The returned slice is never nil.
+//
+// ParseLinks panics if currentUrl itself cannot be parsed.
+func ParseLinks(body string, currentUrl string) []string {
+	base, err := url.Parse(currentUrl)
+	if err != nil {
+		panic(err)
+	}
+
+	geminiLinks := []string{}
+	preformatted := false
+
+	for _, line := range strings.Split(body, "\n") {
+		// Gemtext lines may end in CRLF as well as LF; drop the CR so it can
+		// neither leak into an absolute link nor make url.Parse reject a
+		// relative one.
+		line = strings.TrimSuffix(line, "\r")
+
+		// Any line whose first three characters are "```" toggles
+		// preformatted mode; whatever follows on that line is irrelevant here.
+		if strings.HasPrefix(line, "```") {
+			preformatted = !preformatted
+			continue
+		}
+		if preformatted || !strings.HasPrefix(line, "=>") {
+			continue
+		}
+
+		// A link line is "=>", optional whitespace, the target, and then
+		// optionally whitespace plus a human-readable label.
+		item := stripLeadingWhiteSpace(line[2:])
+		target := item
+		if end := strings.IndexAny(item, " \t"); end != -1 {
+			target = item[:end]
+		}
+
+		switch {
+		case target == "":
+			// "=>" with nothing after it is not a link.
+		case strings.HasPrefix(target, "gemini://"):
+			// Require something after the scheme so a bare "gemini://" is
+			// never returned.
+			if len(target) > len("gemini://") {
+				geminiLinks = append(geminiLinks, target)
+			}
+		case strings.Contains(target, "://"):
+			// Absolute URL for another protocol. Only the target is
+			// inspected, so a label that mentions a URL cannot hide a
+			// relative link.
+		default:
+			u, err := url.Parse(target)
+			if err != nil {
+				fmt.Printf("Unable to parse link: %s, %s\n", item, err)
+				continue
+			}
+			// Scheme-only forms such as "mailto:" contain no "://" but are
+			// not relative; resolving them would return them unchanged.
+			if u.Scheme != "" {
+				continue
+			}
+			geminiLinks = append(geminiLinks, base.ResolveReference(u).String())
+		}
+	}
+
+	return geminiLinks
+}
diff --git a/gemtextparser/parse_test.go b/gemtextparser/parse_test.go
@@ -0,0 +1,81 @@
+package gemtextparser
+
+import (
+ "os"
+ "strings"
+ "testing"
+)
+
+// DATA_DIR is the path prefix prepended to every fixture file name the
+// tests in this file read.
+var DATA_DIR = "test_data/"
+// URL is the page URL the tests in this file pass to ParseLinks as currentUrl.
+var URL = "gemini://laack.co"
+
+// hasSpaceOrTab reports whether text contains at least one space or tab
+// character anywhere in it.
+func hasSpaceOrTab(text string) bool {
+	return strings.ContainsAny(text, " \t")
+}
+
+// readGemtext returns the contents of the file at path as a string.
+//
+// It is marked as a test helper so failures are reported at the caller's
+// line. If the file cannot be read, the calling test is stopped immediately
+// with t.Fatalf and the underlying error is included in the message:
+// carrying on with an empty body would only surface later as a misleading
+// link-count failure.
+func readGemtext(path string, t *testing.T) string {
+	t.Helper()
+
+	data, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("Unable to read %s: %v", path, err)
+	}
+
+	return string(data)
+}
+
+// TestAbsoluteRelativeParsingMatch checks that the relative-link fixture,
+// resolved against URL, yields the same links in the same order as the
+// absolute-link fixture, and that each fixture produces exactly 5 links.
+func TestAbsoluteRelativeParsingMatch(t *testing.T) {
+	absLinks := ParseLinks(readGemtext(DATA_DIR+"absoluteLinks.gmi", t), URL)
+	relLinks := ParseLinks(readGemtext(DATA_DIR+"relativeLinks.gmi", t), URL)
+
+	if len(absLinks) != 5 {
+		t.Errorf("Unexpected number of absolute links: got %d, want 5: %v", len(absLinks), absLinks)
+	}
+	if len(relLinks) != 5 {
+		t.Errorf("Unexpected number of relative links: got %d, want 5: %v", len(relLinks), relLinks)
+	}
+
+	// Both counts are reported above, but the pairwise comparison must not
+	// run after a failure: it indexes absLinks by position in relLinks and
+	// would panic if relLinks were the longer slice.
+	if t.Failed() {
+		return
+	}
+
+	for index, link := range relLinks {
+		if link != absLinks[index] {
+			t.Errorf("Links don't match: %s => %s", link, absLinks[index])
+		}
+	}
+}
+
+// TestParseNonStandardLinks parses the nonStandard.gmi fixture and checks
+// that exactly 12 links come back and that none of them contains a space or
+// a tab.
+func TestParseNonStandardLinks(t *testing.T) {
+	parsed := ParseLinks(readGemtext(DATA_DIR+"nonStandard.gmi", t), URL)
+
+	if got := len(parsed); got != 12 {
+		t.Errorf("Unexpected link count: %v", parsed)
+	}
+
+	// The whitespace check lives in this test because the non-standard
+	// fixture is the one intended to exercise the whitespace variations.
+	for i := range parsed {
+		if !hasSpaceOrTab(parsed[i]) {
+			continue
+		}
+		t.Errorf("Link contains a space or tab, improper whitespace handling: %s", parsed[i])
+	}
+}
+
+// TestParsePreformattedLinks parses the preformatedLinks.gmi fixture and
+// expects exactly 2 links back.
+func TestParsePreformattedLinks(t *testing.T) {
+	const want = 2
+
+	parsed := ParseLinks(readGemtext(DATA_DIR+"preformatedLinks.gmi", t), URL)
+	if len(parsed) != want {
+		t.Errorf("Unexpected link count: %v", parsed)
+	}
+}
diff --git a/gemtext_parser/test_data/absoluteLinks.gmi b/gemtextparser/test_data/absoluteLinks.gmi
diff --git a/gemtext_parser/test_data/nonStandard.gmi b/gemtextparser/test_data/nonStandard.gmi
diff --git a/gemtext_parser/test_data/preformatedLinks.gmi b/gemtextparser/test_data/preformatedLinks.gmi
diff --git a/gemtext_parser/test_data/relativeLinks.gmi b/gemtextparser/test_data/relativeLinks.gmi
diff --git a/go.mod b/go.mod
@@ -1,4 +1,4 @@
-module gemini-search
+module geminisearch
go 1.26.2
diff --git a/index/go.mod b/index/go.mod
@@ -1,5 +0,0 @@
-module main
-
-go 1.26.2
-
-require github.com/mattn/go-sqlite3 v1.14.44
diff --git a/index/go.sum b/index/go.sum
@@ -1,2 +0,0 @@
-github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8=
-github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ=
diff --git a/index/main.go b/index/main.go
@@ -6,7 +6,7 @@ import (
_ "github.com/mattn/go-sqlite3"
)
-var CRAWLED_DIR = "../outputs/crawled"
+var CRAWLED_DIR = "outputs/crawled"
var CRAWLED_DB_NAME = "main.db"
func connectToCrawlDB() *sql.DB {