gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

commit 099cf03e267ba92f31349539e9e9897272cfcbba
parent 8ae4721500148d1c8af322f0ad5478ead1eb5949
Author: Andrew Laack <andrew@laack.co>
Date:   Tue,  5 May 2026 10:17:32 -0500

Updated hierarchy

Diffstat:
M crawl/main.go | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 66 insertions(+), 17 deletions(-)

diff --git a/crawl/main.go b/crawl/main.go @@ -2,22 +2,24 @@ package main import ( "database/sql" - "github.com/google/uuid" "fmt" + "errors" "github.com/makeworld-the-better-one/go-gemini" _ "github.com/mattn/go-sqlite3" "io" "net/url" "os" "strings" + "time" ) -func fetchSite(currentUrl string) string { +func fetchSite(currentUrl string) (string, error){ - resp, err := gemini.Fetch(currentUrl) + client := &gemini.Client{ConnectTimeout: 5 * time.Second} + resp, err := client.Fetch(currentUrl) if err != nil { - panic(err) + return "", err } bodyBytes, err := io.ReadAll(resp.Body) @@ -28,7 +30,7 @@ func fetchSite(currentUrl string) string { panic(err) } - return body + return body, nil } @@ -71,7 +73,7 @@ func parseLinks(body string, currentUrl string) []string { if indexOfSpace == -1 { indexOfSpace = len(item) } - if strings.Compare(item[:9], "gemini://") == 0 { + if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 { geminiLinks = append(geminiLinks, item[0:indexOfSpace]) } if strings.Contains(item, "://") == false { @@ -100,7 +102,6 @@ func setupDB(dir string, filename string) *sql.DB { } createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, CONSTRAINT unq UNIQUE (source, destination));" - createPage := "CREATE TABLE IF NOT EXISTS page (url UNIQUE, path);" _, err = db.Exec(createLink) @@ -118,18 +119,55 @@ func setupDB(dir string, filename string) *sql.DB { return db } -func writeSiteOutput(outputDir string, body string) string{ +func missingFilename(path string) bool { + return strings.Compare(path, "") == 0 || strings.Compare(path, "/") == 0 +} + +// The reason I chose to do {host}/{escaped_path} instead of a UUID or just {escaped_url} is to +// have a better hierarchy for looking at files that are downloaded. + +// I could've done a more deeply nested structure, but cases like the following become annoying: +// {site}/blog +// {site}/blog/entry.gmi +// I could probably assume the first one is /blog/index.gmi, but since the '.' 
character is valid in urls +// there doesn't seem to be a reliable way to differentiate between directories and files in paths. +// This is to say, different server implementation may do weird stuff, and we shouldn't make assumptions about +// that sort of thing. I also don't want to limit myself to **just** gemtext so this seems good enough for now. + +func writeSiteOutput(outputDir string, body string, currentUrl string) (string, error){ + + u, err := url.Parse(currentUrl) + + if err != nil { + panic(err) + } + + if !u.IsAbs() { + return "", errors.New("Must use absolute paths.") + } + + filename := u.EscapedPath() + dirname := u.Host - filename := uuid.New().String() - os.MkdirAll(outputDir+"/"+"sites",0755) - err := os.WriteFile(outputDir+"/"+"sites"+"/"+filename, []byte(body), 0644) + if missingFilename(filename) { + filename = "index.gmi" + } + + filename = url.PathEscape(filename) + + + outputDirectory := outputDir+"/"+"sites"+"/"+dirname+"/" + outputPath := outputDirectory+filename + + os.MkdirAll(outputDirectory,0755) + err = os.WriteFile(outputPath, []byte(body), 0644) if err != nil { panic(err) } - return outputDir+"/"+"sites"+"/"+filename + return outputPath, nil } func main() { @@ -141,7 +179,7 @@ func main() { db := setupDB(outputDir, dbName) defer db.Close() - links := []string{"gemini://perso.pw/blog/index.gmi"} + links := []string{"gemini://tlgs.one"} for len(links) > 0 { @@ -149,18 +187,29 @@ func main() { currentUrl := links[0] links = links[1:] - body := fetchSite(currentUrl) + body, err := fetchSite(currentUrl) + + if err != nil { + fmt.Printf("Unable to fetch %s: %s\n", currentUrl, err) + continue + } + + fmt.Printf("Fetched %s\n", currentUrl) + forwardGeminiLinks := parseLinks(body, currentUrl) - outputLocation := writeSiteOutput(outputDir, body) - fmt.Println(outputLocation) + outputLocation, err := writeSiteOutput(outputDir, body, currentUrl) + + if err != nil { + panic(err) + } insertPageQuery := "INSERT OR REPLACE INTO page (url, path) 
VALUES (?, ?)" insertLinkQuery := "INSERT INTO link (source, destination) VALUES (?, ?) ON CONFLICT DO NOTHING" tx, _ := db.Begin() - _, err := tx.Exec(insertPageQuery, currentUrl, outputLocation) + _, err = tx.Exec(insertPageQuery, currentUrl, outputLocation) if err != nil { panic(err)