commit 099cf03e267ba92f31349539e9e9897272cfcbba
parent 8ae4721500148d1c8af322f0ad5478ead1eb5949
Author: Andrew Laack <andrew@laack.co>
Date: Tue, 5 May 2026 10:17:32 -0500
Updated hierarchy
Diffstat:
| M | crawl/main.go | | | 83 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------- |
1 file changed, 66 insertions(+), 17 deletions(-)
diff --git a/crawl/main.go b/crawl/main.go
@@ -2,22 +2,24 @@ package main
import (
"database/sql"
- "github.com/google/uuid"
"fmt"
+ "errors"
"github.com/makeworld-the-better-one/go-gemini"
_ "github.com/mattn/go-sqlite3"
"io"
"net/url"
"os"
"strings"
+ "time"
)
-func fetchSite(currentUrl string) string {
+func fetchSite(currentUrl string) (string, error){
- resp, err := gemini.Fetch(currentUrl)
+ client := &gemini.Client{ConnectTimeout: 5 * time.Second}
+ resp, err := client.Fetch(currentUrl)
if err != nil {
- panic(err)
+ return "", err
}
bodyBytes, err := io.ReadAll(resp.Body)
@@ -28,7 +30,7 @@ func fetchSite(currentUrl string) string {
panic(err)
}
- return body
+ return body, nil
}
@@ -71,7 +73,7 @@ func parseLinks(body string, currentUrl string) []string {
if indexOfSpace == -1 {
indexOfSpace = len(item)
}
- if strings.Compare(item[:9], "gemini://") == 0 {
+ if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 {
geminiLinks = append(geminiLinks, item[0:indexOfSpace])
}
if strings.Contains(item, "://") == false {
@@ -100,7 +102,6 @@ func setupDB(dir string, filename string) *sql.DB {
}
createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, CONSTRAINT unq UNIQUE (source, destination));"
-
createPage := "CREATE TABLE IF NOT EXISTS page (url UNIQUE, path);"
_, err = db.Exec(createLink)
@@ -118,18 +119,55 @@ func setupDB(dir string, filename string) *sql.DB {
return db
}
-func writeSiteOutput(outputDir string, body string) string{
+func missingFilename(path string) bool {
+ return strings.Compare(path, "") == 0 || strings.Compare(path, "/") == 0
+}
+
+// The reason I chose to do {host}/{escaped_path} instead of a UUID or just {escaped_url} is to
+// have a better hierarchy for looking at files that are downloaded.
+
+// I could've done a more deeply nested structure, but cases like the following become annoying:
+// {site}/blog
+// {site}/blog/entry.gmi
+// I could probably assume the first one is /blog/index.gmi, but since the '.' character is valid in URLs,
+// there doesn't seem to be a reliable way to differentiate between directories and files in paths.
+// This is to say, different server implementations may do weird stuff, and we shouldn't make assumptions about
+// that sort of thing. I also don't want to limit myself to **just** gemtext, so this seems good enough for now.
+
+func writeSiteOutput(outputDir string, body string, currentUrl string) (string, error){
+
+ u, err := url.Parse(currentUrl)
+
+ if err != nil {
+ panic(err)
+ }
+
+ if !u.IsAbs() {
+ return "", errors.New("Must use absolute paths.")
+ }
+
+ filename := u.EscapedPath()
+ dirname := u.Host
- filename := uuid.New().String()
- os.MkdirAll(outputDir+"/"+"sites",0755)
- err := os.WriteFile(outputDir+"/"+"sites"+"/"+filename, []byte(body), 0644)
+ if missingFilename(filename) {
+ filename = "index.gmi"
+ }
+
+ filename = url.PathEscape(filename)
+
+
+ outputDirectory := outputDir+"/"+"sites"+"/"+dirname+"/"
+ outputPath := outputDirectory+filename
+
+ os.MkdirAll(outputDirectory,0755)
+ err = os.WriteFile(outputPath, []byte(body), 0644)
if err != nil {
panic(err)
}
- return outputDir+"/"+"sites"+"/"+filename
+ return outputPath, nil
}
func main() {
@@ -141,7 +179,7 @@ func main() {
db := setupDB(outputDir, dbName)
defer db.Close()
- links := []string{"gemini://perso.pw/blog/index.gmi"}
+ links := []string{"gemini://tlgs.one"}
for len(links) > 0 {
@@ -149,18 +187,29 @@ func main() {
currentUrl := links[0]
links = links[1:]
- body := fetchSite(currentUrl)
+ body, err := fetchSite(currentUrl)
+
+ if err != nil {
+ fmt.Printf("Unable to fetch %s: %s\n", currentUrl, err)
+ continue
+ }
+
+ fmt.Printf("Fetched %s\n", currentUrl)
+
forwardGeminiLinks := parseLinks(body, currentUrl)
- outputLocation := writeSiteOutput(outputDir, body)
- fmt.Println(outputLocation)
+ outputLocation, err := writeSiteOutput(outputDir, body, currentUrl)
+
+ if err != nil {
+ panic(err)
+ }
insertPageQuery := "INSERT OR REPLACE INTO page (url, path) VALUES (?, ?)"
insertLinkQuery := "INSERT INTO link (source, destination) VALUES (?, ?) ON CONFLICT DO NOTHING"
tx, _ := db.Begin()
- _, err := tx.Exec(insertPageQuery, currentUrl, outputLocation)
+ _, err = tx.Exec(insertPageQuery, currentUrl, outputLocation)
if err != nil {
panic(err)