commit a752a8ed25dcbec2960b66b978c7c5007f16c975
parent 47b532d1127bf3f977a7bcd52f6dfcf9f4d13dbb
Author: Andrew Laack <andrew@laack.co>
Date: Wed, 6 May 2026 03:32:24 -0500
Refactoring, reading and writing to queue, redirect handling, status code handling, only gemtext, etc...
Diffstat:
| M | crawl/main.go | | | 201 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------- |
1 file changed, 175 insertions(+), 26 deletions(-)
diff --git a/crawl/main.go b/crawl/main.go
@@ -3,6 +3,7 @@ package main
import (
"database/sql"
"fmt"
+ "sync"
"errors"
"github.com/makeworld-the-better-one/go-gemini"
_ "github.com/mattn/go-sqlite3"
@@ -13,24 +14,109 @@ import (
"time"
)
-func fetchSite(currentUrl string) (string, error){
+var OUTPUT_DIR = "outputs"
+var DB_NAME = "main.db"
+
+var NotGemtextError = errors.New("Meta stated resource is not gemtext.")
+var TooManyRedirects = errors.New("Too many redirects.")
+var InteractiveStatusCode = errors.New("Interactive status code received.")
+var TempFailureStatusCode = errors.New("Temporary failure status code received.")
+var FailureStatusCode = errors.New("Failure status code received.")
+var CertStatusCode = errors.New("Certificate related status code received.")
+
+
+// TODO: Limit the size of the response body that is read; an unbounded body could exhaust RAM.
+func fetchGemtext(currentUrl string) (string, string, error){
client := &gemini.Client{ConnectTimeout: 5 * time.Second}
resp, err := client.Fetch(currentUrl)
if err != nil {
- return "", err
+ return "", "", err
}
- bodyBytes, err := io.ReadAll(resp.Body)
+ redirects := 0
- body := string(bodyBytes)
+ for resp.Status / 10 == 3 && redirects < 5 {
+ redirects += 1
+
+ fmt.Printf("redirect count: %d, original url: %s\n", redirects, currentUrl)
+
+ base, err := url.Parse(currentUrl)
+
+ if err != nil {
+ return "", "", err
+ }
+
+ redirect, err := url.Parse(resp.Meta)
+
+ if err != nil {
+ return "", "", err
+ }
+
+ currentUrl = base.ResolveReference(redirect).String()
+
+ fmt.Printf("redirect count: %d, resulting url: %s\n", redirects, currentUrl)
+
+ resp, err = client.Fetch(currentUrl)
+
+ if err != nil {
+ return "", "", err
+ }
+ }
+
+ fmt.Printf("Final URL: %s\n", currentUrl)
+
+ if resp.Status / 10 == 3 {
+ return "", "", TooManyRedirects
+ }
+
+
+ // Redirects (3X) were followed above; now check the remaining status code classes, just in case.
+ // Status code reference: https://github.com/AyrA/Gemini/blob/master/Protocol.md
+ // 1X codes are interactive (the server expects input).
+
+ if resp.Status / 10 == 1 {
+ return "", "", InteractiveStatusCode
+ }
+
+ if resp.Status / 10 == 4 {
+ return "", "", TempFailureStatusCode
+ }
+ if resp.Status / 10 == 5 {
+ return "", "", FailureStatusCode
+ }
+ if resp.Status / 10 == 6 {
+ return "", "", CertStatusCode
+ }
+
+ // Only success (2X) responses should fall through to this point.
+
+ meta := resp.Meta
+
+ fmt.Println(meta)
+
+ if !strings.Contains(meta, "text/gemini") {
+ err = NotGemtextError
+ fmt.Printf("Non-gemtext type: %s\n", meta)
+ return "", "", err
+ }
+
+ bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
- panic(err)
+
+ // Reading the body can fail partway through; this has been observed in practice with:
+ // read tcp {local_ip}:40396->{remote_ip}:1965: read: connection reset by peer
+
+ return "", "", err
}
- return body, nil
+ body := string(bodyBytes)
+
+
+ // currentUrl now holds the final URL, after any redirects have been followed.
+ return body, currentUrl, nil
}
@@ -43,7 +129,7 @@ func setupDB(dir string, filename string) *sql.DB {
panic(err)
}
- createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, CONSTRAINT unq UNIQUE (source, destination));"
+ createLink := "CREATE TABLE IF NOT EXISTS query_queue (source UNIQUE, added_unix_timestamp datetime);"
createPage := "CREATE TABLE IF NOT EXISTS page (url UNIQUE, path);"
_, err = db.Exec(createLink)
@@ -58,6 +144,10 @@ func setupDB(dir string, filename string) *sql.DB {
panic(err)
}
+ db.Exec("PRAGMA journal_mode=WAL;")
+ db.Exec("PRAGMA busy_timeout=1000;")
+ db.Exec("PRAGMA synchronous=NORMAL;")
+
return db
}
@@ -109,7 +199,15 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string,
outputPath := outputDirectory+filename
os.MkdirAll(outputDirectory,0755)
- err = os.WriteFile(outputPath, []byte(body), 0644)
+
+ // The 255 filename length limit checked below was verified for ext4; for details see:
+ // https://serverfault.com/questions/9546/filename-length-limits-on-linux
+
+ if len(filename) <= 255 {
+ err = os.WriteFile(outputPath, []byte(body), 0644)
+ } else {
+ fmt.Println("Skipping file creation; filename too long")
+ }
if err != nil {
panic(err)
@@ -118,43 +216,64 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string,
return outputPath, nil
}
-func main() {
+func getNext(db *sql.DB) string {
- outputDir := "outputs"
- dbName := "main.db"
+ link := ""
+ popLinkQuery := ` DELETE FROM query_queue
+ WHERE source = (SELECT source FROM query_queue ORDER BY added_unix_timestamp ASC LIMIT 1)
+ RETURNING source`
- db := setupDB(outputDir, dbName)
- defer db.Close()
+ err := db.QueryRow(popLinkQuery).Scan(&link)
- links := []string{"gemini://tlgs.one"}
+ if err == sql.ErrNoRows {
+ return ""
+ } else if err != nil {
+ panic(err)
+ }
-
- for len(links) > 0 {
+ return link
+}
- currentUrl := links[0]
- links = links[1:]
+func worker(db *sql.DB, wg *sync.WaitGroup) {
+ defer wg.Done()
- body, err := fetchSite(currentUrl)
+ insertPageQuery := "INSERT OR REPLACE INTO page (url, path) VALUES (?, ?)"
+ insertLinkQuery := "INSERT INTO query_queue (source, added_unix_timestamp) VALUES (?, CURRENT_TIMESTAMP) ON CONFLICT DO UPDATE SET added_unix_timestamp=CURRENT_TIMESTAMP"
+ for true {
- if err != nil {
+ currentUrl := getNext(db)
+
+ if currentUrl == "" {
+ fmt.Printf("No links remaining.\n")
+ return
+ }
+ fmt.Printf("Querying: %s\n", currentUrl)
+
+ body, finalUrl, err := fetchGemtext(currentUrl)
+
+ if err == NotGemtextError {
+ fmt.Println("Not gemtext... Continuing")
+ } else if err != nil {
fmt.Printf("Unable to fetch %s: %s\n", currentUrl, err)
continue
}
+ if finalUrl != "" && finalUrl != currentUrl {
+ fmt.Printf("Updating url to reflect redirects: %s => %s\n", currentUrl, finalUrl)
+ currentUrl = finalUrl
+ }
+
fmt.Printf("Fetched %s\n", currentUrl)
forwardGeminiLinks := parseLinks(body, currentUrl)
- outputLocation, err := writeSiteOutput(outputDir, body, currentUrl)
+ outputLocation, err := writeSiteOutput(OUTPUT_DIR, body, currentUrl)
if err != nil {
panic(err)
}
- insertPageQuery := "INSERT OR REPLACE INTO page (url, path) VALUES (?, ?)"
- insertLinkQuery := "INSERT INTO link (source, destination) VALUES (?, ?) ON CONFLICT DO NOTHING"
-
tx, _ := db.Begin()
_, err = tx.Exec(insertPageQuery, currentUrl, outputLocation)
@@ -165,8 +284,7 @@ func main() {
for _, link := range forwardGeminiLinks {
- links = append(links, link)
- _, err := tx.Exec(insertLinkQuery, currentUrl, link)
+ _, err := tx.Exec(insertLinkQuery, link)
if err != nil {
panic(err)
@@ -177,3 +295,34 @@ func main() {
}
}
+
+func main() {
+
+
+ db := setupDB(OUTPUT_DIR, DB_NAME)
+ defer db.Close()
+
+ base := "gemini://tlgs.one"
+
+
+ insertLinkQuery := "INSERT INTO query_queue (source, added_unix_timestamp) VALUES (?, CURRENT_TIMESTAMP) ON CONFLICT DO UPDATE SET added_unix_timestamp=CURRENT_TIMESTAMP"
+
+ // bootstrap
+ _, err := db.Exec(insertLinkQuery, base)
+
+ if err != nil {
+ panic(err)
+ }
+
+ var wg sync.WaitGroup
+
+ // TODO: The queue needs some "seeding" to get going: a worker that finds nothing to pull exits immediately,
+ // so workers stop early when the queue starts with fewer than 20 entries.
+
+ for i := 0; i < 20; i++ {
+ wg.Add(1)
+ go worker(db, &wg)
+ }
+ wg.Wait()
+
+}