gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

commit a752a8ed25dcbec2960b66b978c7c5007f16c975
parent 47b532d1127bf3f977a7bcd52f6dfcf9f4d13dbb
Author: Andrew Laack <andrew@laack.co>
Date:   Wed,  6 May 2026 03:32:24 -0500

Refactoring, reading and writing to queue, redirect handling, status code handling, only gemtext, etc...

Diffstat:
M crawl/main.go | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 175 insertions(+), 26 deletions(-)

diff --git a/crawl/main.go b/crawl/main.go @@ -3,6 +3,7 @@ package main import ( "database/sql" "fmt" + "sync" "errors" "github.com/makeworld-the-better-one/go-gemini" _ "github.com/mattn/go-sqlite3" @@ -13,24 +14,109 @@ import ( "time" ) -func fetchSite(currentUrl string) (string, error){ +var OUTPUT_DIR = "outputs" +var DB_NAME = "main.db" + +var NotGemtextError = errors.New("Meta stated resource is not gemtext.") +var TooManyRedirects = errors.New("Too many redirects.") +var InteractiveStatusCode = errors.New("Interactive status code received.") +var TempFailureStatusCode = errors.New("Temporary failure status code received.") +var FailureStatusCode = errors.New("Failure status code received.") +var CertStatusCode = errors.New("Certificate related status code received.") + + +// TODO: Limit size returned, could fuck up ram +func fetchGemtext(currentUrl string) (string, string, error){ client := &gemini.Client{ConnectTimeout: 5 * time.Second} resp, err := client.Fetch(currentUrl) if err != nil { - return "", err + return "", "", err } - bodyBytes, err := io.ReadAll(resp.Body) + redirects := 0 - body := string(bodyBytes) + for resp.Status / 10 == 3 && redirects < 5 { + redirects += 1 + + fmt.Printf("redirect count: %d, original url: %s\n", redirects, currentUrl) + + base, err := url.Parse(currentUrl) + + if err != nil { + return "", "", err + } + + redirect, err := url.Parse(resp.Meta) + + if err != nil { + return "", "", err + } + + currentUrl = base.ResolveReference(redirect).String() + + fmt.Printf("redirect count: %d, resulting url: %s\n", redirects, currentUrl) + + resp, err = client.Fetch(currentUrl) + + if err != nil { + return "", "", err + } + } + + fmt.Printf("Final URL: %s\n", currentUrl) + + if resp.Status / 10 == 3 { + return "", "", TooManyRedirects + } + + + // Followed redirects first, then check the rest of the status codes just in case + // https://github.com/AyrA/Gemini/blob/master/Protocol.md + // 1X are interactive + + if resp.Status / 10 == 1 
{ + return "", "", InteractiveStatusCode + } + + if resp.Status / 10 == 4 { + return "", "", TempFailureStatusCode + } + if resp.Status / 10 == 5 { + return "", "", FailureStatusCode + } + if resp.Status / 10 == 6 { + return "", "", CertStatusCode + } + + // Only success (20) should fall through at this point. + + meta := resp.Meta + + fmt.Println(meta) + + if !strings.Contains(meta, "text/gemini") { + err = NotGemtextError + fmt.Printf("Non-gemtext type: %s\n", meta) + return "", "", err + } + + bodyBytes, err := io.ReadAll(resp.Body) if err != nil { - panic(err) + + // saw this happen with + // read tcp {local_ip}:40396->{remote_ip}:1965: read: connection reset by peer + + return "", "", err } - return body, nil + body := string(bodyBytes) + + + // CurrentURL reflects the url after redirects and such applied. + return body, currentUrl, nil } @@ -43,7 +129,7 @@ func setupDB(dir string, filename string) *sql.DB { panic(err) } - createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, CONSTRAINT unq UNIQUE (source, destination));" + createLink := "CREATE TABLE IF NOT EXISTS query_queue (source UNIQUE, added_unix_timestamp datetime);" createPage := "CREATE TABLE IF NOT EXISTS page (url UNIQUE, path);" _, err = db.Exec(createLink) @@ -58,6 +144,10 @@ func setupDB(dir string, filename string) *sql.DB { panic(err) } + db.Exec("PRAGMA journal_mode=WAL;") + db.Exec("PRAGMA busy_timeout=1000;") + db.Exec("PRAGMA synchronous=NORMAL;") + return db } @@ -109,7 +199,15 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string, outputPath := outputDirectory+filename os.MkdirAll(outputDirectory,0755) - err = os.WriteFile(outputPath, []byte(body), 0644) + + // verified correct for ext4, see here for details: + // https://serverfault.com/questions/9546/filename-length-limits-on-linux + + if len(filename) <= 255 { + err = os.WriteFile(outputPath, []byte(body), 0644) + } else { + fmt.Println("Skipping file creation; filename too long") + } if err 
!= nil { panic(err) @@ -118,43 +216,64 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string, return outputPath, nil } -func main() { +func getNext(db *sql.DB) string { - outputDir := "outputs" - dbName := "main.db" + link := "" + popLinkQuery := ` DELETE FROM query_queue + WHERE source = (SELECT source FROM query_queue ORDER BY added_unix_timestamp ASC LIMIT 1) + RETURNING source` - db := setupDB(outputDir, dbName) - defer db.Close() + err := db.QueryRow(popLinkQuery).Scan(&link) - links := []string{"gemini://tlgs.one"} + if err == sql.ErrNoRows { + return "" + } else if err != nil { + panic(err) + } - - for len(links) > 0 { + return link +} - currentUrl := links[0] - links = links[1:] +func worker(db *sql.DB, wg *sync.WaitGroup) { + defer wg.Done() - body, err := fetchSite(currentUrl) + insertPageQuery := "INSERT OR REPLACE INTO page (url, path) VALUES (?, ?)" + insertLinkQuery := "INSERT INTO query_queue (source, added_unix_timestamp) VALUES (?, CURRENT_TIMESTAMP) ON CONFLICT DO UPDATE SET added_unix_timestamp=CURRENT_TIMESTAMP" + for true { - if err != nil { + currentUrl := getNext(db) + + if currentUrl == "" { + fmt.Printf("No links remaining.\n") + return + } + fmt.Printf("Querying: %s\n", currentUrl) + + body, finalUrl, err := fetchGemtext(currentUrl) + + if err == NotGemtextError { + fmt.Println("Not gemtext... 
Continuing") + } else if err != nil { fmt.Printf("Unable to fetch %s: %s\n", currentUrl, err) continue } + if finalUrl != "" && finalUrl != currentUrl { + fmt.Printf("Updating url to reflect redirects: %s => %s\n", currentUrl, finalUrl) + currentUrl = finalUrl + } + fmt.Printf("Fetched %s\n", currentUrl) forwardGeminiLinks := parseLinks(body, currentUrl) - outputLocation, err := writeSiteOutput(outputDir, body, currentUrl) + outputLocation, err := writeSiteOutput(OUTPUT_DIR, body, currentUrl) if err != nil { panic(err) } - insertPageQuery := "INSERT OR REPLACE INTO page (url, path) VALUES (?, ?)" - insertLinkQuery := "INSERT INTO link (source, destination) VALUES (?, ?) ON CONFLICT DO NOTHING" - tx, _ := db.Begin() _, err = tx.Exec(insertPageQuery, currentUrl, outputLocation) @@ -165,8 +284,7 @@ func main() { for _, link := range forwardGeminiLinks { - links = append(links, link) - _, err := tx.Exec(insertLinkQuery, currentUrl, link) + _, err := tx.Exec(insertLinkQuery, link) if err != nil { panic(err) @@ -177,3 +295,34 @@ func main() { } } + +func main() { + + + db := setupDB(OUTPUT_DIR, DB_NAME) + defer db.Close() + + base := "gemini://tlgs.one" + + + insertLinkQuery := "INSERT INTO query_queue (source, added_unix_timestamp) VALUES (?, CURRENT_TIMESTAMP) ON CONFLICT DO UPDATE SET added_unix_timestamp=CURRENT_TIMESTAMP" + + // bootstrap + _, err := db.Exec(insertLinkQuery, base) + + if err != nil { + panic(err) + } + + var wg sync.WaitGroup + + // TODO: This requires some "seeding" to get going bc these will find there is nothing to pull and stop early + // when queue starts with <20 entries. + + for i := 0; i < 20; i++ { + wg.Add(1) + go worker(db, &wg) + } + wg.Wait() + +}