gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

commit 390daf67fdf852102628bbc6df4aff499ccb566c
parent a752a8ed25dcbec2960b66b978c7c5007f16c975
Author: Andrew Laack <andrew@laack.co>
Date:   Wed,  6 May 2026 03:34:38 -0500

Formatter + comment

Diffstat:
M crawl/main.go | 43 +++++++++++++++++++------------------------
1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/crawl/main.go b/crawl/main.go @@ -2,15 +2,15 @@ package main import ( "database/sql" - "fmt" - "sync" "errors" + "fmt" "github.com/makeworld-the-better-one/go-gemini" _ "github.com/mattn/go-sqlite3" "io" "net/url" "os" "strings" + "sync" "time" ) @@ -24,9 +24,9 @@ var TempFailureStatusCode = errors.New("Temporary failure status code received." var FailureStatusCode = errors.New("Failure status code received.") var CertStatusCode = errors.New("Certificate related status code received.") - +// TODO: Respect robots.txt (does this go here? Does this go on insertion for querying?) // TODO: Limit size returned, could fuck up ram -func fetchGemtext(currentUrl string) (string, string, error){ +func fetchGemtext(currentUrl string) (string, string, error) { client := &gemini.Client{ConnectTimeout: 5 * time.Second} resp, err := client.Fetch(currentUrl) @@ -37,7 +37,7 @@ func fetchGemtext(currentUrl string) (string, string, error){ redirects := 0 - for resp.Status / 10 == 3 && redirects < 5 { + for resp.Status/10 == 3 && redirects < 5 { redirects += 1 fmt.Printf("redirect count: %d, original url: %s\n", redirects, currentUrl) @@ -67,31 +67,30 @@ func fetchGemtext(currentUrl string) (string, string, error){ fmt.Printf("Final URL: %s\n", currentUrl) - if resp.Status / 10 == 3 { + if resp.Status/10 == 3 { return "", "", TooManyRedirects } - // Followed redirects first, then check the rest of the status codes just in case // https://github.com/AyrA/Gemini/blob/master/Protocol.md // 1X are interactive - if resp.Status / 10 == 1 { + if resp.Status/10 == 1 { return "", "", InteractiveStatusCode } - if resp.Status / 10 == 4 { + if resp.Status/10 == 4 { return "", "", TempFailureStatusCode } - if resp.Status / 10 == 5 { + if resp.Status/10 == 5 { return "", "", FailureStatusCode } - if resp.Status / 10 == 6 { + if resp.Status/10 == 6 { return "", "", CertStatusCode } // Only success (20) should fall through at this point. 
- + meta := resp.Meta fmt.Println(meta) @@ -114,7 +113,6 @@ func fetchGemtext(currentUrl string) (string, string, error){ body := string(bodyBytes) - // CurrentURL reflects the url after redirects and such applied. return body, currentUrl, nil @@ -152,21 +150,21 @@ func setupDB(dir string, filename string) *sql.DB { } func missingFilename(path string) bool { - return strings.Compare(path, "") == 0 || strings.Compare(path, "/") == 0 + return strings.Compare(path, "") == 0 || strings.Compare(path, "/") == 0 } // The reason I chose to do {host}/{escaped_path} instead of a UUID or just {escaped_url} is to -// have a better hierarchy for looking at files that are downloaded. +// have a better hierarchy for looking at files that are downloaded. // I could've done a more deeply nested structure, but cases like the following become annoying: // {site}/blog // {site}/blog/entry.gmi // I could probably assume the first one is /blog/index.gmi, but since the '.' character is valid in urls // there doesn't seem to be a reliable way to differentiate between directories and files in paths. -// This is to say, different server implementation may do weird stuff, and we shouldn't make assumptions about +// This is to say, different server implementation may do weird stuff, and we shouldn't make assumptions about // that sort of thing. I also don't want to limit myself to **just** gemtext so this seems good enough for now. -func writeSiteOutput(outputDir string, body string, currentUrl string) (string, error){ +func writeSiteOutput(outputDir string, body string, currentUrl string) (string, error) { u, err := url.Parse(currentUrl) @@ -183,7 +181,7 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string, dirname := u.Host // include subdomains if missingFilename(filename) { - + // this is presumptuous, albeit quite standard. // main reason for this is to have uniquely identifying names for files. // unfortunately no way to create a file with an empty filename. 
@@ -194,12 +192,11 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string, filename = url.PathEscape(filename) + outputDirectory := outputDir + "/" + "sites" + "/" + dirname + "/" + outputPath := outputDirectory + filename - outputDirectory := outputDir+"/"+"sites"+"/"+dirname+"/" - outputPath := outputDirectory+filename + os.MkdirAll(outputDirectory, 0755) - os.MkdirAll(outputDirectory,0755) - // verified correct for ext4, see here for details: // https://serverfault.com/questions/9546/filename-length-limits-on-linux @@ -298,13 +295,11 @@ func worker(db *sql.DB, wg *sync.WaitGroup) { func main() { - db := setupDB(OUTPUT_DIR, DB_NAME) defer db.Close() base := "gemini://tlgs.one" - insertLinkQuery := "INSERT INTO query_queue (source, added_unix_timestamp) VALUES (?, CURRENT_TIMESTAMP) ON CONFLICT DO UPDATE SET added_unix_timestamp=CURRENT_TIMESTAMP" // bootstrap