commit 390daf67fdf852102628bbc6df4aff499ccb566c
parent a752a8ed25dcbec2960b66b978c7c5007f16c975
Author: Andrew Laack <andrew@laack.co>
Date: Wed, 6 May 2026 03:34:38 -0500
Formatter + comment
Diffstat:
1 file changed, 19 insertions(+), 24 deletions(-)
diff --git a/crawl/main.go b/crawl/main.go
@@ -2,15 +2,15 @@ package main
import (
"database/sql"
- "fmt"
- "sync"
"errors"
+ "fmt"
"github.com/makeworld-the-better-one/go-gemini"
_ "github.com/mattn/go-sqlite3"
"io"
"net/url"
"os"
"strings"
+ "sync"
"time"
)
@@ -24,9 +24,9 @@ var TempFailureStatusCode = errors.New("Temporary failure status code received."
var FailureStatusCode = errors.New("Failure status code received.")
var CertStatusCode = errors.New("Certificate related status code received.")
-
+// TODO: Respect robots.txt (does this go here? Does this go on insertion for querying?)
// TODO: Limit the size of the response body that is read; an unbounded response could exhaust RAM
-func fetchGemtext(currentUrl string) (string, string, error){
+func fetchGemtext(currentUrl string) (string, string, error) {
client := &gemini.Client{ConnectTimeout: 5 * time.Second}
resp, err := client.Fetch(currentUrl)
@@ -37,7 +37,7 @@ func fetchGemtext(currentUrl string) (string, string, error){
redirects := 0
- for resp.Status / 10 == 3 && redirects < 5 {
+ for resp.Status/10 == 3 && redirects < 5 {
redirects += 1
fmt.Printf("redirect count: %d, original url: %s\n", redirects, currentUrl)
@@ -67,31 +67,30 @@ func fetchGemtext(currentUrl string) (string, string, error){
fmt.Printf("Final URL: %s\n", currentUrl)
- if resp.Status / 10 == 3 {
+ if resp.Status/10 == 3 {
return "", "", TooManyRedirects
}
-
// Followed redirects first, then check the rest of the status codes just in case
// https://github.com/AyrA/Gemini/blob/master/Protocol.md
// 1X are interactive
- if resp.Status / 10 == 1 {
+ if resp.Status/10 == 1 {
return "", "", InteractiveStatusCode
}
- if resp.Status / 10 == 4 {
+ if resp.Status/10 == 4 {
return "", "", TempFailureStatusCode
}
- if resp.Status / 10 == 5 {
+ if resp.Status/10 == 5 {
return "", "", FailureStatusCode
}
- if resp.Status / 10 == 6 {
+ if resp.Status/10 == 6 {
return "", "", CertStatusCode
}
// Only success (20) should fall through at this point.
-
+
meta := resp.Meta
fmt.Println(meta)
@@ -114,7 +113,6 @@ func fetchGemtext(currentUrl string) (string, string, error){
body := string(bodyBytes)
-
// CurrentURL reflects the url after redirects and such applied.
return body, currentUrl, nil
@@ -152,21 +150,21 @@ func setupDB(dir string, filename string) *sql.DB {
}
func missingFilename(path string) bool {
- return strings.Compare(path, "") == 0 || strings.Compare(path, "/") == 0
+ return strings.Compare(path, "") == 0 || strings.Compare(path, "/") == 0
}
// The reason I chose to do {host}/{escaped_path} instead of a UUID or just {escaped_url} is to
-// have a better hierarchy for looking at files that are downloaded.
+// have a better hierarchy for looking at files that are downloaded.
// I could've done a more deeply nested structure, but cases like the following become annoying:
// {site}/blog
// {site}/blog/entry.gmi
// I could probably assume the first one is /blog/index.gmi, but since the '.' character is valid in urls
// there doesn't seem to be a reliable way to differentiate between directories and files in paths.
-// This is to say, different server implementation may do weird stuff, and we shouldn't make assumptions about
+// This is to say, different server implementations may do weird stuff, and we shouldn't make assumptions about
// that sort of thing. I also don't want to limit myself to **just** gemtext so this seems good enough for now.
-func writeSiteOutput(outputDir string, body string, currentUrl string) (string, error){
+func writeSiteOutput(outputDir string, body string, currentUrl string) (string, error) {
u, err := url.Parse(currentUrl)
@@ -183,7 +181,7 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string,
dirname := u.Host // include subdomains
if missingFilename(filename) {
-
+
// this is presumptuous, albeit quite standard.
// main reason for this is to have uniquely identifying names for files.
// unfortunately no way to create a file with an empty filename.
@@ -194,12 +192,11 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string,
filename = url.PathEscape(filename)
+ outputDirectory := outputDir + "/" + "sites" + "/" + dirname + "/"
+ outputPath := outputDirectory + filename
- outputDirectory := outputDir+"/"+"sites"+"/"+dirname+"/"
- outputPath := outputDirectory+filename
+ os.MkdirAll(outputDirectory, 0755)
- os.MkdirAll(outputDirectory,0755)
-
// verified correct for ext4, see here for details:
// https://serverfault.com/questions/9546/filename-length-limits-on-linux
@@ -298,13 +295,11 @@ func worker(db *sql.DB, wg *sync.WaitGroup) {
func main() {
-
db := setupDB(OUTPUT_DIR, DB_NAME)
defer db.Close()
base := "gemini://tlgs.one"
-
insertLinkQuery := "INSERT INTO query_queue (source, added_unix_timestamp) VALUES (?, CURRENT_TIMESTAMP) ON CONFLICT DO UPDATE SET added_unix_timestamp=CURRENT_TIMESTAMP"
// bootstrap