main.go (8637B)
1 package main 2 3 import ( 4 "database/sql" 5 "errors" 6 "fmt" 7 "geminisearch/gemtextparser" 8 "github.com/makeworld-the-better-one/go-gemini" 9 _ "github.com/mattn/go-sqlite3" 10 "io" 11 "net/url" 12 "os" 13 "strings" 14 "sync" 15 "time" 16 ) 17 18 var OUTPUT_DIR = "outputs/crawled" 19 var DB_NAME = "main.db" 20 21 var NotGemtextError = errors.New("Meta stated resource is not gemtext.") 22 var TooManyRedirects = errors.New("Too many redirects.") 23 var InteractiveStatusCode = errors.New("Interactive status code received.") 24 var TempFailureStatusCode = errors.New("Temporary failure status code received.") 25 var FailureStatusCode = errors.New("Failure status code received.") 26 var CertStatusCode = errors.New("Certificate related status code received.") 27 28 // TODO: Respect robots.txt (does this go here? Does this go on insertion for querying?) 29 // TODO: Limit size returned, could fuck up ram 30 31 func fetchGemtext(currentUrl string) (string, string, error) { 32 33 client := &gemini.Client{ConnectTimeout: 5 * time.Second} 34 resp, err := client.Fetch(currentUrl) 35 36 if err != nil { 37 return "", "", err 38 } 39 40 redirects := 0 41 42 for resp.Status/10 == 3 && redirects < 5 { 43 redirects += 1 44 45 fmt.Printf("redirect count: %d, original url: %s\n", redirects, currentUrl) 46 47 base, err := url.Parse(currentUrl) 48 49 if err != nil { 50 return "", "", err 51 } 52 53 redirect, err := url.Parse(resp.Meta) 54 55 if err != nil { 56 return "", "", err 57 } 58 59 currentUrl = base.ResolveReference(redirect).String() 60 61 fmt.Printf("redirect count: %d, resulting url: %s\n", redirects, currentUrl) 62 63 resp, err = client.Fetch(currentUrl) 64 65 if err != nil { 66 return "", "", err 67 } 68 } 69 70 fmt.Printf("Final URL: %s\n", currentUrl) 71 72 if resp.Status/10 == 3 { 73 return "", "", TooManyRedirects 74 } 75 76 // Followed redirects first, then check the rest of the status codes just in case 77 // https://github.com/AyrA/Gemini/blob/master/Protocol.md 78 // 1X are interactive 79 80 if resp.Status/10 == 1 { 81 return "", "", InteractiveStatusCode 82 } 83 84 if resp.Status/10 == 4 { 85 return "", "", TempFailureStatusCode 86 } 87 if resp.Status/10 == 5 { 88 return "", "", FailureStatusCode 89 } 90 if resp.Status/10 == 6 { 91 return "", "", CertStatusCode 92 } 93 94 // Only success (20) should fall through at this point. 95 96 meta := resp.Meta 97 98 fmt.Println(meta) 99 100 if !strings.Contains(meta, "text/gemini") { 101 err = NotGemtextError 102 fmt.Printf("Non-gemtext type: %s\n", meta) 103 return "", "", err 104 } 105 106 bodyBytes, err := io.ReadAll(resp.Body) 107 108 if err != nil { 109 110 // saw this happen with 111 // read tcp {local_ip}:40396->{remote_ip}:1965: read: connection reset by peer 112 113 return "", "", err 114 } 115 116 body := string(bodyBytes) 117 118 // CurrentURL reflects the url after redirects and such applied. 119 return body, currentUrl, nil 120 121 } 122 123 func setupDB(dir string, filename string) *sql.DB { 124 125 os.MkdirAll(dir, 0755) 126 db, err := sql.Open("sqlite3", "./"+dir+"/"+filename) 127 128 if err != nil { 129 panic(err) 130 } 131 132 createLink := "CREATE TABLE IF NOT EXISTS query_queue (source UNIQUE, added_timestamp datetime);" 133 createPage := "CREATE TABLE IF NOT EXISTS page (url UNIQUE, path, added_timestamp datetime);" 134 135 _, err = db.Exec(createLink) 136 137 if err != nil { 138 panic(err) 139 } 140 141 _, err = db.Exec(createPage) 142 143 if err != nil { 144 panic(err) 145 } 146 147 db.Exec("PRAGMA journal_mode=WAL;") 148 db.Exec("PRAGMA busy_timeout=2000;") // :( is it so soon the migration to postgresql is initiated? 149 db.Exec("PRAGMA synchronous=NORMAL;") 150 151 return db 152 } 153 154 func missingFilename(path string) bool { 155 return strings.Compare(path, "") == 0 || strings.Compare(path, "/") == 0 156 } 157 158 // The reason I chose to do {host}/{escaped_path} instead of a UUID or just {escaped_url} is to 159 // have a better hierarchy for looking at files that are downloaded. 160 161 // I could've done a more deeply nested structure, but cases like the following become annoying: 162 // {site}/blog 163 // {site}/blog/entry.gmi 164 // I could probably assume the first one is /blog/index.gmi, but since the '.' character is valid in urls 165 // there doesn't seem to be a reliable way to differentiate between directories and files in paths. 166 // This is to say, different server implementation may do weird stuff, and we shouldn't make assumptions about 167 // that sort of thing. I also don't want to limit myself to **just** gemtext so this seems good enough for now. 168 169 func writeSiteOutput(outputDir string, body string, currentUrl string) (string, error) { 170 171 u, err := url.Parse(currentUrl) 172 173 if err != nil { 174 panic(err) 175 } 176 177 if !u.IsAbs() { 178 return "", errors.New("Must use absolute paths.") 179 } 180 181 // This sometimes seems to include a leading '/' so drop that, no need, directory 182 // hierarchy gives this information to us on the fs. 183 184 filename := u.EscapedPath() 185 186 if len(filename) > 1 && filename[0] == '/' { 187 filename = filename[1:] 188 } 189 190 dirname := u.Host // include subdomains 191 192 if missingFilename(filename) { 193 194 // this is presumptuous, albeit quite standard. 195 // main reason for this is to have uniquely identifying names for files. 196 // unfortunately no way to create a file with an empty filename. 197 // this can result in ambiguity between {site}/index.gmi and {site}, but we shall assume they are the same. 198 199 filename = "index.gmi" 200 } 201 202 filename = url.PathEscape(filename) 203 204 outputDirectory := outputDir + "/" + "sites" + "/" + dirname + "/" 205 outputPath := outputDirectory + filename 206 207 os.MkdirAll(outputDirectory, 0755) 208 209 // verified correct for ext4, see here for details: 210 // https://serverfault.com/questions/9546/filename-length-limits-on-linux 211 212 if len(filename) <= 255 { 213 err = os.WriteFile(outputPath, []byte(body), 0644) 214 } else { 215 fmt.Println("Skipping file creation; filename too long") 216 } 217 218 if err != nil { 219 panic(err) 220 } 221 222 return outputPath, nil 223 } 224 225 func getNext(db *sql.DB) string { 226 227 link := "" 228 229 popLinkQuery := ` DELETE FROM query_queue 230 WHERE source = (SELECT source FROM query_queue ORDER BY added_timestamp ASC LIMIT 1) 231 RETURNING source` 232 233 err := db.QueryRow(popLinkQuery).Scan(&link) 234 235 if err == sql.ErrNoRows { 236 return "" 237 } else if err != nil { 238 panic(err) 239 } 240 241 return link 242 } 243 244 func worker(db *sql.DB, wg *sync.WaitGroup) { 245 defer wg.Done() 246 247 insertPageQuery := "INSERT OR REPLACE INTO page (url, path, added_timestamp) VALUES (?, ?, CURRENT_TIMESTAMP)" 248 insertLinkQuery := "INSERT INTO query_queue (source, added_timestamp) VALUES (?, CURRENT_TIMESTAMP) ON CONFLICT DO UPDATE SET added_timestamp=CURRENT_TIMESTAMP" 249 for true { 250 251 currentUrl := getNext(db) 252 253 if currentUrl == "" { 254 fmt.Printf("No links remaining.\n") 255 return 256 } 257 fmt.Printf("Querying: %s\n", currentUrl) 258 259 body, finalUrl, err := fetchGemtext(currentUrl) 260 261 if err == NotGemtextError { 262 fmt.Println("Not gemtext... Continuing") 263 } else if err != nil { 264 fmt.Printf("Unable to fetch %s: %s\n", currentUrl, err) 265 continue 266 } 267 268 if finalUrl != "" && finalUrl != currentUrl { 269 fmt.Printf("Updating url to reflect redirects: %s => %s\n", currentUrl, finalUrl) 270 currentUrl = finalUrl 271 } 272 273 fmt.Printf("Fetched %s\n", currentUrl) 274 275 forwardGeminiLinks := gemtextparser.ParseLinks(body, currentUrl) 276 277 // TODO: We should check to make sure the filename isn't too long before we get to this point 278 // otherwise we have an annoying db inconsistency. 279 // I noticed this when performing indexing and it said "file name too long" which means the path 280 // was inserted into the db, but presumably nothing was written to disk because it'd've been too long 281 // also, there is a generally worrying incosistency that occurs when writing to disk and using sqlite 282 // without some sort of locking mechanism. 283 284 // TODO: When the meta != text/gemini, what happens? I'm seeing a bunch of files that are empty; this 285 // isn't really what i'd expect, I'd expect some short circuiting. 286 287 outputLocation, err := writeSiteOutput(OUTPUT_DIR, body, currentUrl) 288 289 if err != nil { 290 panic(err) 291 } 292 293 tx, _ := db.Begin() 294 295 _, err = tx.Exec(insertPageQuery, currentUrl, outputLocation) 296 297 if err != nil { 298 panic(err) 299 } 300 301 for _, link := range forwardGeminiLinks { 302 303 _, err := tx.Exec(insertLinkQuery, link) 304 305 if err != nil { 306 panic(err) 307 } 308 309 } 310 tx.Commit() 311 } 312 313 } 314 315 func main() { 316 317 db := setupDB(OUTPUT_DIR, DB_NAME) 318 defer db.Close() 319 320 base := "gemini://tlgs.one" 321 322 insertLinkQuery := "INSERT INTO query_queue (source, added_timestamp) VALUES (?, CURRENT_TIMESTAMP) ON CONFLICT DO UPDATE SET added_timestamp=CURRENT_TIMESTAMP" 323 324 // bootstrap 325 _, err := db.Exec(insertLinkQuery, base) 326 327 if err != nil { 328 panic(err) 329 } 330 331 var wg sync.WaitGroup 332 333 for i := 0; i < 20; i++ { 334 wg.Add(1) 335 go worker(db, &wg) 336 time.Sleep(time.Second * 5) // This allows the queue to be built up before dispatching to help with fresh runs 337 } 338 wg.Wait() 339 340 }