gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

main.go (8637B)


      1 package main
      2 
      3 import (
      4 	"database/sql"
      5 	"errors"
      6 	"fmt"
      7 	"geminisearch/gemtextparser"
      8 	"github.com/makeworld-the-better-one/go-gemini"
      9 	_ "github.com/mattn/go-sqlite3"
     10 	"io"
     11 	"net/url"
     12 	"os"
     13 	"strings"
     14 	"sync"
     15 	"time"
     16 )
     17 
     18 var OUTPUT_DIR = "outputs/crawled"
     19 var DB_NAME = "main.db"
     20 
     21 var NotGemtextError = errors.New("Meta stated resource is not gemtext.")
     22 var TooManyRedirects = errors.New("Too many redirects.")
     23 var InteractiveStatusCode = errors.New("Interactive status code received.")
     24 var TempFailureStatusCode = errors.New("Temporary failure status code received.")
     25 var FailureStatusCode = errors.New("Failure status code received.")
     26 var CertStatusCode = errors.New("Certificate related status code received.")
     27 
     28 // TODO: Respect robots.txt (does this go here? Does this go on insertion for querying?)
     29 // TODO: Limit the size of the response body that gets read; an oversized response could exhaust RAM
     30 
     31 func fetchGemtext(currentUrl string) (string, string, error) {
     32 
     33 	client := &gemini.Client{ConnectTimeout: 5 * time.Second}
     34 	resp, err := client.Fetch(currentUrl)
     35 
     36 	if err != nil {
     37 		return "", "", err
     38 	}
     39 
     40 	redirects := 0
     41 
     42 	for resp.Status/10 == 3 && redirects < 5 {
     43 		redirects += 1
     44 
     45 		fmt.Printf("redirect count: %d, original url: %s\n", redirects, currentUrl)
     46 
     47 		base, err := url.Parse(currentUrl)
     48 
     49 		if err != nil {
     50 			return "", "", err
     51 		}
     52 
     53 		redirect, err := url.Parse(resp.Meta)
     54 
     55 		if err != nil {
     56 			return "", "", err
     57 		}
     58 
     59 		currentUrl = base.ResolveReference(redirect).String()
     60 
     61 		fmt.Printf("redirect count: %d, resulting url: %s\n", redirects, currentUrl)
     62 
     63 		resp, err = client.Fetch(currentUrl)
     64 
     65 		if err != nil {
     66 			return "", "", err
     67 		}
     68 	}
     69 
     70 	fmt.Printf("Final URL: %s\n", currentUrl)
     71 
     72 	if resp.Status/10 == 3 {
     73 		return "", "", TooManyRedirects
     74 	}
     75 
     76 	// Followed redirects first, then check the rest of the status codes just in case
     77 	// https://github.com/AyrA/Gemini/blob/master/Protocol.md
     78 	// 1X are interactive
     79 
     80 	if resp.Status/10 == 1 {
     81 		return "", "", InteractiveStatusCode
     82 	}
     83 
     84 	if resp.Status/10 == 4 {
     85 		return "", "", TempFailureStatusCode
     86 	}
     87 	if resp.Status/10 == 5 {
     88 		return "", "", FailureStatusCode
     89 	}
     90 	if resp.Status/10 == 6 {
     91 		return "", "", CertStatusCode
     92 	}
     93 
     94 	// Only success (20) should fall through at this point.
     95 
     96 	meta := resp.Meta
     97 
     98 	fmt.Println(meta)
     99 
    100 	if !strings.Contains(meta, "text/gemini") {
    101 		err = NotGemtextError
    102 		fmt.Printf("Non-gemtext type: %s\n", meta)
    103 		return "", "", err
    104 	}
    105 
    106 	bodyBytes, err := io.ReadAll(resp.Body)
    107 
    108 	if err != nil {
    109 
    110 		// saw this happen with
    111 		// read tcp {local_ip}:40396->{remote_ip}:1965: read: connection reset by peer
    112 
    113 		return "", "", err
    114 	}
    115 
    116 	body := string(bodyBytes)
    117 
    118 	// CurrentURL reflects the url after redirects and such applied.
    119 	return body, currentUrl, nil
    120 
    121 }
    122 
    123 func setupDB(dir string, filename string) *sql.DB {
    124 
    125 	os.MkdirAll(dir, 0755)
    126 	db, err := sql.Open("sqlite3", "./"+dir+"/"+filename)
    127 
    128 	if err != nil {
    129 		panic(err)
    130 	}
    131 
    132 	createLink := "CREATE TABLE IF NOT EXISTS query_queue (source UNIQUE, added_timestamp datetime);"
    133 	createPage := "CREATE TABLE IF NOT EXISTS page (url UNIQUE, path, added_timestamp datetime);"
    134 
    135 	_, err = db.Exec(createLink)
    136 
    137 	if err != nil {
    138 		panic(err)
    139 	}
    140 
    141 	_, err = db.Exec(createPage)
    142 
    143 	if err != nil {
    144 		panic(err)
    145 	}
    146 
    147 	db.Exec("PRAGMA journal_mode=WAL;")
    148 	db.Exec("PRAGMA busy_timeout=2000;") // :( is it so soon the migration to postgresql is initiated?
    149 	db.Exec("PRAGMA synchronous=NORMAL;")
    150 
    151 	return db
    152 }
    153 
    154 func missingFilename(path string) bool {
    155 	return strings.Compare(path, "") == 0 || strings.Compare(path, "/") == 0
    156 }
    157 
    158 // The reason I chose to do {host}/{escaped_path} instead of a UUID or just {escaped_url} is to
    159 // have a better hierarchy for looking at files that are downloaded.
    160 
    161 // I could've done a more deeply nested structure, but cases like the following become annoying:
    162 // {site}/blog
    163 // {site}/blog/entry.gmi
    164 // I could probably assume the first one is /blog/index.gmi, but since the '.' character is valid in urls
    165 // there doesn't seem to be a reliable way to differentiate between directories and files in paths.
    166 // This is to say, different server implementation may do weird stuff, and we shouldn't make assumptions about
    167 // that sort of thing. I also don't want to limit myself to **just** gemtext so this seems good enough for now.
    168 
    169 func writeSiteOutput(outputDir string, body string, currentUrl string) (string, error) {
    170 
    171 	u, err := url.Parse(currentUrl)
    172 
    173 	if err != nil {
    174 		panic(err)
    175 	}
    176 
    177 	if !u.IsAbs() {
    178 		return "", errors.New("Must use absolute paths.")
    179 	}
    180 
    181 	// This sometimes seems to include a leading '/' so drop that, no need, directory
    182 	// hierarchy gives this information to us on the fs.
    183 
    184 	filename := u.EscapedPath()
    185 
    186 	if len(filename) > 1 && filename[0] == '/' {
    187 		filename = filename[1:]
    188 	}
    189 
    190 	dirname := u.Host // include subdomains
    191 
    192 	if missingFilename(filename) {
    193 
    194 		// this is presumptuous, albeit quite standard.
    195 		// main reason for this is to have uniquely identifying names for files.
    196 		// unfortunately no way to create a file with an empty filename. 
    197 		// this can result in ambiguity between {site}/index.gmi and {site}, but we shall assume they are the same.
    198 
    199 		filename = "index.gmi"
    200 	}
    201 
    202 	filename = url.PathEscape(filename)
    203 
    204 	outputDirectory := outputDir + "/" + "sites" + "/" + dirname + "/"
    205 	outputPath := outputDirectory + filename
    206 
    207 	os.MkdirAll(outputDirectory, 0755)
    208 
    209 	// verified correct for ext4, see here for details:
    210 	// https://serverfault.com/questions/9546/filename-length-limits-on-linux
    211 
    212 	if len(filename) <= 255 {
    213 		err = os.WriteFile(outputPath, []byte(body), 0644)
    214 	} else {
    215 		fmt.Println("Skipping file creation; filename too long")
    216 	}
    217 
    218 	if err != nil {
    219 		panic(err)
    220 	}
    221 
    222 	return outputPath, nil
    223 }
    224 
    225 func getNext(db *sql.DB) string {
    226 
    227 	link := ""
    228 
    229 	popLinkQuery := `	DELETE FROM query_queue
    230 						WHERE source = (SELECT source FROM query_queue ORDER BY added_timestamp ASC LIMIT 1)
    231 						RETURNING source`
    232 
    233 	err := db.QueryRow(popLinkQuery).Scan(&link)
    234 
    235 	if err == sql.ErrNoRows {
    236 		return ""
    237 	} else if err != nil {
    238 		panic(err)
    239 	}
    240 
    241 	return link
    242 }
    243 
    244 func worker(db *sql.DB, wg *sync.WaitGroup) {
    245 	defer wg.Done()
    246 
    247 	insertPageQuery := "INSERT OR REPLACE INTO page (url, path, added_timestamp) VALUES (?, ?, CURRENT_TIMESTAMP)"
    248 	insertLinkQuery := "INSERT INTO query_queue (source, added_timestamp) VALUES (?, CURRENT_TIMESTAMP) ON CONFLICT DO UPDATE SET added_timestamp=CURRENT_TIMESTAMP"
    249 	for true {
    250 
    251 		currentUrl := getNext(db)
    252 
    253 		if currentUrl == "" {
    254 			fmt.Printf("No links remaining.\n")
    255 			return
    256 		}
    257 		fmt.Printf("Querying: %s\n", currentUrl)
    258 
    259 		body, finalUrl, err := fetchGemtext(currentUrl)
    260 
    261 		if err == NotGemtextError {
    262 			fmt.Println("Not gemtext... Continuing")
    263 		} else if err != nil {
    264 			fmt.Printf("Unable to fetch %s: %s\n", currentUrl, err)
    265 			continue
    266 		}
    267 
    268 		if finalUrl != "" && finalUrl != currentUrl {
    269 			fmt.Printf("Updating url to reflect redirects: %s => %s\n", currentUrl, finalUrl)
    270 			currentUrl = finalUrl
    271 		}
    272 
    273 		fmt.Printf("Fetched %s\n", currentUrl)
    274 
    275 		forwardGeminiLinks := gemtextparser.ParseLinks(body, currentUrl)
    276 
    277 		// TODO: We should check to make sure the filename isn't too long before we get to this point
    278 		// otherwise we have an annoying db inconsistency. 
    279 		// I noticed this when performing indexing and it said "file name too long" which means the path
    280 		// was inserted into the db, but presumably nothing was written to disk because it'd've been too long
    281 		// also, there is a generally worrying incosistency that occurs when writing to disk and using sqlite
    282 		// without some sort of locking mechanism. 
    283 
    284 		// TODO: When the meta != text/gemini, what happens? I'm seeing a bunch of files that are empty; this 
    285 		// isn't really what i'd expect, I'd expect some short circuiting.
    286 
    287 		outputLocation, err := writeSiteOutput(OUTPUT_DIR, body, currentUrl)
    288 
    289 		if err != nil {
    290 			panic(err)
    291 		}
    292 
    293 		tx, _ := db.Begin()
    294 
    295 		_, err = tx.Exec(insertPageQuery, currentUrl, outputLocation)
    296 
    297 		if err != nil {
    298 			panic(err)
    299 		}
    300 
    301 		for _, link := range forwardGeminiLinks {
    302 
    303 			_, err := tx.Exec(insertLinkQuery, link)
    304 
    305 			if err != nil {
    306 				panic(err)
    307 			}
    308 
    309 		}
    310 		tx.Commit()
    311 	}
    312 
    313 }
    314 
    315 func main() {
    316 
    317 	db := setupDB(OUTPUT_DIR, DB_NAME)
    318 	defer db.Close()
    319 
    320 	base := "gemini://tlgs.one"
    321 
    322 	insertLinkQuery := "INSERT INTO query_queue (source, added_timestamp) VALUES (?, CURRENT_TIMESTAMP) ON CONFLICT DO UPDATE SET added_timestamp=CURRENT_TIMESTAMP"
    323 
    324 	// bootstrap
    325 	_, err := db.Exec(insertLinkQuery, base)
    326 
    327 	if err != nil {
    328 		panic(err)
    329 	}
    330 
    331 	var wg sync.WaitGroup
    332 
    333 	for i := 0; i < 20; i++ {
    334 		wg.Add(1)
    335 		go worker(db, &wg)
    336 		time.Sleep(time.Second * 5) // This allows the queue to be built up before dispatching to help with fresh runs
    337 	}
    338 	wg.Wait()
    339 
    340 }