gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

main.go (6493B)


      1 package main
      2 
      3 import (
      4 	"database/sql"
      5 	"errors"
      6 	"fmt"
      7 	"geminisearch/gemtextparser"
      8 	_ "github.com/mattn/go-sqlite3"
      9 	"os"
     10 )
     11 
     12 // TODO: centralize this?
     13 var CRAWLED_DIR = "outputs/crawled"
     14 var INDEXED_DIR = "outputs/indexed"
     15 var CRAWLED_DB_NAME = "main.db"
     16 var INDEXED_DB_NAME = "main.db"
     17 
     18 // TODO: Probably return error instead of panic...
     19 func connectToDB(directory string, name string) *sql.DB {
     20 
     21 	if _, err := os.Stat(directory + "/" + name); errors.Is(err, os.ErrNotExist) {
     22 		panic(err)
     23 	}
     24 	db, err := sql.Open("sqlite3", directory+"/"+name)
     25 
     26 	if err != nil {
     27 		panic(err)
     28 	}
     29 
     30 	return db
     31 }
     32 
     33 func ensureIndexDB(dir string, filename string) {
     34 
     35 	os.MkdirAll(dir, 0755)
     36 	db, err := sql.Open("sqlite3", "./"+dir+"/"+filename)
     37 
     38 	if err != nil {
     39 		panic(err)
     40 	}
     41 
     42 	defer db.Close()
     43 
     44 	// There could be multiple links from the same source to the same destination, though that'd be kinda stupid
     45 	createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
     46 	createInvertedIndex := "CREATE TABLE IF NOT EXISTS inverted_index (term, url, count);"
     47 	createTermIndex := "CREATE INDEX IF NOT EXISTS term_idx ON inverted_index(term);"
     48 	createDocumentIndex := "CREATE INDEX IF NOT EXISTS url_idx ON inverted_index(url);"
     49 
     50 	// TODO: Add filesystem location for this (mv {old} {new} location)
     51 	createDocument := "CREATE TABLE IF NOT EXISTS document (url UNIQUE, indexed_timestamp datetime);"
     52 
     53 	_, err = db.Exec(createLink)
     54 
     55 	if err != nil {
     56 		panic(err)
     57 	}
     58 
     59 	_, err = db.Exec(createInvertedIndex)
     60 
     61 	if err != nil {
     62 		panic(err)
     63 	}
     64 
     65 	_, err = db.Exec(createTermIndex)
     66 
     67 	if err != nil {
     68 		panic(err)
     69 	}
     70 
     71 	_, err = db.Exec(createDocumentIndex)
     72 
     73 	if err != nil {
     74 		panic(err)
     75 	}
     76 
     77 	_, err = db.Exec(createDocument)
     78 
     79 	if err != nil {
     80 		panic(err)
     81 	}
     82 
     83 
     84 	db.Exec("PRAGMA journal_mode=WAL;")
     85 	db.Exec("PRAGMA busy_timeout=2000;")
     86 	db.Exec("PRAGMA synchronous=NORMAL;")
     87 
     88 }
     89 
     90 func getNextPath(dbCrawl *sql.DB) (string, string) {
     91 
     92 	// It's not guaranteed this site will be indexed because it's possible
     93 	// the process fails during that but is still removed from the db
     94 	// We could fix this with a locking mechanism, but I don't know how necessary that'd be
     95 
     96 	// TODO: Make sure the page from the filesystem is deleted after indexing.
     97 	popPage := `DELETE FROM page
     98 						WHERE path = (SELECT path FROM page ORDER BY added_timestamp ASC LIMIT 1)
     99 						RETURNING url, path`
    100 
    101 	selectedUrl := ""
    102 	selectedPath := ""
    103 
    104 	err := dbCrawl.QueryRow(popPage).Scan(&selectedUrl, &selectedPath)
    105 
    106 	if err == sql.ErrNoRows {
    107 		return "", ""
    108 	} else if err != nil {
    109 		panic(err)
    110 	}
    111 
    112 	return selectedUrl, selectedPath
    113 }
    114 
    115 func insertLinks(db *sql.DB, source string, destinations []string) error {
    116 	// createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
    117 
    118 	insertLinkQuery := "INSERT INTO link (source, destination, added_timestamp) VALUES (?, ?, CURRENT_TIMESTAMP)"
    119 	// TODO: Why is it not parsing error return?
    120 	tx, _ := db.Begin()
    121 
    122 	for _, current := range destinations {
    123 		_, err := tx.Exec(insertLinkQuery, source, current)
    124 
    125 		if err != nil {
    126 			return err
    127 		}
    128 	}
    129 
    130 	tx.Commit()
    131 	return nil
    132 }
    133 
    134 
    135 // document (url, indexed_timestamp datetime)"
    136 func insertDocument(db *sql.DB, url string) error {
    137 
    138 	// createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
    139 
    140 	// we don't do on conflict resolution because data sould be deleted before insertion
    141 	// although... it's maybe possible that two threads could try indexing and re-indexing a site
    142 	// at the same time, but seems incredibly unlikely.
    143 
    144 	insertDocumentQuery := "INSERT INTO document (url, indexed_timestamp) VALUES (?, CURRENT_TIMESTAMP)"
    145 
    146 	// TODO: Why is it not parsing error return?
    147 
    148 	tx, _ := db.Begin()
    149 
    150 	_, err := tx.Exec(insertDocumentQuery, url)
    151 
    152 	if err != nil {
    153 		return err
    154 	}
    155 
    156 	tx.Commit()
    157 	return nil
    158 }
    159 
    160 func insertTerms(db *sql.DB, url string, terms []string) error {
    161 
    162 	insertTerm := "INSERT INTO inverted_index (term, url, count) VALUES (?, ?, ?)"
    163 
    164 	termMap := make(map[string]int)
    165 
    166 	for _, term := range terms {
    167 		termMap[term]++
    168 	}
    169 
    170 	tx, _ := db.Begin()
    171 	for key, value := range termMap {
    172 		_, err := tx.Exec(insertTerm, key, url, value)
    173 
    174 		if err != nil {
    175 			return err
    176 		}
    177 	}
    178 
    179 	tx.Commit()
    180 	return nil
    181 																					
    182 }
    183 
    184 
    185 // 	createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
    186 // 	createInvertedIndex := "CREATE TABLE IF NOT EXISTS inverted_index (term, url, count);"
    187 // 	createTermIndex := "CREATE INDEX IF NOT EXISTS term_idx ON inverted_index(term);"
    188 // 	createDocumentIndex := "CREATE INDEX IF NOT EXISTS url_idx ON inverted_index(url);"
    189 // 	createDocument := "CREATE TABLE IF NOT EXISTS document (url UNIQUE, indexed_timestamp datetime);"
    190 
    191 func ensureNotIndexed(url string, db *sql.DB) {
    192 	deleteDocument := "DELETE FROM document WHERE url = ?"
    193 	deleteTerm := "DELETE FROM inverted_index WHERE url = ?"
    194 	deleteLinks := "DELETE FROM link WHERE source = ?"
    195 
    196 	// TODO: This should probably be one transaction in case of crashes and such.
    197 	_, err := db.Exec(deleteDocument, url)
    198 	if err != nil {
    199 		panic(err)
    200 	}
    201 	_, err = db.Exec(deleteTerm, url)
    202 	if err != nil {
    203 		panic(err)
    204 	}
    205 	_, err = db.Exec(deleteLinks, url)
    206 	if err != nil {
    207 		panic(err)
    208 	}
    209 }
    210 
    211 func main() {
    212 
    213 	dbCrawl := connectToDB(CRAWLED_DIR, CRAWLED_DB_NAME)
    214 	defer dbCrawl.Close()
    215 
    216 	ensureIndexDB(INDEXED_DIR, INDEXED_DB_NAME)
    217 	dbIndex := connectToDB(INDEXED_DIR, INDEXED_DB_NAME)
    218 	defer dbIndex.Close()
    219 
    220 	for true {
    221 
    222 		selectedUrl, selectedPath := getNextPath(dbCrawl)
    223 		if selectedUrl == "" && selectedPath == "" {
    224 			fmt.Println("No more documents do index, exiting gracefully.")
    225 			return
    226 		}
    227 		ensureNotIndexed(selectedUrl, dbIndex)
    228 
    229 		fmt.Printf("Indexing url: %s\nIndexing path: %s\n", selectedUrl, selectedPath)
    230 
    231 		bodyBytes, err := os.ReadFile(selectedPath)
    232 
    233 		if err != nil {
    234 			panic(err)
    235 		}
    236 
    237 		body := string(bodyBytes)
    238 
    239 		links := gemtextparser.ParseLinks(body, selectedUrl)
    240 		fmt.Printf("Links: %v\n", links)
    241 		err = insertLinks(dbIndex, selectedUrl, links)
    242 
    243 
    244 		if err != nil {
    245 			panic(err)
    246 		}
    247 
    248 		// TODO: This should be in the same transaction as the link insertions and term insertions
    249 		err = insertDocument(dbIndex, selectedUrl)
    250 
    251 
    252 		if err != nil {
    253 			panic(err)
    254 		}
    255 
    256 		words := gemtextparser.StemmedDocument(body)
    257 		err = insertTerms(dbIndex, selectedUrl, words)
    258 
    259 		if err != nil {
    260 			panic(err)
    261 		}
    262 
    263 
    264 		for _, word := range words {
    265 			// TODO: Setup inverted index
    266 			fmt.Printf("%s,", word)
    267 		}
    268 	}
    269 
    270 }