main.go (6493B)
1 package main 2 3 import ( 4 "database/sql" 5 "errors" 6 "fmt" 7 "geminisearch/gemtextparser" 8 _ "github.com/mattn/go-sqlite3" 9 "os" 10 ) 11 12 // TODO: centralize this? 13 var CRAWLED_DIR = "outputs/crawled" 14 var INDEXED_DIR = "outputs/indexed" 15 var CRAWLED_DB_NAME = "main.db" 16 var INDEXED_DB_NAME = "main.db" 17 18 // TODO: Probably return error instead of panic... 19 func connectToDB(directory string, name string) *sql.DB { 20 21 if _, err := os.Stat(directory + "/" + name); errors.Is(err, os.ErrNotExist) { 22 panic(err) 23 } 24 db, err := sql.Open("sqlite3", directory+"/"+name) 25 26 if err != nil { 27 panic(err) 28 } 29 30 return db 31 } 32 33 func ensureIndexDB(dir string, filename string) { 34 35 os.MkdirAll(dir, 0755) 36 db, err := sql.Open("sqlite3", "./"+dir+"/"+filename) 37 38 if err != nil { 39 panic(err) 40 } 41 42 defer db.Close() 43 44 // There could be multiple links from the same source to the same destination, though that'd be kinda stupid 45 createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);" 46 createInvertedIndex := "CREATE TABLE IF NOT EXISTS inverted_index (term, url, count);" 47 createTermIndex := "CREATE INDEX IF NOT EXISTS term_idx ON inverted_index(term);" 48 createDocumentIndex := "CREATE INDEX IF NOT EXISTS url_idx ON inverted_index(url);" 49 50 // TODO: Add filesystem location for this (mv {old} {new} location) 51 createDocument := "CREATE TABLE IF NOT EXISTS document (url UNIQUE, indexed_timestamp datetime);" 52 53 _, err = db.Exec(createLink) 54 55 if err != nil { 56 panic(err) 57 } 58 59 _, err = db.Exec(createInvertedIndex) 60 61 if err != nil { 62 panic(err) 63 } 64 65 _, err = db.Exec(createTermIndex) 66 67 if err != nil { 68 panic(err) 69 } 70 71 _, err = db.Exec(createDocumentIndex) 72 73 if err != nil { 74 panic(err) 75 } 76 77 _, err = db.Exec(createDocument) 78 79 if err != nil { 80 panic(err) 81 } 82 83 84 db.Exec("PRAGMA journal_mode=WAL;") 85 db.Exec("PRAGMA busy_timeout=2000;") 86 db.Exec("PRAGMA synchronous=NORMAL;") 87 88 } 89 90 func getNextPath(dbCrawl *sql.DB) (string, string) { 91 92 // It's not guaranteed this site will be indexed because it's possible 93 // the process fails during that but is still removed from the db 94 // We could fix this with a locking mechanism, but I don't know how necessary that'd be 95 96 // TODO: Make sure the page from the filesystem is deleted after indexing. 97 popPage := `DELETE FROM page 98 WHERE path = (SELECT path FROM page ORDER BY added_timestamp ASC LIMIT 1) 99 RETURNING url, path` 100 101 selectedUrl := "" 102 selectedPath := "" 103 104 err := dbCrawl.QueryRow(popPage).Scan(&selectedUrl, &selectedPath) 105 106 if err == sql.ErrNoRows { 107 return "", "" 108 } else if err != nil { 109 panic(err) 110 } 111 112 return selectedUrl, selectedPath 113 } 114 115 func insertLinks(db *sql.DB, source string, destinations []string) error { 116 // createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);" 117 118 insertLinkQuery := "INSERT INTO link (source, destination, added_timestamp) VALUES (?, ?, CURRENT_TIMESTAMP)" 119 // TODO: Why is it not parsing error return? 120 tx, _ := db.Begin() 121 122 for _, current := range destinations { 123 _, err := tx.Exec(insertLinkQuery, source, current) 124 125 if err != nil { 126 return err 127 } 128 } 129 130 tx.Commit() 131 return nil 132 } 133 134 135 // document (url, indexed_timestamp datetime)" 136 func insertDocument(db *sql.DB, url string) error { 137 138 // createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);" 139 140 // we don't do on conflict resolution because data sould be deleted before insertion 141 // although... it's maybe possible that two threads could try indexing and re-indexing a site 142 // at the same time, but seems incredibly unlikely. 143 144 insertDocumentQuery := "INSERT INTO document (url, indexed_timestamp) VALUES (?, CURRENT_TIMESTAMP)" 145 146 // TODO: Why is it not parsing error return? 147 148 tx, _ := db.Begin() 149 150 _, err := tx.Exec(insertDocumentQuery, url) 151 152 if err != nil { 153 return err 154 } 155 156 tx.Commit() 157 return nil 158 } 159 160 func insertTerms(db *sql.DB, url string, terms []string) error { 161 162 insertTerm := "INSERT INTO inverted_index (term, url, count) VALUES (?, ?, ?)" 163 164 termMap := make(map[string]int) 165 166 for _, term := range terms { 167 termMap[term]++ 168 } 169 170 tx, _ := db.Begin() 171 for key, value := range termMap { 172 _, err := tx.Exec(insertTerm, key, url, value) 173 174 if err != nil { 175 return err 176 } 177 } 178 179 tx.Commit() 180 return nil 181 182 } 183 184 185 // createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);" 186 // createInvertedIndex := "CREATE TABLE IF NOT EXISTS inverted_index (term, url, count);" 187 // createTermIndex := "CREATE INDEX IF NOT EXISTS term_idx ON inverted_index(term);" 188 // createDocumentIndex := "CREATE INDEX IF NOT EXISTS url_idx ON inverted_index(url);" 189 // createDocument := "CREATE TABLE IF NOT EXISTS document (url UNIQUE, indexed_timestamp datetime);" 190 191 func ensureNotIndexed(url string, db *sql.DB) { 192 deleteDocument := "DELETE FROM document WHERE url = ?" 193 deleteTerm := "DELETE FROM inverted_index WHERE url = ?" 194 deleteLinks := "DELETE FROM link WHERE source = ?" 195 196 // TODO: This should probably be one transaction in case of crashes and such. 197 _, err := db.Exec(deleteDocument, url) 198 if err != nil { 199 panic(err) 200 } 201 _, err = db.Exec(deleteTerm, url) 202 if err != nil { 203 panic(err) 204 } 205 _, err = db.Exec(deleteLinks, url) 206 if err != nil { 207 panic(err) 208 } 209 } 210 211 func main() { 212 213 dbCrawl := connectToDB(CRAWLED_DIR, CRAWLED_DB_NAME) 214 defer dbCrawl.Close() 215 216 ensureIndexDB(INDEXED_DIR, INDEXED_DB_NAME) 217 dbIndex := connectToDB(INDEXED_DIR, INDEXED_DB_NAME) 218 defer dbIndex.Close() 219 220 for true { 221 222 selectedUrl, selectedPath := getNextPath(dbCrawl) 223 if selectedUrl == "" && selectedPath == "" { 224 fmt.Println("No more documents do index, exiting gracefully.") 225 return 226 } 227 ensureNotIndexed(selectedUrl, dbIndex) 228 229 fmt.Printf("Indexing url: %s\nIndexing path: %s\n", selectedUrl, selectedPath) 230 231 bodyBytes, err := os.ReadFile(selectedPath) 232 233 if err != nil { 234 panic(err) 235 } 236 237 body := string(bodyBytes) 238 239 links := gemtextparser.ParseLinks(body, selectedUrl) 240 fmt.Printf("Links: %v\n", links) 241 err = insertLinks(dbIndex, selectedUrl, links) 242 243 244 if err != nil { 245 panic(err) 246 } 247 248 // TODO: This should be in the same transaction as the link insertions and term insertions 249 err = insertDocument(dbIndex, selectedUrl) 250 251 252 if err != nil { 253 panic(err) 254 } 255 256 words := gemtextparser.StemmedDocument(body) 257 err = insertTerms(dbIndex, selectedUrl, words) 258 259 if err != nil { 260 panic(err) 261 } 262 263 264 for _, word := range words { 265 // TODO: Setup inverted index 266 fmt.Printf("%s,", word) 267 } 268 } 269 270 }