gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

commit bdfe38a18bdcf7dc4ac5427c1d9f0273111f665f
parent 510179519c68ffa2fca7d03aa83172a761914742
Author: Andrew Laack <andrew@laack.co>
Date:   Wed,  6 May 2026 18:59:52 -0500

Improved indexing

Diffstat:
M index/main.go | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 86 insertions(+), 2 deletions(-)

diff --git a/index/main.go b/index/main.go
@@ -45,6 +45,10 @@ func ensureIndexDB(dir string, filename string) {
 	createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
 	createInvertedIndex := "CREATE TABLE IF NOT EXISTS inverted_index (term, url, count);"
 	createTermIndex := "CREATE INDEX IF NOT EXISTS term_idx ON inverted_index(term);"
+	createDocumentIndex := "CREATE INDEX IF NOT EXISTS url_idx ON inverted_index(url);"
+
+	// TODO: Add filesystem location for this (mv {old} {new} location)
+	createDocument := "CREATE TABLE IF NOT EXISTS document (url UNIQUE, indexed_timestamp datetime);"
 
 	_, err = db.Exec(createLink)
 
@@ -64,6 +68,18 @@ func ensureIndexDB(dir string, filename string) {
 		panic(err)
 	}
 
+	_, err = db.Exec(createDocumentIndex)
+
+	if err != nil {
+		panic(err)
+	}
+
+	_, err = db.Exec(createDocument)
+
+	if err != nil {
+		panic(err)
+	}
+
 	db.Exec("PRAGMA journal_mode=WAL;")
 	db.Exec("PRAGMA busy_timeout=2000;")
 
@@ -87,8 +103,9 @@ func getNextPath(dbCrawl *sql.DB) (string, string) {
 
 	err := dbCrawl.QueryRow(popPage).Scan(&selectedUrl, &selectedPath)
 
-	// TODO: Handle out of things to index gracefully
-	if err != nil {
+	if err == sql.ErrNoRows {
+		return "", ""
+	} else if err != nil {
 		panic(err)
 	}
 
@@ -114,6 +131,32 @@ func insertLinks(db *sql.DB, source string, destinations []string) error {
 	return nil
 }
 
+
+// document (url, indexed_timestamp datetime)"
+func insertDocument(db *sql.DB, url string) error {
+
+	// createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
+
+	// we don't do on conflict resolution because data sould be deleted before insertion
+	// although... it's maybe possible that two threads could try indexing and re-indexing a site
+	// at the same time, but seems incredibly unlikely.
+
+	insertDocumentQuery := "INSERT INTO document (url, indexed_timestamp) VALUES (?, CURRENT_TIMESTAMP)"
+
+	// TODO: Why is it not parsing error return?
+
+	tx, _ := db.Begin()
+
+	_, err := tx.Exec(insertDocumentQuery, url)
+
+	if err != nil {
+		return err
+	}
+
+	tx.Commit()
+	return nil
+}
+
 func insertTerms(db *sql.DB, url string, terms []string) error {
 
 	insertTerm := "INSERT INTO inverted_index (term, url, count) VALUES (?, ?, ?)"
@@ -138,6 +181,33 @@ func insertTerms(db *sql.DB, url string, terms []string) error {
 
 }
 
+
+// createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
+// createInvertedIndex := "CREATE TABLE IF NOT EXISTS inverted_index (term, url, count);"
+// createTermIndex := "CREATE INDEX IF NOT EXISTS term_idx ON inverted_index(term);"
+// createDocumentIndex := "CREATE INDEX IF NOT EXISTS url_idx ON inverted_index(url);"
+// createDocument := "CREATE TABLE IF NOT EXISTS document (url UNIQUE, indexed_timestamp datetime);"
+
+func ensureNotIndexed(url string, db *sql.DB) {
+	deleteDocument := "DELETE FROM document WHERE url = ?"
+	deleteTerm := "DELETE FROM inverted_index WHERE url = ?"
+	deleteLinks := "DELETE FROM link WHERE source = ?"
+
+	// TODO: This should probably be one transaction in case of crashes and such.
+	_, err := db.Exec(deleteDocument, url)
+	if err != nil {
+		panic(err)
+	}
+	_, err = db.Exec(deleteTerm, url)
+	if err != nil {
+		panic(err)
+	}
+	_, err = db.Exec(deleteLinks, url)
+	if err != nil {
+		panic(err)
+	}
+}
+
 func main() {
 
 	dbCrawl := connectToDB(CRAWLED_DIR, CRAWLED_DB_NAME)
@@ -150,6 +220,11 @@ func main() {
 	for true {
 
 		selectedUrl, selectedPath := getNextPath(dbCrawl)
+		if selectedUrl == "" && selectedPath == "" {
+			fmt.Println("No more documents do index, exiting gracefully.")
+			return
+		}
+		ensureNotIndexed(selectedUrl, dbIndex)
 
 		fmt.Printf("Indexing url: %s\nIndexing path: %s\n", selectedUrl, selectedPath)
 
@@ -165,6 +240,15 @@ func main() {
 		fmt.Printf("Links: %v\n", links)
 		err = insertLinks(dbIndex, selectedUrl, links)
 
+
+		if err != nil {
+			panic(err)
+		}
+
+		// TODO: This should be in the same transaction as the link insertions and term insertions
+		err = insertDocument(dbIndex, selectedUrl)
+
+
 		if err != nil {
 			panic(err)
 		}