gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

commit 510179519c68ffa2fca7d03aa83172a761914742
parent 032781954f3de4b21eecd6199133d97f88ceadee
Author: Andrew Laack <andrew@laack.co>
Date:   Wed,  6 May 2026 17:27:38 -0500

Basic inverted index

Diffstat:
M crawl/main.go | 2 +-
M index/main.go | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
2 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/crawl/main.go b/crawl/main.go
@@ -193,7 +193,7 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string,
 	// this is presumptuous, albeit quite standard.
 	// main reason for this is to have uniquely identifying names for files.
-	// unfortunately no way to create a file with an empty filename.
+	// unfortunately no way to create a file with an empty filename.
 	// this can result in ambiguity between {site}/index.gmi and {site}, but we shall assume they are the same.
 	filename = "index.gmi"
diff --git a/index/main.go b/index/main.go
@@ -43,6 +43,8 @@ func ensureIndexDB(dir string, filename string) {
 	// There could be multiple links from the same source to the same destination, though that'd be kinda stupid
 	createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
+	createInvertedIndex := "CREATE TABLE IF NOT EXISTS inverted_index (term, url, count);"
+	createTermIndex := "CREATE INDEX IF NOT EXISTS term_idx ON inverted_index(term);"
 	_, err = db.Exec(createLink)
@@ -50,6 +52,19 @@ func ensureIndexDB(dir string, filename string) {
 		panic(err)
 	}
+	_, err = db.Exec(createInvertedIndex)
+
+	if err != nil {
+		panic(err)
+	}
+
+	_, err = db.Exec(createTermIndex)
+
+	if err != nil {
+		panic(err)
+	}
+
+
 	db.Exec("PRAGMA journal_mode=WAL;")
 	db.Exec("PRAGMA busy_timeout=2000;")
 	db.Exec("PRAGMA synchronous=NORMAL;")
@@ -62,6 +77,7 @@ func getNextPath(dbCrawl *sql.DB) (string, string) {
 	// the process fails during that but is still removed from the db
 	// We could fix this with a locking mechanism, but I don't know how necessary that'd be
+	// TODO: Make sure the page from the filesystem is deleted after indexing.
 	popPage := `DELETE FROM page WHERE path = (SELECT path FROM page ORDER BY added_timestamp ASC LIMIT 1) RETURNING url, path`
@@ -71,6 +87,7 @@ func getNextPath(dbCrawl *sql.DB) (string, string) {
 	err := dbCrawl.QueryRow(popPage).Scan(&selectedUrl, &selectedPath)
+	// TODO: Handle out of things to index gracefully
 	if err != nil {
 		panic(err)
 	}
@@ -82,7 +99,6 @@ func insertLinks(db *sql.DB, source string, destinations []string) error {
 	// createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
 	insertLinkQuery := "INSERT INTO link (source, destination, added_timestamp) VALUES (?, ?, CURRENT_TIMESTAMP)"
-	// TODO: Why is it not parsing error return?
 	tx, _ := db.Begin()
@@ -98,6 +114,30 @@ func insertLinks(db *sql.DB, source string, destinations []string) error {
 	return nil
 }
+func insertTerms(db *sql.DB, url string, terms []string) error {
+
+	insertTerm := "INSERT INTO inverted_index (term, url, count) VALUES (?, ?, ?)"
+
+	termMap := make(map[string]int)
+
+	for _, term := range terms {
+		termMap[term]++
+	}
+
+	tx, _ := db.Begin()
+	for key, value := range termMap {
+		_, err := tx.Exec(insertTerm, key, url, value)
+
+		if err != nil {
+			return err
+		}
+	}
+
+	tx.Commit()
+	return nil
+
+}
+
 func main() {
 	dbCrawl := connectToDB(CRAWLED_DIR, CRAWLED_DB_NAME)
@@ -107,31 +147,40 @@ func main() {
 	dbIndex := connectToDB(INDEXED_DIR, INDEXED_DB_NAME)
 	defer dbIndex.Close()
-	selectedUrl, selectedPath := getNextPath(dbCrawl)
+	for true {
-	fmt.Printf("Indexing url: %s\nIndexing path: %s\n", selectedUrl, selectedPath)
+		selectedUrl, selectedPath := getNextPath(dbCrawl)
-	bodyBytes, err := os.ReadFile(selectedPath)
+		fmt.Printf("Indexing url: %s\nIndexing path: %s\n", selectedUrl, selectedPath)
-	if err != nil {
-		panic(err)
-	}
+		bodyBytes, err := os.ReadFile(selectedPath)
+
+		if err != nil {
+			panic(err)
+		}
-	body := string(bodyBytes)
+		body := string(bodyBytes)
-	links := gemtextparser.ParseLinks(body, selectedUrl)
-	fmt.Printf("Links: %v\n", links)
-	err = insertLinks(dbIndex, selectedUrl, links)
+		links := gemtextparser.ParseLinks(body, selectedUrl)
+		fmt.Printf("Links: %v\n", links)
+		err = insertLinks(dbIndex, selectedUrl, links)
-	if err != nil {
-		panic(err)
-	}
+		if err != nil {
+			panic(err)
+		}
-	words := gemtextparser.StemmedDocument(body)
+		words := gemtextparser.StemmedDocument(body)
+		err = insertTerms(dbIndex, selectedUrl, words)
-	for _, word := range words {
-		// TODO: Setup inverted index
-		fmt.Printf("%s,", word)
+		if err != nil {
+			panic(err)
+		}
+
+
+		for _, word := range words {
+			// TODO: Setup inverted index
+			fmt.Printf("%s,", word)
+		}
 	}
 }