commit 510179519c68ffa2fca7d03aa83172a761914742
parent 032781954f3de4b21eecd6199133d97f88ceadee
Author: Andrew Laack <andrew@laack.co>
Date: Wed, 6 May 2026 17:27:38 -0500
Basic inverted index
Diffstat:
2 files changed, 68 insertions(+), 19 deletions(-)
diff --git a/crawl/main.go b/crawl/main.go
@@ -193,7 +193,7 @@ func writeSiteOutput(outputDir string, body string, currentUrl string) (string,
// this is presumptuous, albeit quite standard.
// main reason for this is to have uniquely identifying names for files.
- // unfortunately no way to create a file with an empty filename.
+ // unfortunately no way to create a file with an empty filename.
// this can result in ambiguity between {site}/index.gmi and {site}, but we shall assume they are the same.
filename = "index.gmi"
diff --git a/index/main.go b/index/main.go
@@ -43,6 +43,8 @@ func ensureIndexDB(dir string, filename string) {
// There could be multiple links from the same source to the same destination, though that'd be kinda stupid
createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
+ createInvertedIndex := "CREATE TABLE IF NOT EXISTS inverted_index (term, url, count);"
+ createTermIndex := "CREATE INDEX IF NOT EXISTS term_idx ON inverted_index(term);"
_, err = db.Exec(createLink)
@@ -50,6 +52,19 @@ func ensureIndexDB(dir string, filename string) {
panic(err)
}
+ _, err = db.Exec(createInvertedIndex)
+
+ if err != nil {
+ panic(err)
+ }
+
+ _, err = db.Exec(createTermIndex)
+
+ if err != nil {
+ panic(err)
+ }
+
+
db.Exec("PRAGMA journal_mode=WAL;")
db.Exec("PRAGMA busy_timeout=2000;")
db.Exec("PRAGMA synchronous=NORMAL;")
@@ -62,6 +77,7 @@ func getNextPath(dbCrawl *sql.DB) (string, string) {
// the process fails during that but is still removed from the db
// We could fix this with a locking mechanism, but I don't know how necessary that'd be
+ // TODO: Make sure the page file is deleted from the filesystem after indexing.
popPage := `DELETE FROM page
WHERE path = (SELECT path FROM page ORDER BY added_timestamp ASC LIMIT 1)
RETURNING url, path`
@@ -71,6 +87,7 @@ func getNextPath(dbCrawl *sql.DB) (string, string) {
err := dbCrawl.QueryRow(popPage).Scan(&selectedUrl, &selectedPath)
+ // TODO: Handle running out of pages to index gracefully (Scan returns sql.ErrNoRows when the page table is empty)
if err != nil {
panic(err)
}
@@ -82,7 +99,6 @@ func insertLinks(db *sql.DB, source string, destinations []string) error {
// createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
insertLinkQuery := "INSERT INTO link (source, destination, added_timestamp) VALUES (?, ?, CURRENT_TIMESTAMP)"
-
// TODO: Why is it not parsing error return?
tx, _ := db.Begin()
@@ -98,6 +114,30 @@ func insertLinks(db *sql.DB, source string, destinations []string) error {
return nil
}
+// insertTerms records how often each term occurs in the document at url.
+//
+// terms is the document's token stream, duplicates included. Occurrences are
+// tallied in memory and one inverted_index row (term, url, count) is inserted
+// per distinct term. All rows are written inside a single transaction, so
+// either every term of the document is stored or none is.
+//
+// It returns an error if the transaction cannot be started, if any insert
+// fails, or if the commit fails. An empty terms slice is a no-op.
+func insertTerms(db *sql.DB, url string, terms []string) error {
+	insertTerm := "INSERT INTO inverted_index (term, url, count) VALUES (?, ?, ?)"
+
+	// Collapse the token stream into per-term occurrence counts.
+	termMap := make(map[string]int)
+	for _, term := range terms {
+		termMap[term]++
+	}
+
+	// Nothing to store, so there is no need to open a transaction.
+	if len(termMap) == 0 {
+		return nil
+	}
+
+	tx, err := db.Begin()
+	if err != nil {
+		return fmt.Errorf("beginning term transaction for %q: %w", url, err)
+	}
+	// Release the transaction (and its pooled connection) on every early
+	// return. After a successful Commit, Rollback only reports sql.ErrTxDone,
+	// so its result is deliberately ignored.
+	defer tx.Rollback()
+
+	for key, value := range termMap {
+		if _, err := tx.Exec(insertTerm, key, url, value); err != nil {
+			return fmt.Errorf("inserting term %q for %q: %w", key, url, err)
+		}
+	}
+
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("committing terms for %q: %w", url, err)
+	}
+	return nil
+}
+
func main() {
dbCrawl := connectToDB(CRAWLED_DIR, CRAWLED_DB_NAME)
@@ -107,31 +147,40 @@ func main() {
dbIndex := connectToDB(INDEXED_DIR, INDEXED_DB_NAME)
defer dbIndex.Close()
- selectedUrl, selectedPath := getNextPath(dbCrawl)
+ for true {
- fmt.Printf("Indexing url: %s\nIndexing path: %s\n", selectedUrl, selectedPath)
+ selectedUrl, selectedPath := getNextPath(dbCrawl)
- bodyBytes, err := os.ReadFile(selectedPath)
+ fmt.Printf("Indexing url: %s\nIndexing path: %s\n", selectedUrl, selectedPath)
- if err != nil {
- panic(err)
- }
+ bodyBytes, err := os.ReadFile(selectedPath)
+
+ if err != nil {
+ panic(err)
+ }
- body := string(bodyBytes)
+ body := string(bodyBytes)
- links := gemtextparser.ParseLinks(body, selectedUrl)
- fmt.Printf("Links: %v\n", links)
- err = insertLinks(dbIndex, selectedUrl, links)
+ links := gemtextparser.ParseLinks(body, selectedUrl)
+ fmt.Printf("Links: %v\n", links)
+ err = insertLinks(dbIndex, selectedUrl, links)
- if err != nil {
- panic(err)
- }
+ if err != nil {
+ panic(err)
+ }
- words := gemtextparser.StemmedDocument(body)
+ words := gemtextparser.StemmedDocument(body)
+ err = insertTerms(dbIndex, selectedUrl, words)
- for _, word := range words {
- // TODO: Setup inverted index
- fmt.Printf("%s,", word)
+ if err != nil {
+ panic(err)
+ }
+
+
+ for _, word := range words {
+ // TODO: Remove this debug print now that insertTerms writes the inverted index
+ fmt.Printf("%s,", word)
+ }
}
}