commit bdfe38a18bdcf7dc4ac5427c1d9f0273111f665f
parent 510179519c68ffa2fca7d03aa83172a761914742
Author: Andrew Laack <andrew@laack.co>
Date: Wed, 6 May 2026 18:59:52 -0500
Improved indexing
Diffstat:
| M | index/main.go | | | 88 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- |
1 file changed, 86 insertions(+), 2 deletions(-)
diff --git a/index/main.go b/index/main.go
@@ -45,6 +45,10 @@ func ensureIndexDB(dir string, filename string) {
createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
createInvertedIndex := "CREATE TABLE IF NOT EXISTS inverted_index (term, url, count);"
createTermIndex := "CREATE INDEX IF NOT EXISTS term_idx ON inverted_index(term);"
+ createDocumentIndex := "CREATE INDEX IF NOT EXISTS url_idx ON inverted_index(url);"
+
+ // TODO: Add filesystem location for this (mv {old} {new} location)
+ createDocument := "CREATE TABLE IF NOT EXISTS document (url UNIQUE, indexed_timestamp datetime);"
_, err = db.Exec(createLink)
@@ -64,6 +68,18 @@ func ensureIndexDB(dir string, filename string) {
panic(err)
}
+ _, err = db.Exec(createDocumentIndex)
+
+ if err != nil {
+ panic(err)
+ }
+
+ _, err = db.Exec(createDocument)
+
+ if err != nil {
+ panic(err)
+ }
+
db.Exec("PRAGMA journal_mode=WAL;")
db.Exec("PRAGMA busy_timeout=2000;")
@@ -87,8 +103,9 @@ func getNextPath(dbCrawl *sql.DB) (string, string) {
err := dbCrawl.QueryRow(popPage).Scan(&selectedUrl, &selectedPath)
- // TODO: Handle out of things to index gracefully
- if err != nil {
+ if err == sql.ErrNoRows {
+ return "", ""
+ } else if err != nil {
panic(err)
}
@@ -114,6 +131,32 @@ func insertLinks(db *sql.DB, source string, destinations []string) error {
return nil
}
+
+// insertDocument records url in the document table, stamped with the
+// database's CURRENT_TIMESTAMP, inside its own transaction.
+//
+// The insert has no ON CONFLICT clause: any existing row for url is expected
+// to have been deleted first (see ensureNotIndexed). Because document.url is
+// UNIQUE, inserting a URL that is still present (for example when two workers
+// index the same page at once) fails and the error is returned.
+//
+// It returns any error from beginning the transaction, executing the insert,
+// or committing. On failure the transaction is rolled back.
+func insertDocument(db *sql.DB, url string) error {
+	const insertDocumentQuery = "INSERT INTO document (url, indexed_timestamp) VALUES (?, CURRENT_TIMESTAMP)"
+
+	tx, err := db.Begin()
+	if err != nil {
+		return err
+	}
+	// Release the transaction on every early return. After a successful
+	// Commit this only yields sql.ErrTxDone, which is safe to ignore.
+	defer tx.Rollback()
+
+	if _, err := tx.Exec(insertDocumentQuery, url); err != nil {
+		return err
+	}
+	return tx.Commit()
+}
+
func insertTerms(db *sql.DB, url string, terms []string) error {
insertTerm := "INSERT INTO inverted_index (term, url, count) VALUES (?, ?, ?)"
@@ -138,6 +181,33 @@ func insertTerms(db *sql.DB, url string, terms []string) error {
}
+
+// createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
+// createInvertedIndex := "CREATE TABLE IF NOT EXISTS inverted_index (term, url, count);"
+// createTermIndex := "CREATE INDEX IF NOT EXISTS term_idx ON inverted_index(term);"
+// createDocumentIndex := "CREATE INDEX IF NOT EXISTS url_idx ON inverted_index(url);"
+// createDocument := "CREATE TABLE IF NOT EXISTS document (url UNIQUE, indexed_timestamp datetime);"
+
+// ensureNotIndexed removes everything previously stored for url so the page
+// can be indexed again from scratch: its document row, its inverted_index
+// rows, and the link rows whose source is url.
+//
+// The three deletes share one transaction, so a crash or error partway
+// through cannot leave the URL partially removed. Any database error panics;
+// the deferred Rollback then discards whatever had been deleted so far.
+func ensureNotIndexed(url string, db *sql.DB) {
+	tx, err := db.Begin()
+	if err != nil {
+		panic(err)
+	}
+	// Runs during a panic as well. After a successful Commit it only yields
+	// sql.ErrTxDone, which is safe to ignore.
+	defer tx.Rollback()
+
+	deletes := []string{
+		"DELETE FROM document WHERE url = ?",
+		"DELETE FROM inverted_index WHERE url = ?",
+		"DELETE FROM link WHERE source = ?",
+	}
+	for _, stmt := range deletes {
+		if _, err := tx.Exec(stmt, url); err != nil {
+			panic(err)
+		}
+	}
+
+	if err := tx.Commit(); err != nil {
+		panic(err)
+	}
+}
+
func main() {
dbCrawl := connectToDB(CRAWLED_DIR, CRAWLED_DB_NAME)
@@ -150,6 +220,11 @@ func main() {
for true {
selectedUrl, selectedPath := getNextPath(dbCrawl)
+ if selectedUrl == "" && selectedPath == "" {
+ fmt.Println("No more documents do index, exiting gracefully.")
+ return
+ }
+ ensureNotIndexed(selectedUrl, dbIndex)
fmt.Printf("Indexing url: %s\nIndexing path: %s\n", selectedUrl, selectedPath)
@@ -165,6 +240,15 @@ func main() {
fmt.Printf("Links: %v\n", links)
err = insertLinks(dbIndex, selectedUrl, links)
+
+ if err != nil {
+ panic(err)
+ }
+
+ // TODO: This should be in the same transaction as the link insertions and term insertions
+ err = insertDocument(dbIndex, selectedUrl)
+
+
if err != nil {
panic(err)
}