gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

commit 9f030f04661d3469a6fd50c14c377aa8ebf130eb
parent 5fcff806414493fe51e81e7a0b5b5b8ec28480e0
Author: Andrew Laack <andrew@laack.co>
Date:   Wed,  6 May 2026 15:41:10 -0500

Fixed issue w/ non-hierarchical links

Diffstat:
Mgemtextparser/parse.go | 4+++-
Mgemtextparser/test_data/nonStandard.gmi | 5+++++
Mindex/main.go | 109++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
3 files changed, 105 insertions(+), 13 deletions(-)

diff --git a/gemtextparser/parse.go b/gemtextparser/parse.go @@ -82,7 +82,9 @@ func ParseLinks(body string, currentUrl string) []string { if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 { geminiLinks = append(geminiLinks, item[0:indexOfSpaceOrTab]) } - if strings.Contains(item, "://") == false { + + // there are urls that aren't relative that don't have // like like mailto: and monero: + if strings.Contains(item, ":") == false { // relative link u, err := url.Parse(item[0:indexOfSpaceOrTab]) diff --git a/gemtextparser/test_data/nonStandard.gmi b/gemtextparser/test_data/nonStandard.gmi @@ -54,6 +54,11 @@ I'm not doing any validation for that, I'd expect the requests library to handle => something://blog.laack.co/testing213989t some site => gemino://blog.laack.co/testing213989t some site +> this one is funky + +=> mailto:gemini@dummy.com +=> monero:4981027349182734narositenarosite17203498123749 + => gemini://laack.co/this/is-not-valid => gemini://laack.co/this/is-not-valid => gemini://laack.co/this/is-not-valid diff --git a/index/main.go b/index/main.go @@ -2,17 +2,26 @@ package main import ( "database/sql" + "os" + "errors" "fmt" + "geminisearch/gemtextparser" _ "github.com/mattn/go-sqlite3" ) +// TODO: centralize this? var CRAWLED_DIR = "outputs/crawled" +var INDEXED_DIR = "outputs/indexed" var CRAWLED_DB_NAME = "main.db" +var INDEXED_DB_NAME = "main.db" -func connectToCrawlDB() *sql.DB { +// TODO: Probably return error instead of panic... +func connectToDB(directory string, name string) *sql.DB { - // TODO: Is a side effect like this okay here? - db, err := sql.Open("sqlite3", CRAWLED_DIR+"/"+CRAWLED_DB_NAME) + if _, err := os.Stat(directory+"/"+name); errors.Is(err, os.ErrNotExist) { + panic(err) + } + db, err := sql.Open("sqlite3", directory+"/"+name) if err != nil { panic(err) @@ -21,24 +30,100 @@ func connectToCrawlDB() *sql.DB { return db } -func getNextPath(dbCrawl *sql.DB) string{ +func ensureIndexDB(dir string, filename string) { + + os.MkdirAll(dir, 0755) + db, err := sql.Open("sqlite3", "./"+dir+"/"+filename) + defer db.Close() + + if err != nil { + panic(err) + } + + // There could be multiple links from the same source to the same destination, though that'd be kinda stupid + createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);" + + _, err = db.Exec(createLink) + + if err != nil { + panic(err) + } + + db.Exec("PRAGMA journal_mode=WAL;") + db.Exec("PRAGMA busy_timeout=2000;") + db.Exec("PRAGMA synchronous=NORMAL;") + +} + +func getNextPath(dbCrawl *sql.DB) (string, string) { + + // It's not guaranteed this site will be indexed because it's possible + // the process fails during that but is still removed from the db + // We could fix this with a locking mechanism, but I don't know how necessary that'd be popPage := `DELETE FROM page WHERE path = (SELECT path FROM page ORDER BY added_timestamp ASC LIMIT 1) - RETURNING path` + RETURNING url, path` + + selectedUrl := "" + selectedPath := "" + + err := dbCrawl.QueryRow(popPage).Scan(&selectedUrl, &selectedPath) + + if err != nil { + panic(err) + } - res := "" - dbCrawl.QueryRow(popPage).Scan(&res) - return res + return selectedUrl, selectedPath +} + +func insertLinks(db *sql.DB, source string, destinations []string) error { + // createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);" + + insertLinkQuery := "INSERT INTO link (source, destination, added_timestamp) VALUES (?, ?, CURRENT_TIMESTAMP)" + + // TODO: Why is it not parsing error return? + tx, _ := db.Begin() + + for _, current := range destinations { + _, err := tx.Exec(insertLinkQuery, source, current) + + if err != nil { + return err + } + } + + tx.Commit() + return nil } func main() { - dbCrawl := connectToCrawlDB() + + dbCrawl := connectToDB(CRAWLED_DIR, CRAWLED_DB_NAME) defer dbCrawl.Close() + ensureIndexDB(INDEXED_DIR, INDEXED_DB_NAME) + dbIndex := connectToDB(INDEXED_DIR, INDEXED_DB_NAME) + defer dbIndex.Close() + + selectedUrl, selectedPath := getNextPath(dbCrawl) - // insertPageQuery := "INSERT OR REPLACE INTO page (url, path) VALUES (?, ?)" + fmt.Printf("Indexing url: %s\nIndexing path: %s\n", selectedUrl, selectedPath) + + bodyBytes, err := os.ReadFile(selectedPath) + + if err != nil { + panic(err) + } + + body := string(bodyBytes) + + links := gemtextparser.ParseLinks(body, selectedUrl) + fmt.Printf("Links: %v\n", links) + err = insertLinks(dbIndex, selectedUrl, links) + + if err != nil { + panic(err) + } - res := getNextPath(dbCrawl) - fmt.Printf("Indexing: %s\n", res) }