commit 9f030f04661d3469a6fd50c14c377aa8ebf130eb
parent 5fcff806414493fe51e81e7a0b5b5b8ec28480e0
Author: Andrew Laack <andrew@laack.co>
Date: Wed, 6 May 2026 15:41:10 -0500
Fixed issue w/ non-hierarchical links
Diffstat:
3 files changed, 105 insertions(+), 13 deletions(-)
diff --git a/gemtextparser/parse.go b/gemtextparser/parse.go
@@ -82,7 +82,9 @@ func ParseLinks(body string, currentUrl string) []string {
if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 {
geminiLinks = append(geminiLinks, item[0:indexOfSpaceOrTab])
}
- if strings.Contains(item, "://") == false {
+
+ // Some non-relative URLs have no "//" after the scheme, like mailto: and monero:
+ if strings.Contains(item, ":") == false {
// relative link
u, err := url.Parse(item[0:indexOfSpaceOrTab])
diff --git a/gemtextparser/test_data/nonStandard.gmi b/gemtextparser/test_data/nonStandard.gmi
@@ -54,6 +54,11 @@ I'm not doing any validation for that, I'd expect the requests library to handle
=> something://blog.laack.co/testing213989t some site
=> gemino://blog.laack.co/testing213989t some site
+> this one is funky
+
+=> mailto:gemini@dummy.com
+=> monero:4981027349182734narositenarosite17203498123749
+
=> gemini://laack.co/this/is-not-valid
=> gemini://laack.co/this/is-not-valid
=> gemini://laack.co/this/is-not-valid
diff --git a/index/main.go b/index/main.go
@@ -2,17 +2,26 @@ package main
import (
"database/sql"
+ "os"
+ "errors"
"fmt"
+ "geminisearch/gemtextparser"
_ "github.com/mattn/go-sqlite3"
)
+// TODO: centralize this?
var CRAWLED_DIR = "outputs/crawled"
+var INDEXED_DIR = "outputs/indexed"
var CRAWLED_DB_NAME = "main.db"
+var INDEXED_DB_NAME = "main.db"
-func connectToCrawlDB() *sql.DB {
+// TODO: Probably return error instead of panic...
+func connectToDB(directory string, name string) *sql.DB {
- // TODO: Is a side effect like this okay here?
- db, err := sql.Open("sqlite3", CRAWLED_DIR+"/"+CRAWLED_DB_NAME)
+ if _, err := os.Stat(directory+"/"+name); errors.Is(err, os.ErrNotExist) {
+ panic(err)
+ }
+ db, err := sql.Open("sqlite3", directory+"/"+name)
if err != nil {
panic(err)
@@ -21,24 +30,100 @@ func connectToCrawlDB() *sql.DB {
return db
}
-func getNextPath(dbCrawl *sql.DB) string{
+func ensureIndexDB(dir string, filename string) {
+
+ os.MkdirAll(dir, 0755)
+ db, err := sql.Open("sqlite3", "./"+dir+"/"+filename)
+ defer db.Close()
+
+ if err != nil {
+ panic(err)
+ }
+
+ // Duplicate rows are allowed: a source may link to the same destination more than once, unlikely as that is
+ createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
+
+ _, err = db.Exec(createLink)
+
+ if err != nil {
+ panic(err)
+ }
+
+ db.Exec("PRAGMA journal_mode=WAL;")
+ db.Exec("PRAGMA busy_timeout=2000;")
+ db.Exec("PRAGMA synchronous=NORMAL;")
+
+}
+
+func getNextPath(dbCrawl *sql.DB) (string, string) {
+
+ // It's not guaranteed that this page will actually be indexed: the row is deleted
+ // from the db here, so if the process fails afterwards it won't be picked up again.
+ // We could fix this with a locking mechanism, but I don't know how necessary that'd be
popPage := `DELETE FROM page
WHERE path = (SELECT path FROM page ORDER BY added_timestamp ASC LIMIT 1)
- RETURNING path`
+ RETURNING url, path`
+
+ selectedUrl := ""
+ selectedPath := ""
+
+ err := dbCrawl.QueryRow(popPage).Scan(&selectedUrl, &selectedPath)
+
+ if err != nil {
+ panic(err)
+ }
- res := ""
- dbCrawl.QueryRow(popPage).Scan(&res)
- return res
+ return selectedUrl, selectedPath
+}
+
+func insertLinks(db *sql.DB, source string, destinations []string) error {
+ // createLink := "CREATE TABLE IF NOT EXISTS link (source, destination, added_timestamp datetime);"
+
+ insertLinkQuery := "INSERT INTO link (source, destination, added_timestamp) VALUES (?, ?, CURRENT_TIMESTAMP)"
+
+ // TODO: Why isn't the error return from db.Begin() being handled? It is discarded below.
+ tx, _ := db.Begin()
+
+ for _, current := range destinations {
+ _, err := tx.Exec(insertLinkQuery, source, current)
+
+ if err != nil {
+ return err
+ }
+ }
+
+ tx.Commit()
+ return nil
}
func main() {
- dbCrawl := connectToCrawlDB()
+
+ dbCrawl := connectToDB(CRAWLED_DIR, CRAWLED_DB_NAME)
defer dbCrawl.Close()
+ ensureIndexDB(INDEXED_DIR, INDEXED_DB_NAME)
+ dbIndex := connectToDB(INDEXED_DIR, INDEXED_DB_NAME)
+ defer dbIndex.Close()
+
+ selectedUrl, selectedPath := getNextPath(dbCrawl)
- // insertPageQuery := "INSERT OR REPLACE INTO page (url, path) VALUES (?, ?)"
+ fmt.Printf("Indexing url: %s\nIndexing path: %s\n", selectedUrl, selectedPath)
+
+ bodyBytes, err := os.ReadFile(selectedPath)
+
+ if err != nil {
+ panic(err)
+ }
+
+ body := string(bodyBytes)
+
+ links := gemtextparser.ParseLinks(body, selectedUrl)
+ fmt.Printf("Links: %v\n", links)
+ err = insertLinks(dbIndex, selectedUrl, links)
+
+ if err != nil {
+ panic(err)
+ }
- res := getNextPath(dbCrawl)
- fmt.Printf("Indexing: %s\n", res)
}