Improved link tracking - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit 6654bdb0e73dbb6f1328cd341aed8f23936dcb96
parent 7844a7045ce08b808a43213e2f2a24588b3109cd
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Thu,  1 Jan 2026 16:33:02 -0600

Improved link tracking

Diffstat:
M TODO.md  | 3 +++
M collection/spider.py  | 58 +++++++++++++++++++++++++++++++++-------------------------

2 files changed, 36 insertions(+), 25 deletions(-)
diff --git a/TODO.md b/TODO.md
@@ -31,6 +31,9 @@
 
 ---
 
+- add centralized indexing
+    - I added an `indexed` field to the site table to support this; it enables incremental indexing
+- smarter queueing
 - url lookup table
     - fixes some of the memory issues
 - ensure pruning prior to writing
diff --git a/collection/spider.py b/collection/spider.py
@@ -17,10 +17,10 @@ from urllib.parse import urljoin, urlparse
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
 import sys
-import base64
 import sqlite3
 import random
 from prune import process_file
+import time
 
 # Layout:
 # - sites
@@ -33,7 +33,7 @@ from prune import process_file
             # no, this is not 3nf, no I don't care, this is faster.
         # tables:
             # site
-                # url, filepath, date
+                # url, filepath, date, indexed
             # tf
                 # TODO
     #   - urls.db
@@ -45,12 +45,16 @@ from prune import process_file
 MAX_SIZE = 2_000_000
 MAX_WORKERS = 5
 MAX_URLS_PER_SITE = 100
+NOT_INDEXED = 0
+INDEXED = 1
+REINDEX_FREQUENCY_DAYS = 7
 
-def url_to_filename(url):
-    return base64.urlsafe_b64encode(url.encode()).decode() + ".html"
-
-def filename_to_url(filename):
-    return base64.urlsafe_b64decode(filename[:-5]).decode()
+def should_queue(url, cur):
+    cutoff = time.time() - (REINDEX_FREQUENCY_DAYS * 86400)
+    cur.execute("""
+    SELECT 1 FROM site WHERE url = ? AND date > ? LIMIT 1
+    """, (url, cutoff))
+    return cur.fetchone() is None
 
 def search_url(url, filepath):
     links = set()
@@ -59,6 +63,8 @@ def search_url(url, filepath):
     }
     try:
         source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory
+        if source_code.status_code != 200:
+            return "", "", []
         soup = BeautifulSoup(source_code.content, 'html.parser')
         content = soup.prettify()
 
@@ -70,7 +76,6 @@ def search_url(url, filepath):
 
             # we don't want to spider from bad sites.
             # process_file does some regexp checks on the site to see if it is short / bad in some other way.
-
             if deleted:
                 return "", "", []
         else:
@@ -102,11 +107,16 @@ def search_url(url, filepath):
 
     return filepath, url, links
 
-def get_links(num_links, cur_link):
+# Pop links (delete them from the queue as they are read) so multiple processes can run concurrently.
+def get_links(num_links, cur_link, con_links):
     cur_link.execute("""
-        SELECT url FROM link ORDER BY priority DESC LIMIT ?
+        DELETE FROM link 
+        WHERE url IN (SELECT url FROM link ORDER BY priority DESC LIMIT ?)
+        RETURNING url
     """, (num_links,))
-    return {row[0] for row in cur_link.fetchall()}
+    urls = {row[0] for row in cur_link.fetchall()}
+    con_links.commit()
+    return urls
 
 
 if __name__ == "__main__":
@@ -117,8 +127,9 @@ if __name__ == "__main__":
     con.execute('PRAGMA journal_mode=WAL')
     cur = con.cursor()
 
-    cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date)")
+    cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date, indexed)")
     cur.execute("CREATE INDEX IF NOT EXISTS idx_site_url ON site(url)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_site_indexed ON site(indexed)")
     cur.execute("CREATE INDEX IF NOT EXISTS idx_site_filepath ON site(filepath)")
 
 
@@ -143,7 +154,7 @@ if __name__ == "__main__":
     # TODO: better stopping. only stops when all links have been traversed
     while True:
         if len(urls) == 0:
-            urls = get_links(MAX_WORKERS, cur_link)
+            urls = get_links(MAX_WORKERS, cur_link, con_links)
             if len(urls) == 0:
                 print("NO MORE QUEUED LINKS TO SEARCH, EXITING")
                 break
@@ -161,24 +172,21 @@ if __name__ == "__main__":
             for future in as_completed(futures):
                 filepath, url, links = future.result()
                 if filepath != '' and url != '':
-                    # dequeue: this assumes we can only queue once. it might make sense to not do this in the future.
-                    cur_link.execute("""
-                        DELETE FROM link where url = ?
-                    """, (url, ))
-                    con_links.commit()
-
                     # insert into site list
                     cur.execute("""
-                        INSERT INTO site VALUES (?, ?, ?)
-                    """, (url, filepath, datetime.datetime.now().timestamp()))
+                        INSERT INTO site VALUES (?, ?, ?, ?)
+                    """, (url, filepath, datetime.datetime.now().timestamp(), NOT_INDEXED))
                     con.commit()
                 for link in links:
                     # TODO: Make priority better, also speed this up with transactions
                     # also, do we want duplicates? we assume earlier ones are better than the current, but that is weird
-                    cur_link.execute("""
-                        INSERT OR IGNORE INTO link VALUES (?, ?)
-                    """, (link, random.randint(0,10000)))
-                    con_links.commit()
+                    if should_queue(link, cur):
+                        cur_link.execute("""
+                            INSERT OR IGNORE INTO link VALUES (?, ?)
+                        """, (link, random.randint(0,10000)))
+                        con_links.commit()
+                    else:
+                        print(f"Skipping '{link}' for indexing")
         urls = set()

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

M	TODO.md	\|	3	+++
M	collection/spider.py	\|	58	+++++++++++++++++++++++++++++++++-------------------------