commit 6654bdb0e73dbb6f1328cd341aed8f23936dcb96
parent 7844a7045ce08b808a43213e2f2a24588b3109cd
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Thu, 1 Jan 2026 16:33:02 -0600
Improved link tracking
Diffstat:
2 files changed, 36 insertions(+), 25 deletions(-)
diff --git a/TODO.md b/TODO.md
@@ -31,6 +31,9 @@
---
+- add centralized indexing
+ - I added an `indexed` field to the site table to support this, so indexing can be done incrementally
+- smarter queueing
- url lookup table
- fixes some of the memory issues
- ensure pruning prior to writing
diff --git a/collection/spider.py b/collection/spider.py
@@ -17,10 +17,10 @@ from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
import sys
-import base64
import sqlite3
import random
from prune import process_file
+import time
# Layout:
# - sites
@@ -33,7 +33,7 @@ from prune import process_file
# no, this is not 3nf, no I don't care, this is faster.
# tables:
# site
- # url, filepath, date
+ # url, filepath, date, indexed
# tf
# TODO
# - urls.db
@@ -45,12 +45,16 @@ from prune import process_file
MAX_SIZE = 2_000_000
MAX_WORKERS = 5
MAX_URLS_PER_SITE = 100
+NOT_INDEXED = 0
+INDEXED = 1
+REINDEX_FREQUENCY_DAYS = 7
-def url_to_filename(url):
- return base64.urlsafe_b64encode(url.encode()).decode() + ".html"
-
-def filename_to_url(filename):
- return base64.urlsafe_b64decode(filename[:-5]).decode()
+def should_queue(url, cur):
+ cutoff = time.time() - (REINDEX_FREQUENCY_DAYS * 86400)
+ cur.execute("""
+ SELECT 1 FROM site WHERE url = ? AND date > ? LIMIT 1
+ """, (url, cutoff))
+ return cur.fetchone() is None
def search_url(url, filepath):
links = set()
@@ -59,6 +63,8 @@ def search_url(url, filepath):
}
try:
source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory
+ if source_code.status_code != 200:
+ return "", "", []
soup = BeautifulSoup(source_code.content, 'html.parser')
content = soup.prettify()
@@ -70,7 +76,6 @@ def search_url(url, filepath):
# we don't want to spider from bad sites.
# process_file does some regexp checks on the site to see if it is short / bad in some other way.
-
if deleted:
return "", "", []
else:
@@ -102,11 +107,16 @@ def search_url(url, filepath):
return filepath, url, links
-def get_links(num_links, cur_link):
+# Pop links from the queue (delete and return them in one statement) so multiple processes can run concurrently.
+def get_links(num_links, cur_link, con_links):
cur_link.execute("""
- SELECT url FROM link ORDER BY priority DESC LIMIT ?
+ DELETE FROM link
+ WHERE url IN (SELECT url FROM link ORDER BY priority DESC LIMIT ?)
+ RETURNING url
""", (num_links,))
- return {row[0] for row in cur_link.fetchall()}
+ urls = {row[0] for row in cur_link.fetchall()}
+ con_links.commit()
+ return urls
if __name__ == "__main__":
@@ -117,8 +127,9 @@ if __name__ == "__main__":
con.execute('PRAGMA journal_mode=WAL')
cur = con.cursor()
- cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date)")
+ cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date, indexed)")
cur.execute("CREATE INDEX IF NOT EXISTS idx_site_url ON site(url)")
+ cur.execute("CREATE INDEX IF NOT EXISTS idx_site_indexed ON site(indexed)")
cur.execute("CREATE INDEX IF NOT EXISTS idx_site_filepath ON site(filepath)")
@@ -143,7 +154,7 @@ if __name__ == "__main__":
# TODO: better stopping. only stops when all links have been traversed
while True:
if len(urls) == 0:
- urls = get_links(MAX_WORKERS, cur_link)
+ urls = get_links(MAX_WORKERS, cur_link, con_links)
if len(urls) == 0:
print("NO MORE QUEUED LINKS TO SEARCH, EXITING")
break
@@ -161,24 +172,21 @@ if __name__ == "__main__":
for future in as_completed(futures):
filepath, url, links = future.result()
if filepath != '' and url != '':
- # dequeue: this assumes we can only queue once. it might make sense to not do this in the future.
- cur_link.execute("""
- DELETE FROM link where url = ?
- """, (url, ))
- con_links.commit()
-
# insert into site list
cur.execute("""
- INSERT INTO site VALUES (?, ?, ?)
- """, (url, filepath, datetime.datetime.now().timestamp()))
+ INSERT INTO site VALUES (?, ?, ?, ?)
+ """, (url, filepath, datetime.datetime.now().timestamp(), NOT_INDEXED))
con.commit()
for link in links:
# TODO: Make priority better, also speed this up with transactions
# also, do we want duplicates? we assume earlier ones are better than the current, but that is weird
- cur_link.execute("""
- INSERT OR IGNORE INTO link VALUES (?, ?)
- """, (link, random.randint(0,10000)))
- con_links.commit()
+ if should_queue(link, cur):
+ cur_link.execute("""
+ INSERT OR IGNORE INTO link VALUES (?, ?)
+ """, (link, random.randint(0,10000)))
+ con_links.commit()
+ else:
+ print(f"Skipping '{link}' for indexing")
urls = set()