information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 6654bdb0e73dbb6f1328cd341aed8f23936dcb96
parent 7844a7045ce08b808a43213e2f2a24588b3109cd
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Thu,  1 Jan 2026 16:33:02 -0600

Improved link tracking

Diffstat:
M TODO.md | 3 +++
M collection/spider.py | 58 +++++++++++++++++++++++++++++++++-------------------------
2 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/TODO.md b/TODO.md @@ -31,6 +31,9 @@ --- +- add centralized indexing + - i added an indexed field to support this idea for incremental indexing +- smarter queueing - url lookup table - fixes some of the memory issues - ensure pruning prior to writing diff --git a/collection/spider.py b/collection/spider.py @@ -17,10 +17,10 @@ from urllib.parse import urljoin, urlparse from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup import sys -import base64 import sqlite3 import random from prune import process_file +import time # Layout: # - sites @@ -33,7 +33,7 @@ from prune import process_file # no, this is not 3nf, no I don't care, this is faster. # tables: # site - # url, filepath, date + # url, filepath, date, indexed # tf # TODO # - urls.db @@ -45,12 +45,16 @@ from prune import process_file MAX_SIZE = 2_000_000 MAX_WORKERS = 5 MAX_URLS_PER_SITE = 100 +NOT_INDEXED = 0 +INDEXED = 1 +REINDEX_FREQUENCY_DAYS = 7 -def url_to_filename(url): - return base64.urlsafe_b64encode(url.encode()).decode() + ".html" - -def filename_to_url(filename): - return base64.urlsafe_b64decode(filename[:-5]).decode() +def should_queue(url, cur): + cutoff = time.time() - (REINDEX_FREQUENCY_DAYS * 86400) + cur.execute(""" + SELECT 1 FROM site WHERE url = ? AND date > ? LIMIT 1 + """, (url, cutoff)) + return cur.fetchone() is None def search_url(url, filepath): links = set() @@ -59,6 +63,8 @@ def search_url(url, filepath): } try: source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory + if source_code.status_code != 200: + return "", "", [] soup = BeautifulSoup(source_code.content, 'html.parser') content = soup.prettify() @@ -70,7 +76,6 @@ def search_url(url, filepath): # we don't want to spider from bad sites. # process_file does some regexp checks on the site to see if it is short / bad in some other way. 
- if deleted: return "", "", [] else: @@ -102,11 +107,16 @@ def search_url(url, filepath): return filepath, url, links -def get_links(num_links, cur_link): +# pop links so multiple processes can run concurrently +def get_links(num_links, cur_link, con_links): cur_link.execute(""" - SELECT url FROM link ORDER BY priority DESC LIMIT ? + DELETE FROM link + WHERE url IN (SELECT url FROM link ORDER BY priority DESC LIMIT ?) + RETURNING url """, (num_links,)) - return {row[0] for row in cur_link.fetchall()} + urls = {row[0] for row in cur_link.fetchall()} + con_links.commit() + return urls if __name__ == "__main__": @@ -117,8 +127,9 @@ if __name__ == "__main__": con.execute('PRAGMA journal_mode=WAL') cur = con.cursor() - cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date)") + cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date, indexed)") cur.execute("CREATE INDEX IF NOT EXISTS idx_site_url ON site(url)") + cur.execute("CREATE INDEX IF NOT EXISTS idx_site_indexed ON site(indexed)") cur.execute("CREATE INDEX IF NOT EXISTS idx_site_filepath ON site(filepath)") @@ -143,7 +154,7 @@ if __name__ == "__main__": # TODO: better stopping. only stops when all links have been traversed while True: if len(urls) == 0: - urls = get_links(MAX_WORKERS, cur_link) + urls = get_links(MAX_WORKERS, cur_link, con_links) if len(urls) == 0: print("NO MORE QUEUED LINKS TO SEARCH, EXITING") break @@ -161,24 +172,21 @@ if __name__ == "__main__": for future in as_completed(futures): filepath, url, links = future.result() if filepath != '' and url != '': - # dequeue: this assumes we can only queue once. it might make sense to not do this in the future. - cur_link.execute(""" - DELETE FROM link where url = ? - """, (url, )) - con_links.commit() - # insert into site list cur.execute(""" - INSERT INTO site VALUES (?, ?, ?) - """, (url, filepath, datetime.datetime.now().timestamp())) + INSERT INTO site VALUES (?, ?, ?, ?) 
+ """, (url, filepath, datetime.datetime.now().timestamp(), NOT_INDEXED)) con.commit() for link in links: # TODO: Make priority better, also speed this up with transactions # also, do we want duplicates? we assume earlier ones are better than the current, but that is weird - cur_link.execute(""" - INSERT OR IGNORE INTO link VALUES (?, ?) - """, (link, random.randint(0,10000))) - con_links.commit() + if should_queue(link, cur): + cur_link.execute(""" + INSERT OR IGNORE INTO link VALUES (?, ?) + """, (link, random.randint(0,10000))) + con_links.commit() + else: + print(f"Skipping '{link}' for indexing") urls = set()