commit 3fd57e78245f76e883bf38fd86c4b40be798b05a
parent 677346e211c3c9544f3708ab117f2f61e2b27f01
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Thu, 1 Jan 2026 23:24:40 -0600
Added language detection to pipeline and queries. improved spider prioritization.
Diffstat:
6 files changed, 146 insertions(+), 54 deletions(-)
diff --git a/TODO.md b/TODO.md
@@ -52,3 +52,71 @@
- forward / backlink calculation
- maybe after crawling as these are derived / indexing type values
+
+---
+
+- improve priority assignment
+ - how to do this?
+ - rank domains based on importance
+ - how?
+ - then use ranking to assign priority
+ - there should be some temporal priority too so maybe it is based on date/time inversion stuff
+
+- support sitemap .xml
+ - this should more fully search specific domains
+ - these should (probably) take priority over other links
+ - example: https://www.google.com/chromebook/sitemap.xml
+ - we should treat these differently because sitemaps don't contain href attributes
+ - so extract their URLs with a regexp instead
+- something interesting would be link counts
+ - the more a page is linked to, the better
+- respect:
+ - noindex and nofollow
+ - https://en.wikipedia.org/wiki/Web_crawler
+- what I want
+ - I want to crawl a smaller subset of the internet that is useful
+ - I get lots of stuff from audible which is kind of useless in terms of importance
+ - could we do some sort of ranking per domain for average IC across documents?
+ - maybe, but would that just prioritize random garbage?
+ - probably....
+ - maybe prioritize longer documents
+ - that seems like it could be easily gamed, but also this is just the spider
+
+---
+
+- add regexp for url filtering
+ - don't use stuff with parameters (or logins maybe?)
+ - https://www.oed.com/shibboleth-login-redirect?returnUrl=https://oup-sp.sams-sigma.com/Shibboleth.sso/Login?SAMLDS%3D1%26target%3Dss%253Amem%253A292bfd0634875a2fa6b2ffc899913c45dca16f346eb3fd65a9b1269d9c16a659&entityID=https://idp.plymouthart.ac.uk/shibboleth,
+
+---
+
+next selection where C is the corpus:
+
+Where C_i,b is the backlinks of the i'th element
+Where C_i,t is the seconds since being added to the corpus
+(uncertain about this one) Where C_i,d is the distance from a seed authority (beta being a negative hyperparameter)
+
+s(C) = max over i of ( C_i,b * alpha + C_i,t * gamma + z + C_i,d * beta )
+
+---
+
+
+Other approaches:
+
+- breadth first
+ - The explanation given by the authors for this result is that "the most important pages have many links to them from numerous hosts, and those links will be found early, regardless of on which host or page the crawl originates."
+ - this makes sense as a starting point, but then we have to consider other things too... because otherwise we never reindex things???
+ - or do we expect circularity to fix that issue?
+
+---
+
+- languages
+ - this is really important so put at top of list
+ - we want to calculate and add a column to the site table for the language of each site
+ - we then want these joined together for search querying
+
+---
+
+- sites should be prioritized by language as well...
+ - maybe, we don't actually know the language until we query it
+ - could use a heuristic based on subdomain
diff --git a/collection/lang-detect.py b/collection/lang-detect.py
@@ -1,20 +0,0 @@
-from langdetect import detect
-from indexing.utils import get_plaintext
-from indexing.utils import get_filepaths
-import tqdm
-
-def detect_language(text):
- return detect(text)
-
-if __name__ == "__main__":
- filepaths = get_filepaths("sites/")
- languages = {}
- for i in tqdm.tqdm(range(0, len(filepaths))):
- filepath = filepaths[i]
- current = detect_language(get_plaintext(filepath))
- if current not in languages:
- languages[current] = 1
- else:
- languages[current] += 1
- print(languages)
-
diff --git a/collection/spider.py b/collection/spider.py
@@ -21,7 +21,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
import sys
import sqlite3
-import random
from prune import process_file
import time
@@ -44,6 +43,10 @@ import time
# tables:
# url, priority, (possible add distance from authority here as well, uncertain)
+
+# NOTE: there seems to be a memory leak somewhere. Its size appears to be limited by MAX_WORKERS, but it still needs to be tracked down.
+
+
# bytes
MAX_SIZE = 2_000_000
MAX_WORKERS = 50
@@ -52,6 +55,13 @@ NOT_INDEXED = 0
INDEXED = 1
REINDEX_FREQUENCY_DAYS = 7
+
+# if SECONDS_WEIGHT == 1 then BACKLINK_WEIGHT == 3600 means each additional
+# backlink is worth as much priority as one extra hour spent waiting in the queue
+
+BACKLINK_WEIGHT = 3600
+SECONDS_WEIGHT = 1
+
def should_queue(url, cur):
cutoff = time.time() - (REINDEX_FREQUENCY_DAYS * 86400)
cur.execute("""
@@ -59,7 +69,7 @@ def should_queue(url, cur):
""", (url, cutoff))
return cur.fetchone() is None
-def is_allowed(url, user_agent, timeout=.5):
+def is_allowed(url, user_agent, timeout=1):
try:
parsed = urlparse(url)
robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
@@ -141,12 +151,18 @@ def search_url(url, filepath, respect_robots_txt=True):
return filepath, url, links
# pop links so multiple processes can run concurrently
-def get_links(num_links, cur_link, con_links):
+def get_links(num_links, cur_link, con_links, backlink_weight, seconds_weight):
+ """Delete up to num_links rows from link, highest backlink_count * backlink_weight + seconds_weight * (current time - added) first, commit, and return their urls as a set."""
cur_link.execute("""
DELETE FROM link
- WHERE url IN (SELECT url FROM link ORDER BY priority DESC LIMIT ?)
+ WHERE url IN (
+ SELECT url FROM link
+ ORDER BY (backlink_count * ?) + ( ? * (? - added)) DESC
+ LIMIT ?
+ )
RETURNING url
- """, (num_links,))
+ """, (backlink_weight, seconds_weight, int(time.time()), num_links))
+
urls = {row[0] for row in cur_link.fetchall()}
con_links.commit()
return urls
@@ -160,18 +176,20 @@ if __name__ == "__main__":
con.execute('PRAGMA journal_mode=WAL')
cur = con.cursor()
- cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date, indexed)")
+ cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date, indexed, language)")
cur.execute("CREATE INDEX IF NOT EXISTS idx_site_url ON site(url)")
cur.execute("CREATE INDEX IF NOT EXISTS idx_site_indexed ON site(indexed)")
cur.execute("CREATE INDEX IF NOT EXISTS idx_site_filepath ON site(filepath)")
+ cur.execute("CREATE INDEX IF NOT EXISTS idx_site_language ON site(language)")  # index the language column; filepath already has idx_site_filepath
con_links = sqlite3.connect('database/urls.db', timeout=60)
con_links.execute('PRAGMA journal_mode=WAL')
cur_link = con_links.cursor()
- cur_link.execute("CREATE TABLE IF NOT EXISTS link(url UNIQUE, priority)")
+ cur_link.execute("CREATE TABLE IF NOT EXISTS link(url UNIQUE, backlink_count, added)")
cur_link.execute("CREATE INDEX IF NOT EXISTS idx_link_url ON link(url)")
- cur_link.execute("CREATE INDEX IF NOT EXISTS idx_link_priority ON link(priority)")
+ cur_link.execute("CREATE INDEX IF NOT EXISTS idx_link_added ON link(added)")
+ cur_link.execute("CREATE INDEX IF NOT EXISTS idx_link_backlink_count ON link(backlink_count)")
urls = set()
@@ -187,7 +205,7 @@ if __name__ == "__main__":
# TODO: better stopping. only stops when all links have been traversed
while True:
if len(urls) == 0:
- urls = get_links(MAX_WORKERS, cur_link, con_links)
+ urls = get_links(MAX_WORKERS, cur_link, con_links, BACKLINK_WEIGHT, SECONDS_WEIGHT)
if len(urls) == 0:
print("NO MORE QUEUED LINKS TO SEARCH, EXITING")
break
@@ -207,16 +225,18 @@ if __name__ == "__main__":
if filepath != '' and url != '':
# insert into site list
cur.execute("""
- INSERT INTO site VALUES (?, ?, ?, ?)
- """, (url, filepath, datetime.datetime.now().timestamp(), NOT_INDEXED))
+ INSERT INTO site (url, filepath, date, indexed)
+ VALUES (?, ?, ?, ?)
+ """, (url, filepath, int(datetime.datetime.now().timestamp()), NOT_INDEXED))
con.commit()
for link in links:
# TODO: Make priority better, also speed this up with transactions
- # also, do we want duplicates? we assume earlier ones are better than the current, but that is weird
+ # link(url UNIQUE, backlink_count, added)
if should_queue(link, cur):
cur_link.execute("""
- INSERT OR IGNORE INTO link VALUES (?, ?)
- """, (link, random.randint(0,10000)))
+ INSERT INTO link VALUES (?, 1, ?)
+ ON CONFLICT(url) DO UPDATE SET backlink_count = backlink_count + 1
+ """, (link, int(datetime.datetime.now().timestamp())))
con_links.commit()
else:
print(f"Skipping '{link}' for indexing")
@@ -224,6 +244,7 @@ if __name__ == "__main__":
cur_link.close()
+
con_links.close()
cur.close()
con.close()
diff --git a/indexing/lang-detect.py b/indexing/lang-detect.py
@@ -0,0 +1,28 @@
+"""Backfill site.language for every row of the site table whose language is NULL."""
+import sqlite3
+
+from langdetect import detect
+from langdetect.lang_detect_exception import LangDetectException
+from tqdm import tqdm
+from utils import get_plaintext
+
+
+def detect_language(text):
+    """Return the language code langdetect reports for text."""
+    return detect(text)
+
+
+if __name__ == "__main__":
+    con_sites = sqlite3.connect('database/manifest.db', timeout=60)
+    con_sites.execute('PRAGMA journal_mode=WAL')
+    cur_sites = con_sites.cursor()
+    cur_sites.execute("SELECT filepath FROM site WHERE language IS NULL")
+    for (filepath,) in tqdm(cur_sites.fetchall()):
+        try:
+            lang = detect_language(get_plaintext(filepath))
+        except LangDetectException:
+            continue  # nothing detectable: leave language NULL rather than abort the whole run
+        cur_sites.execute("UPDATE site SET language = ? WHERE filepath = ?", (lang, filepath))
+        con_sites.commit()
+    cur_sites.close()
+    con_sites.close()
diff --git a/search/query.py b/search/query.py
@@ -6,6 +6,7 @@ if __name__ == "__main__":
query = sys.argv[1]
limit = int(sys.argv[2])
+ language = sys.argv[3] if len(sys.argv) > 3 else "en"  # optional language code; defaults to English ("en")
query = get_words(query)
con = sqlite3.connect('database/manifest.db', timeout=60)
@@ -18,7 +19,7 @@ if __name__ == "__main__":
# term(name, idf);
cur.execute("SELECT idf from term where name = ?", (term,))
idf = cur.fetchone()
- if len(idf) > 0:
+ if idf is not None and len(idf) > 0:
idfs[term] = idf[0]
else:
@@ -33,31 +34,21 @@ if __name__ == "__main__":
tfidf = {}
for term in query:
- cur.execute("SELECT document_path, value from tf where term = ? ORDER BY value desc", (term,))
+ cur.execute("""
+ SELECT site.url, tf.value
+ FROM tf
+ JOIN site ON tf.document_path = site.filepath
+ WHERE tf.term = ? AND site.language = ?
+ ORDER BY tf.value DESC
+ """, (term, language))
rows = cur.fetchall()
for row in rows:
if tfidf.get(row[0]) is None:
tfidf[row[0]] = float(row[1]) * idfs[term]
else:
tfidf[row[0]] += float(row[1]) * idfs[term]
-
sorted_results = sorted(tfidf.items(), key=lambda x: x[1], reverse=True)
-
-
-# site(url, filepath, date, indexed)
-
-
- count = 0
- for doc_path, score in sorted_results:
- if count > limit:
- break
- cur.execute("SELECT url from site where filepath = ? ORDER BY date LIMIT 1", (doc_path,))
- rows = cur.fetchall()
- # TODO: Add check for safety
- if len(rows) > 0 and len(rows[0]) > 0:
- url = rows[0][0]
- print(f"{score:.4f} - {url}")
- count += 1
-
+ for url, score in sorted_results[:limit]:
+ print(f"{score:.4f} - {url}")
cur.close()
con.close()
diff --git a/seeds/dictionaries.txt b/seeds/dictionaries.txt
@@ -0,0 +1,4 @@
+https://www.dictionary.com
+https://www.merriam-webster.com
+https://www.oed.com
+https://dictionary.cambridge.org