commit e1c2a8a2d1046eb5cd1f4f7e1a62fe93fcaafd66
parent 2fd9d19d4eb4a93e01e89647fea0cd40de0850e3
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Tue, 6 Jan 2026 13:39:37 -0600
Improving url indexing, starting with title indexing
Diffstat:
4 files changed, 39 insertions(+), 20 deletions(-)
diff --git a/indexing/README.md b/indexing/README.md
@@ -39,9 +39,10 @@ The indexer reads from the indexing queue and indexes results in said queue.
- page(url, language, url_term_count, term_count, last_updated_timestamp) -- currently only supports english
- document_term(term, url, tf, positional_postings)
- url_term(term, url, tf, positional_postings)
-- title_term(term, url, tf, positional_postings)
+- title_term(term, url, tf, positional_postings) -- not in use currently
- link(source, destination)
- term(name, document_count) -- should this be a computed value instead of document_count?
+ -- should we also add url_count for url counts here?
- collection(num_documents, average_document_length, average_url_length)
diff --git a/indexing/page_parsing.py b/indexing/page_parsing.py
@@ -1,4 +1,5 @@
from psycopg2.extras import execute_values
+import lxml.html
import tldextract
from urllib.parse import urlparse
import time
@@ -95,6 +96,8 @@ def prune_documents(conn, filepaths, max_workers):
# Use the existing data to derive term document_count per term
def full_term_update(conn, term):
+ # TODO: Probably want to do url term count here as well for idf later.
+
query = """
SELECT COUNT(url) FROM document_term WHERE term = %s;
"""
@@ -117,18 +120,28 @@ def full_term_update(conn, term):
def get_k_documents(conn, k):
+
+ # Claim at most one pending row (the oldest) per url so a single batch never contains duplicate urls.
+
query = """
- UPDATE indexing_queue
- SET status = 'processing', claimed_at = NOW()
- WHERE id IN(
- SELECT id FROM indexing_queue
- WHERE status = 'pending'
- ORDER BY creation_timestamp asc
- LIMIT %s
- FOR UPDATE SKIP LOCKED
- )
- RETURNING url, filepath;
- """
+ WITH unique_urls AS (
+ SELECT DISTINCT ON (url) id
+ FROM indexing_queue
+ WHERE status = 'pending'
+ ORDER BY url, creation_timestamp ASC
+ ),
+ to_process AS (
+ SELECT id FROM indexing_queue
+ WHERE id IN (SELECT id FROM unique_urls)
+ ORDER BY creation_timestamp ASC
+ LIMIT %s
+ FOR UPDATE SKIP LOCKED
+ )
+ UPDATE indexing_queue
+ SET status = 'processing', claimed_at = NOW()
+ WHERE id IN (SELECT id FROM to_process)
+ RETURNING url, filepath;
+ """
cursor = conn.cursor()
cursor.execute(query, (k,))
@@ -139,7 +152,10 @@ def get_k_documents(conn, k):
filepaths = [res[1] for res in results]
return urls, filepaths
-# TODO: Should I apply stemming to this?
+def get_title_postings(filepath):  # Tokenize the <title> text of the HTML file at filepath with get_words.
+ title = lxml.html.parse(filepath).find('.//title').text  # NOTE(review): find() returns None when there is no <title> (AttributeError here); .text is None for an empty <title>.
+ words = get_words(title)
+ return words  # Returns get_words' output as-is; no positional postings dict is built here.
def get_url_postings(url):
ext = tldextract.extract(url)
@@ -147,7 +163,7 @@ def get_url_postings(url):
parts = [p for p in parts if p and p != 'www']
path = urlparse(url).path
combined = '.'.join(parts) + path
- words = re.sub(r'[^a-zA-Z0-9\s]', ' ', combined).lower().split()
+ words = get_words(combined)
postings = {}
for position, word in enumerate(words):
@@ -263,10 +279,12 @@ if __name__ == "__main__":
document_positional_postings = {}
url_word_postings = {}
+ title_word_postings = {}
for filepath in remaining:
document_positional_postings[filepath] = get_plaintext_words(filepath)
url_word_postings[filepath] = get_url_postings(filepath_lookup[filepath])
+ title_word_postings[filepath] = get_title_postings(filepath)
count = 0
diff --git a/indexing/utils.py b/indexing/utils.py
@@ -32,10 +32,10 @@ def get_words(text):
lines = text
# There are special characters with diacritics we probably still want.
- # only word characters remain
+ # only ASCII letters and digits remain (underscores and non-ASCII letters are dropped too; wanted for url tokens)
# Replaces symbols and such
- lines = re.sub(r"[^\w]+", " ", lines, flags=re.UNICODE)
+ lines = re.sub(r"[^0-9A-Za-z]+", " ", lines)
lines = lines.split(' ')
for i in range(0, len(lines)):
if lines[i] != '':
diff --git a/search/query.py b/search/query.py
@@ -3,6 +3,9 @@ import math
import sys
from indexing.page_parsing import get_words
+# TODO: Should there be separate idf values for url, title, and body?
+# I lean towards separate for body and url where title is treated the same as body.
+
def get_idf(conn, term):
cursor = conn.cursor()
@@ -29,10 +32,7 @@ def get_idf(conn, term):
if len(result) != 0:
documents_with_term = result[0][0]
- if documents_with_term == 0:
- return 0
-
- idf = math.log(total_documents / documents_with_term)
+ idf = math.log((total_documents + 1) / (documents_with_term + 1))
return idf
def get_url_tfs(conn, term):