Improving url indexing, starting with title indexing - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit e1c2a8a2d1046eb5cd1f4f7e1a62fe93fcaafd66
parent 2fd9d19d4eb4a93e01e89647fea0cd40de0850e3
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Tue,  6 Jan 2026 13:39:37 -0600

Improving url indexing, starting with title indexing

Diffstat:
M indexing/README.md  | 3 ++-
M indexing/page_parsing.py  | 44 +++++++++++++++++++++++++++++++-------------
M indexing/utils.py  | 4 ++--
M search/query.py  | 8 ++++----

4 files changed, 39 insertions(+), 20 deletions(-)
diff --git a/indexing/README.md b/indexing/README.md
@@ -39,9 +39,10 @@ The indexer reads from the indexing queue and indexes results in said queue.
 - page(url, language, url_term_count, term_count, last_updated_timestamp) -- currently only supports english
 - document_term(term, url, tf, positional_postings)
 - url_term(term, url, tf, positional_postings)
-- title_term(term, url, tf, positional_postings)
+- title_term(term, url, tf, positional_postings) -- not in use currently
 - link(source, destination)
 - term(name, document_count) -- should this be a computed value instead of document_count?
+                             -- should we also add url_count for url counts here?
 - collection(num_documents, average_document_length, average_url_length)
 
 
diff --git a/indexing/page_parsing.py b/indexing/page_parsing.py
@@ -1,4 +1,5 @@
 from psycopg2.extras import execute_values
+import lxml.html
 import tldextract
 from urllib.parse import urlparse
 import time
@@ -95,6 +96,8 @@ def prune_documents(conn, filepaths, max_workers):
 # Use the existing data to derive term document_count per term
 def full_term_update(conn, term):
 
+    # TODO: Probably want to do url term count here as well for idf later.
+
     query = """
         SELECT COUNT(url) FROM document_term WHERE term = %s;
     """
@@ -117,18 +120,28 @@ def full_term_update(conn, term):
 
 
 def get_k_documents(conn, k):
+
+    # Duplicate urls in the same batch can be painful.
+
     query = """
-                UPDATE indexing_queue
-                SET status = 'processing', claimed_at = NOW()
-                WHERE id IN(
-                    SELECT id FROM indexing_queue
-                    WHERE status = 'pending'
-                    ORDER BY creation_timestamp asc
-                    LIMIT %s
-                    FOR UPDATE SKIP LOCKED
-                )
-                RETURNING url, filepath;
-            """
+        WITH unique_urls AS (
+            SELECT DISTINCT ON (url) id
+            FROM indexing_queue
+            WHERE status = 'pending'
+            ORDER BY url, creation_timestamp ASC
+        ),
+        to_process AS (
+            SELECT id FROM indexing_queue
+            WHERE id IN (SELECT id FROM unique_urls)
+            ORDER BY creation_timestamp ASC
+            LIMIT %s
+            FOR UPDATE SKIP LOCKED
+        )
+        UPDATE indexing_queue
+        SET status = 'processing', claimed_at = NOW()
+        WHERE id IN (SELECT id FROM to_process)
+        RETURNING url, filepath;
+    """
 
     cursor = conn.cursor()
     cursor.execute(query, (k,))
@@ -139,7 +152,10 @@ def get_k_documents(conn, k):
     filepaths  = [res[1] for res in results]
     return urls, filepaths
 
-# TODO: Should I apply stemming to this?
+def get_title_postings(filepath):
+    title = lxml.html.parse(filepath).find('.//title').text
+    words = get_words(title)
+    return words
 
 def get_url_postings(url):
     ext = tldextract.extract(url)
@@ -147,7 +163,7 @@ def get_url_postings(url):
     parts = [p for p in parts if p and p != 'www']
     path = urlparse(url).path
     combined = '.'.join(parts) + path
-    words = re.sub(r'[^a-zA-Z0-9\s]', ' ', combined).lower().split()
+    words = get_words(combined)
 
     postings = {}
     for position, word in enumerate(words):
@@ -263,10 +279,12 @@ if __name__ == "__main__":
 
         document_positional_postings = {}
         url_word_postings = {}
+        title_word_postings = {}
 
         for filepath in remaining:
             document_positional_postings[filepath] = get_plaintext_words(filepath)
             url_word_postings[filepath] = get_url_postings(filepath_lookup[filepath])
+            title_word_postings[filepath] = get_title_postings(filepath)
 
         count = 0
 
diff --git a/indexing/utils.py b/indexing/utils.py
@@ -32,10 +32,10 @@ def get_words(text):
     lines = text
     
     # There are special characters with diacritics we probably still want.
-    # only word characters remain
+    # only ASCII letters and digits remain (underscores are stripped too, which I wanted for URL handling)
     # Replaces symbols and such
 
-    lines = re.sub(r"[^\w]+", " ", lines, flags=re.UNICODE)
+    lines = re.sub(r"[^0-9A-Za-z]+", " ", lines)
     lines = lines.split(' ')
     for i in range(0, len(lines)):
         if lines[i] != '':
diff --git a/search/query.py b/search/query.py
@@ -3,6 +3,9 @@ import math
 import sys
 from indexing.page_parsing import get_words
 
+# TODO: Should there be separate idf values for url, title, and body?
+# I lean towards separate values for body and url, with title treated the same as body.
+
 def get_idf(conn, term):
     cursor = conn.cursor()
 
@@ -29,10 +32,7 @@ def get_idf(conn, term):
     if len(result) != 0:
         documents_with_term  = result[0][0]
 
-    if documents_with_term == 0:
-        return 0
-
-    idf = math.log(total_documents / documents_with_term)
+    idf = math.log((total_documents + 1) / (documents_with_term + 1))
     return idf
 
 def get_url_tfs(conn, term):

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

M	indexing/README.md	\|	3	++-
M	indexing/page_parsing.py	\|	44	+++++++++++++++++++++++++++++++-------------
M	indexing/utils.py	\|	4	++--
M	search/query.py	\|	8	++++----