information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit e1c2a8a2d1046eb5cd1f4f7e1a62fe93fcaafd66
parent 2fd9d19d4eb4a93e01e89647fea0cd40de0850e3
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Tue,  6 Jan 2026 13:39:37 -0600

Improving url indexing, starting with title indexing

Diffstat:
M indexing/README.md | 3 ++-
M indexing/page_parsing.py | 44 +++++++++++++++++++++++++++++++-------------
M indexing/utils.py | 4 ++--
M search/query.py | 8 ++++----
4 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/indexing/README.md b/indexing/README.md @@ -39,9 +39,10 @@ The indexer reads from the indexing queue and indexes results in said queue. - page(url, language, url_term_count, term_count, last_updated_timestamp) -- currently only supports english - document_term(term, url, tf, positional_postings) - url_term(term, url, tf, positional_postings) -- title_term(term, url, tf, positional_postings) +- title_term(term, url, tf, positional_postings) -- not in use currently - link(source, destination) - term(name, document_count) -- should this be a computed value instead of document_count? + -- should we also add url_count for url counts here? - collection(num_documents, average_document_length, average_url_length) diff --git a/indexing/page_parsing.py b/indexing/page_parsing.py @@ -1,4 +1,5 @@ from psycopg2.extras import execute_values +import lxml.html import tldextract from urllib.parse import urlparse import time @@ -95,6 +96,8 @@ def prune_documents(conn, filepaths, max_workers): # Use the existing data to derive term document_count per term def full_term_update(conn, term): + # TODO: Probably want to do url term count here as well for idf later. + query = """ SELECT COUNT(url) FROM document_term WHERE term = %s; """ @@ -117,18 +120,28 @@ def full_term_update(conn, term): def get_k_documents(conn, k): + + # Duplicate urls in the same batch can be painful. 
+ query = """ - UPDATE indexing_queue - SET status = 'processing', claimed_at = NOW() - WHERE id IN( - SELECT id FROM indexing_queue - WHERE status = 'pending' - ORDER BY creation_timestamp asc - LIMIT %s - FOR UPDATE SKIP LOCKED - ) - RETURNING url, filepath; - """ + WITH unique_urls AS ( + SELECT DISTINCT ON (url) id + FROM indexing_queue + WHERE status = 'pending' + ORDER BY url, creation_timestamp ASC + ), + to_process AS ( + SELECT id FROM indexing_queue + WHERE id IN (SELECT id FROM unique_urls) + ORDER BY creation_timestamp ASC + LIMIT %s + FOR UPDATE SKIP LOCKED + ) + UPDATE indexing_queue + SET status = 'processing', claimed_at = NOW() + WHERE id IN (SELECT id FROM to_process) + RETURNING url, filepath; + """ cursor = conn.cursor() cursor.execute(query, (k,)) @@ -139,7 +152,10 @@ def get_k_documents(conn, k): filepaths = [res[1] for res in results] return urls, filepaths -# TODO: Should I apply stemming to this? +def get_title_postings(filepath): + title = lxml.html.parse(filepath).find('.//title').text + words = get_words(title) + return words def get_url_postings(url): ext = tldextract.extract(url) @@ -147,7 +163,7 @@ def get_url_postings(url): parts = [p for p in parts if p and p != 'www'] path = urlparse(url).path combined = '.'.join(parts) + path - words = re.sub(r'[^a-zA-Z0-9\s]', ' ', combined).lower().split() + words = get_words(combined) postings = {} for position, word in enumerate(words): @@ -263,10 +279,12 @@ if __name__ == "__main__": document_positional_postings = {} url_word_postings = {} + title_word_postings = {} for filepath in remaining: document_positional_postings[filepath] = get_plaintext_words(filepath) url_word_postings[filepath] = get_url_postings(filepath_lookup[filepath]) + title_word_postings[filepath] = get_title_postings(filepath) count = 0 diff --git a/indexing/utils.py b/indexing/utils.py @@ -32,10 +32,10 @@ def get_words(text): lines = text # There are special characters with diacritics we probably still want. 
- # only word characters remain + # only word characters remain (minus underscores because some url stuff I wanted) # Replaces symbols and such - lines = re.sub(r"[^\w]+", " ", lines, flags=re.UNICODE) + lines = re.sub(r"[^0-9A-Za-z]+", " ", lines) lines = lines.split(' ') for i in range(0, len(lines)): if lines[i] != '': diff --git a/search/query.py b/search/query.py @@ -3,6 +3,9 @@ import math import sys from indexing.page_parsing import get_words +# TODO: Should there be seperate idf values for url, title, and body +# I lean towards separate for body and url where title is treated the same as body. + def get_idf(conn, term): cursor = conn.cursor() @@ -29,10 +32,7 @@ def get_idf(conn, term): if len(result) != 0: documents_with_term = result[0][0] - if documents_with_term == 0: - return 0 - - idf = math.log(total_documents / documents_with_term) + idf = math.log((total_documents + 1) / (documents_with_term + 1)) return idf def get_url_tfs(conn, term):