information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 16e1f7d53b8b85e296cf50efc4a424aea35ceff0
parent 872a6d92228e10ebdd5076e9314e3c06f1b23699
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Tue,  6 Jan 2026 23:51:32 -0600

Add inverted index for titles

Diffstat:
M indexing/page_parsing.py | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/indexing/page_parsing.py b/indexing/page_parsing.py @@ -14,7 +14,7 @@ import tqdm from crawling.spider import get_indexing_db_connection MAX_WORKERS = 5 -BATCH_SIZE = 1000 +BATCH_SIZE = 10 def get_term_list(filepath): @@ -302,6 +302,12 @@ if __name__ == "__main__": ON CONFLICT (term, url) DO UPDATE SET positional_postings = EXCLUDED.positional_postings """ + query_title_term = """ + INSERT INTO title_term (term, url, tf, positional_postings) + VALUES %s + ON CONFLICT (term, url) DO UPDATE SET positional_postings = EXCLUDED.positional_postings + """ + query_page = """ INSERT INTO page (url, language, url_term_count, term_count) VALUES %s @@ -315,6 +321,7 @@ if __name__ == "__main__": term_inserts = [] url_term_inserts = [] + title_term_inserts = [] page_inserts = [] for filepath in remaining: @@ -329,7 +336,13 @@ if __name__ == "__main__": count += 1 url_terms_data = url_word_postings[filepath] + title_terms_data = title_word_postings[filepath] + title_term_count = sum(len(positions) for positions in title_terms_data.values()) url_term_count = sum(len(positions) for positions in url_terms_data.values()) + + for term, positional_postings in title_terms_data.items(): + tf = len(positional_postings) / title_term_count if title_term_count > 0 else 0 + title_term_inserts.append((term, url, tf, positional_postings)) for term, positional_postings in url_terms_data.items(): tf = len(positional_postings) / url_term_count if url_term_count > 0 else 0 @@ -341,6 +354,13 @@ if __name__ == "__main__": execute_values( cursor, + query_title_term, + title_term_inserts, + page_size=100000 + ) + + execute_values( + cursor, query_url_term, url_term_inserts, page_size=100000 @@ -350,7 +370,7 @@ if __name__ == "__main__": cursor, query, term_inserts, - page_size=100000 # TODO: Optimize this. It's kinda quick, but also random. + page_size=100000 # TODO: Optimize this. It's kinda quick, but also random, the number that is. ) execute_values( cursor,