commit 16e1f7d53b8b85e296cf50efc4a424aea35ceff0
parent 872a6d92228e10ebdd5076e9314e3c06f1b23699
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Tue, 6 Jan 2026 23:51:32 -0600
Add inverted index for titles
Diffstat:
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/indexing/page_parsing.py b/indexing/page_parsing.py
@@ -14,7 +14,7 @@ import tqdm
from crawling.spider import get_indexing_db_connection
MAX_WORKERS = 5
-BATCH_SIZE = 1000
+BATCH_SIZE = 10
def get_term_list(filepath):
@@ -302,6 +302,12 @@ if __name__ == "__main__":
ON CONFLICT (term, url) DO UPDATE SET positional_postings = EXCLUDED.positional_postings
"""
+ query_title_term = """
+ INSERT INTO title_term (term, url, tf, positional_postings)
+ VALUES %s
+ ON CONFLICT (term, url) DO UPDATE SET positional_postings = EXCLUDED.positional_postings
+ """
+
query_page = """
INSERT INTO page (url, language, url_term_count, term_count)
VALUES %s
@@ -315,6 +321,7 @@ if __name__ == "__main__":
term_inserts = []
url_term_inserts = []
+ title_term_inserts = []
page_inserts = []
for filepath in remaining:
@@ -329,7 +336,13 @@ if __name__ == "__main__":
count += 1
url_terms_data = url_word_postings[filepath]
+ title_terms_data = title_word_postings[filepath]
+ title_term_count = sum(len(positions) for positions in title_terms_data.values())
url_term_count = sum(len(positions) for positions in url_terms_data.values())
+
+ for term, positional_postings in title_terms_data.items():
+ tf = len(positional_postings) / title_term_count if title_term_count > 0 else 0
+ title_term_inserts.append((term, url, tf, positional_postings))
for term, positional_postings in url_terms_data.items():
tf = len(positional_postings) / url_term_count if url_term_count > 0 else 0
@@ -341,6 +354,13 @@ if __name__ == "__main__":
execute_values(
cursor,
+ query_title_term,
+ title_term_inserts,
+ page_size=100000
+ )
+
+ execute_values(
+ cursor,
query_url_term,
url_term_inserts,
page_size=100000
@@ -350,7 +370,7 @@ if __name__ == "__main__":
cursor,
query,
term_inserts,
- page_size=100000 # TODO: Optimize this. It's kinda quick, but also random.
+ page_size=100000 # TODO: Optimize this. It's fairly quick, but the page size value was picked arbitrarily.
)
execute_values(
cursor,