Updated querying - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit 49cfa7a25d876018bef98fba7b0e516820560740
parent 16e1f7d53b8b85e296cf50efc4a424aea35ceff0
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Wed,  7 Jan 2026 00:05:08 -0600

Updated querying

Diffstat:
M indexing/README.md  | 3 ++-
M search/query.py  | 22 +++++++++++++++++++++-

2 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/indexing/README.md b/indexing/README.md
@@ -45,6 +45,7 @@ The indexer reads from the indexing queue and indexes results in said queue.
                              -- should we also add url_count for url counts here?
 - collection(num_documents, average_document_length, average_url_length)
 
+---
 
-TODO: Add tf to document_term table.
 TODO: Add snippets and such to tables to ensure querying works correctly
+TODO: Decide whether to compute separate IDFs for title, URL, and body terms.
diff --git a/search/query.py b/search/query.py
@@ -35,6 +35,19 @@ def get_idf(conn, term):
     idf = math.log((total_documents + 1) / (documents_with_term + 1))
     return idf
 
+def get_title_tfs(conn, term):
+    cursor = conn.cursor()
+    query = """
+        SELECT url, tf FROM title_term
+        WHERE term = %s
+    """
+    cursor.execute(query, (term,))
+    result = cursor.fetchall()
+    result = {res[0]: res[1] for res in result}
+    cursor.close()
+    return result
+
+
 def get_url_tfs(conn, term):
     cursor = conn.cursor()
     query = """
@@ -78,8 +91,15 @@ if __name__ == "__main__":
             tf_idf[url] = tf_idf.get(url, 0) + tfs[url] * idf
         
         url_tfs = get_url_tfs(conn, term_o)
+        title_tfs = get_title_tfs(conn, term_o)
+
         for url in url_tfs:
-            tf_idf[url] = tf_idf.get(url, 0) + 2 * url_tfs[url] * idf
+            tf_idf[url] = tf_idf.get(url, 0) + .25 * url_tfs[url] * idf
+
+        for url in title_tfs:
+            tf_idf[url] = tf_idf.get(url, 0) + .25 * title_tfs[url] * idf
+
+
 
     k = 10

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

M	indexing/README.md	\|	3	++-
M	search/query.py	\|	22	+++++++++++++++++++++-