commit 49cfa7a25d876018bef98fba7b0e516820560740
parent 16e1f7d53b8b85e296cf50efc4a424aea35ceff0
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Wed, 7 Jan 2026 00:05:08 -0600
Updated querying
Diffstat:
2 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/indexing/README.md b/indexing/README.md
@@ -45,6 +45,7 @@ The indexer reads from the indexing queue and indexes results in said queue.
-- should we also add url_count for url counts here?
- collection(num_documents, average_document_length, average_url_length)
+---
-TODO: Add tf to document_term table.
TODO: Add snippets and such to tables to ensure querying works correctly
+TODO: Decide whether to compute separate IDFs for title, URL, and body terms.
diff --git a/search/query.py b/search/query.py
@@ -35,6 +35,19 @@ def get_idf(conn, term):
idf = math.log((total_documents + 1) / (documents_with_term + 1))
return idf
def get_title_tfs(conn, term):
    """Return the title term frequencies stored for ``term``.

    Runs a parameterized SELECT against the ``title_term`` table and maps
    each returned ``url`` to its ``tf`` value.

    Args:
        conn: Connection object; only ``conn.cursor()`` is called on it.
        term: Term to look up. It is passed as a bound query parameter,
            never interpolated into the SQL text.

    Returns:
        A dict mapping url -> tf for every fetched row. Empty when no row
        matches. If the same url is fetched more than once, the last row
        wins.
    """
    cursor = conn.cursor()
    query = """
    SELECT url, tf FROM title_term
    WHERE term = %s
    """
    try:
        cursor.execute(query, (term,))
        rows = cursor.fetchall()
    finally:
        # Release the cursor even when execute()/fetchall() raises, so a
        # failed query does not leak it.
        cursor.close()
    return {row[0]: row[1] for row in rows}
+
+
def get_url_tfs(conn, term):
cursor = conn.cursor()
query = """
@@ -78,8 +91,15 @@ if __name__ == "__main__":
tf_idf[url] = tf_idf.get(url, 0) + tfs[url] * idf
url_tfs = get_url_tfs(conn, term_o)
+ title_tfs = get_title_tfs(conn, term_o)
+
for url in url_tfs:
- tf_idf[url] = tf_idf.get(url, 0) + 2 * url_tfs[url] * idf
+ tf_idf[url] = tf_idf.get(url, 0) + .25 * url_tfs[url] * idf
+
+ for url in title_tfs:
+ tf_idf[url] = tf_idf.get(url, 0) + .25 * title_tfs[url] * idf
+
+
k = 10