information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 49cfa7a25d876018bef98fba7b0e516820560740
parent 16e1f7d53b8b85e296cf50efc4a424aea35ceff0
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Wed,  7 Jan 2026 00:05:08 -0600

Updated querying

Diffstat:
M indexing/README.md |  3 ++-
M search/query.py    | 22 +++++++++++++++++++++-
2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/indexing/README.md b/indexing/README.md @@ -45,6 +45,7 @@ The indexer reads from the indexing queue and indexes results in said queue. -- should we also add url_count for url counts here? - collection(num_documents, average_document_length, average_url_length) +--- -TODO: Add tf to document_term table. TODO: Add snippets and such to tables to ensure querying works correctly +TODO: (should I) compute IDFs for title vs url vs body (?) diff --git a/search/query.py b/search/query.py @@ -35,6 +35,19 @@ def get_idf(conn, term): idf = math.log((total_documents + 1) / (documents_with_term + 1)) return idf +def get_title_tfs(conn, term): + cursor = conn.cursor() + query = """ + SELECT url, tf FROM title_term + WHERE term = %s + """ + cursor.execute(query, (term,)) + result = cursor.fetchall() + result = {res[0]: res[1] for res in result} + cursor.close() + return result + + def get_url_tfs(conn, term): cursor = conn.cursor() query = """ @@ -78,8 +91,15 @@ if __name__ == "__main__": tf_idf[url] = tf_idf.get(url, 0) + tfs[url] * idf url_tfs = get_url_tfs(conn, term_o) + title_tfs = get_title_tfs(conn, term_o) + for url in url_tfs: - tf_idf[url] = tf_idf.get(url, 0) + 2 * url_tfs[url] * idf + tf_idf[url] = tf_idf.get(url, 0) + .25 * url_tfs[url] * idf + + for url in title_tfs: + tf_idf[url] = tf_idf.get(url, 0) + .25 * title_tfs[url] * idf + + k = 10