information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 26aaac222ee693bd1d6ec1306ec135468ddb3deb
parent 75a4065b8167979b8484560b5f623a023ba647c4
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Thu,  1 Jan 2026 04:36:31 -0600

Added tf calculation for indexing.

Diffstat:
M indexing/idf.py | 3 ++-
A indexing/tf.py | 38 ++++++++++++++++++++++++++++++++++++++
M indexing/utils.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++----
A metrics/cosine-similarity.py | 26 ++++++++++++++++++++++++++
4 files changed, 115 insertions(+), 5 deletions(-)

diff --git a/indexing/idf.py b/indexing/idf.py @@ -16,6 +16,7 @@ if __name__ == "__main__": for term in log_doc_freq: cur.execute("INSERT INTO term VALUES (?, ?)", (term, log_doc_freq[term])) - con.commit() + con.commit() print(f"All {len(log_doc_freq)} terms inserted into the db.") cur.close() + con.close() diff --git a/indexing/tf.py b/indexing/tf.py @@ -0,0 +1,38 @@ +import sqlite3 +from utils import get_tfs +from utils import get_terms +from utils import get_filepaths +from tqdm import tqdm + +if __name__ == "__main__": + filepaths = get_filepaths('sites') + print(f'Found {len(filepaths)} files') + + # returns a set of all indexed terms + terms = get_terms('database/manifest.db') + + # tfs is a dict: + # term : tf + # tf(document_path, term, value) + + con = sqlite3.connect('database/manifest.db', timeout=60) + con.execute('PRAGMA journal_mode=WAL') + cur = con.cursor() + cur.execute("CREATE TABLE IF NOT EXISTS tf(document_path, term, value)") + cur.execute("CREATE INDEX IF NOT EXISTS idx_tf_document_path ON tf(document_path)") + cur.execute("CREATE INDEX IF NOT EXISTS idx_tf_term ON tf(term)") + cur.execute("DELETE FROM tf") + + # the reason we want a term list is because that gives us a guarantee about having an idf + # considerations can be made for the necessity of this, but I think this is safe for now. + # TODO: Possibly add idf imputation during for searches on terms that don't exist in the term table. 
+ + for i in tqdm(range(0,len(filepaths))): + filepath = filepaths[i] + # this returns the tfs for all terms from terms that exist in the current document (filepath) + tfs = get_tfs(filepath, terms) + for word in tfs: + cur.execute("INSERT INTO tf VALUES (?, ?, ?)", (filepath, word, tfs[word])) + con.commit() + cur.close() + con.close() diff --git a/indexing/utils.py b/indexing/utils.py @@ -1,4 +1,5 @@ import glob +import sqlite3 import nltk from nltk.corpus import stopwords import subprocess @@ -9,6 +10,23 @@ import re nltk.download('stopwords') +# get term frequencies for file given our list of terms (we only care about indexed terms which doesn't include certain terms like stop words) +def get_tfs(filepath, terms): + plaintext = get_plaintext(filepath) + return get_word_counts_in_included(plaintext, terms) + +# select all terms from manifest.db +def get_terms(db_path): + con = sqlite3.connect(db_path, timeout=60) + con.execute('PRAGMA journal_mode=WAL') + cur = con.cursor() + cur.execute("SELECT name FROM term") + terms = {row[0] for row in cur.fetchall()} + cur.close() + con.close() + return terms + + # use lynx to render the raw html. This will give us exactly what a user # of the best browser, lynx, would see if they went to the site. def get_plaintext(filepath): @@ -18,9 +36,7 @@ def get_plaintext(filepath): ) plaintext = result.stdout.decode('utf-8', errors='replace') - - plaintext = re.sub(r'\(BUTTON\)\s*', '', plaintext) - plaintext = re.sub(r'_{3,}', '', plaintext) + plaintext = re.sub(r'\(BUTTON\)\s*', '', plaintext) # lynx button representation. 
lines = [line for line in plaintext.splitlines() if 'REFRESH' not in line @@ -32,6 +48,23 @@ def get_plaintext_words(filepath): plaintext = get_plaintext(filepath) return get_words(plaintext) +def get_word_counts_in_included(text, included): + current = {} + lines = text + lines = re.sub('[^0-9a-zA-Z]+', ' ', lines) + lines = lines.split(' ') + for i in range(0, len(lines)): + if lines[i] != '': + cw = lines[i].lower() + if cw in included: + if cw not in current: + current[cw] = 1 + else: + current[cw] += 1 + return current + + + def get_words(text): current = set() lines = text @@ -49,7 +82,19 @@ def get_filepaths(document_directory): filepaths.append(filepath) return filepaths -def get_log_doc_freqs(filepaths, lower_bound_percentage=.0001, upper_bound_percentage=.85, filter_stop_words=True): +# lower bound percentage is kinda dangerous. +# it might be useful to, say, search for hash online, or to look for a last name which +# occurs very, very, infrequently. + +# also, upper bound percentage might be too strong in a lot of cases. We probably are interested +# in only using stop words, but even that is difficult because what should happen if someone searches the word +# 'the' into their search engine? + +# google replies with wikipedia and dictionary stuff for the word the, but if we don't search for it that could cause issues. to this end, I suspect we only care about removing stop words when querying, and only in cases where it doesn't result in searching for no words, or in cases where the term does actually matter. + +# given all of this, it seems right for now to simply index on each token. further considerations can be made later, but this seems reasonable, especially if we are only indexing large sites. 
+ +def get_log_doc_freqs(filepaths, lower_bound_percentage=0, upper_bound_percentage=0, filter_stop_words=False): stop_words = set() if filter_stop_words: stop_words = set(stopwords.words('english')) diff --git a/metrics/cosine-similarity.py b/metrics/cosine-similarity.py @@ -0,0 +1,26 @@ +import math + +def magnitude(v): + sq = 0 + for i in range(len(v)): + sq += v[i] ** 2 + return math.sqrt(sq) + +def dp(A,B): + result = 0 + for i in range(len(A)): + result += A[i] * B[i] + return result + +def cosine_similarity(A,B): + + dp_AB = dp(A,B) + a_l = magnitude(A) + b_l = magnitude(B) + return dp_AB / (a_l * b_l) + + +if __name__ == "__main__": + A = [0, 4873, 823] + B = [0, 487, 48988] + print(cosine_similarity(A,B))