tf.py (1692B)
"""Rebuild the ``tf`` (term frequency) table of the manifest database.

Run as a script. For every file reported by ``get_filepaths('sites')`` the
term frequencies returned by ``get_tfs`` are stored as
``(document_path, term, value)`` rows in the ``tf`` table.
"""
import sqlite3
from contextlib import closing

from tqdm import tqdm

from utils import get_filepaths
from utils import get_terms
from utils import get_tfs

# Single database that holds both the term list and the tf table.
DB_PATH = 'database/manifest.db'


def main():
    """Clear the ``tf`` table and refill it from the files under ``sites``."""
    filepaths = get_filepaths('sites')
    print(f'Found {len(filepaths)} files')

    # get_terms returns a set of all indexed terms.
    terms = get_terms(DB_PATH)
    print(f'fetched {len(terms)} terms')

    # The reason we want a term list is that it gives us a guarantee about
    # having an idf for every stored term. Considerations can be made for the
    # necessity of this, but I think this is safe for now.
    # TODO: Possibly add idf imputation for searches on terms that don't
    #       exist in the term table.
    # TODO: update indexed status for sites and only use that status for
    #       indexing, not every file with get_filepaths.

    # Having the one db makes the deletion very slow.
    # closing() guarantees the cursor and then the connection are closed even
    # when get_tfs or an INSERT raises part-way through the run.
    with closing(sqlite3.connect(DB_PATH, timeout=60)) as con:
        con.execute('PRAGMA journal_mode=WAL')
        with closing(con.cursor()) as cur:
            cur.execute("CREATE TABLE IF NOT EXISTS tf(document_path, term, value)")
            cur.execute("CREATE INDEX IF NOT EXISTS idx_tf_document_path ON tf(document_path)")
            cur.execute("CREATE INDEX IF NOT EXISTS idx_tf_term ON tf(term)")
            # NOTE(review): under the sqlite3 module's default transaction
            # handling this DELETE only becomes durable with the first commit
            # below; with zero files it is rolled back on close — confirm
            # that this is the intended behaviour.
            cur.execute("DELETE FROM tf")

            for filepath in tqdm(filepaths):
                # tfs maps term -> tf for every term from `terms` that occurs
                # in the current document (filepath).
                tfs = get_tfs(filepath, terms)
                cur.executemany(
                    "INSERT INTO tf VALUES (?, ?, ?)",
                    [(filepath, word, tfs[word]) for word in tfs],
                )
                # Commit once per document so finished documents stay stored
                # if a later one fails.
                con.commit()


if __name__ == "__main__":
    main()