information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

tf.py (1692B)


      1 import sqlite3
      2 from utils import get_tfs
      3 from utils import get_terms
      4 from utils import get_filepaths
      5 from tqdm import tqdm
      6 
      7 if __name__ == "__main__":
      8     filepaths = get_filepaths('sites')
      9     print(f'Found {len(filepaths)} files')
     10 
     11     # returns a set of all indexed terms
     12     terms = get_terms('database/manifest.db')
     13     print(f'fetched {len(terms)} terms')
     14 
     15     # tfs is a dict:
     16     # term : tf
     17     # tf(document_path, term, value)
     18     
     19     # having the one db makes the deletion very slow. 
     20     con = sqlite3.connect('database/manifest.db', timeout=60)
     21     con.execute('PRAGMA journal_mode=WAL')
     22     cur = con.cursor()
     23     cur.execute("CREATE TABLE IF NOT EXISTS tf(document_path, term, value)")
     24     cur.execute("CREATE INDEX IF NOT EXISTS idx_tf_document_path ON tf(document_path)")
     25     cur.execute("CREATE INDEX IF NOT EXISTS idx_tf_term ON tf(term)")
     26     cur.execute("DELETE FROM tf")
     27 
     28     # the reason we want a term list is because that gives us a guarantee about having an idf.
     29     # considerations can be made for the necessity of this, but I think this is safe for now.
     30     # TODO: Possibly add idf imputation during for searches on terms that don't exist in the term table.
     31 
     32 
     33     # TODO: update indexed status for sites and only use that status for indexing, not every file with get_filepaths
     34 
     35     for i in tqdm(range(0,len(filepaths))):
     36         filepath = filepaths[i]
     37         # this returns the tfs for all terms from terms that exist in the current document (filepath)
     38         tfs = get_tfs(filepath, terms)
     39         for word in tfs:
     40             cur.execute("INSERT INTO tf VALUES (?, ?, ?)", (filepath, word, tfs[word]))
     41         con.commit()
     42     cur.close()
     43     con.close()