information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

query.py (1588B)


      1 import sqlite3
      2 import sys
      3 from indexing.utils import get_words
      4 
      5 if __name__ == "__main__":
      6 
      7     query = sys.argv[1]
      8     limit = int(sys.argv[2])
      9     language = sys.argv[3] # probably want en for english
     10 
     11     query = get_words(query)
     12     con = sqlite3.connect('database/manifest.db', timeout=60)
     13     #tf(document_path, term, value)
     14     cur = con.cursor()
     15 
     16     idfs = {}
     17 
     18     for term in query:
     19         # term(name, idf);
     20         cur.execute("SELECT idf from term where name = ?", (term,))
     21         idf = cur.fetchone()
     22         if idf is not None and len(idf) > 0:
     23             idfs[term] = idf[0]
     24         else:
     25 
     26             # Should this be a really large value????
     27             # there is a lot of information in a value that doesn't have an idf, unless we messed
     28             # something up....
     29 
     30             idfs[term] = 0
     31             print("Couldn't find idf...")
     32 
     33 
     34     tfidf = {}
     35 
     36     for term in query:
     37         cur.execute("""
     38             SELECT site.url, tf.value
     39             FROM tf
     40             JOIN site ON tf.document_path = site.filepath
     41             WHERE tf.term = ? AND site.language = ?
     42             ORDER BY tf.value DESC
     43         """, (term, language))
     44         rows = cur.fetchall()
     45         for row in rows:
     46             if tfidf.get(row[0]) is None:
     47                 tfidf[row[0]] = float(row[1]) * idfs[term]
     48             else:
     49                 tfidf[row[0]] += float(row[1]) * idfs[term]
     50     sorted_results = sorted(tfidf.items(), key=lambda x: x[1], reverse=True)
     51     for url, score in sorted_results[:limit]:
     52         print(f"{score:.4f} - {url}")
     53     cur.close()
     54     con.close()