query.py (1588B)
"""Rank indexed pages against a free-text query using TF-IDF scores.

Usage:
    python query.py "<query>" <limit> <language>

The script reads three tables from ``database/manifest.db``:

* ``term(name, idf)``               -- one inverse-document-frequency per term
* ``tf(document_path, term, value)`` -- term frequency per (document, term)
* ``site`` (``url``, ``filepath``, ``language`` are the columns used here)

For every query term it multiplies each matching document's ``tf.value`` by
the term's ``idf`` and sums the products per URL, then prints the ``limit``
highest-scoring URLs as ``"<score> - <url>"``.
"""
import sqlite3
import sys
from contextlib import closing

from indexing.utils import get_words

DATABASE_PATH = 'database/manifest.db'

USAGE = 'usage: query.py "<query>" <limit> <language>'

# The ORDER BY is kept even though results are re-sorted by total score:
# rows are inserted into the score dict in this order and the final sort is
# stable, so it influences how equal-scored URLs are ordered in the output.
TF_QUERY = """
            SELECT site.url, tf.value
            FROM tf
            JOIN site ON tf.document_path = site.filepath
            WHERE tf.term = ? AND site.language = ?
            ORDER BY tf.value DESC
            """


def load_idfs(cur, terms):
    """Return a ``{term: idf}`` mapping for every term in *terms*.

    Terms that have no row in the ``term`` table (or whose stored idf is
    NULL) get a weight of 0 and a diagnostic is written to stderr, so that
    stdout carries only the ranked results.
    """
    idfs = {}
    for term in terms:
        cur.execute("SELECT idf from term where name = ?", (term,))
        row = cur.fetchone()
        if row is not None and row[0] is not None:
            idfs[term] = row[0]
        else:
            # Open question carried over from the original author: an unseen
            # term is arguably highly informative, so a large weight might be
            # more appropriate than 0. Left at 0 to keep ranking unchanged.
            idfs[term] = 0
            print("Couldn't find idf...", file=sys.stderr)
    return idfs


def score_documents(cur, terms, idfs, language):
    """Return ``{url: score}`` where score is the sum of tf * idf over *terms*.

    Only documents whose ``site.language`` equals *language* are considered.
    A term that occurs more than once in *terms* contributes once per
    occurrence, exactly as it is listed.
    """
    scores = {}
    for term in terms:
        weight = idfs[term]
        cur.execute(TF_QUERY, (term, language))
        for url, tf_value in cur.fetchall():
            scores[url] = scores.get(url, 0.0) + float(tf_value) * weight
    return scores


def main(argv):
    """Run the query described by *argv* and print the ranked results.

    *argv* has the shape of ``sys.argv``: program name, query string, result
    limit, language code (e.g. ``en``). Extra arguments are ignored.

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    if len(argv) < 4:
        print(USAGE, file=sys.stderr)
        return 2

    raw_query, raw_limit, language = argv[1], argv[2], argv[3]

    try:
        limit = int(raw_limit)
    except ValueError:
        print(f"limit must be an integer, got {raw_limit!r}", file=sys.stderr)
        print(USAGE, file=sys.stderr)
        return 2
    if limit < 0:
        # A negative slice bound would silently drop results from the tail
        # instead of limiting the output, so reject it up front.
        print(f"limit must be non-negative, got {limit}", file=sys.stderr)
        return 2

    # get_words comes from the project's indexing package; presumably it
    # splits/normalizes the query the same way documents were indexed.
    terms = get_words(raw_query)

    # closing() guarantees the cursor and connection are released even when a
    # query raises; sqlite3's own context manager only manages transactions.
    with closing(sqlite3.connect(DATABASE_PATH, timeout=60)) as con:
        with closing(con.cursor()) as cur:
            idfs = load_idfs(cur, terms)
            scores = score_documents(cur, terms, idfs, language)

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for url, score in ranked[:limit]:
        print(f"{score:.4f} - {url}")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))