query.py (2615B)
"""Rank indexed documents against a query using weighted tf-idf scores.

Run as a script with the query string as the first command-line argument.
The ten highest-scoring URLs are printed one per line as ``<score> <url>``.
"""

import math
import sys
from contextlib import closing

from crawling.spider import get_indexing_db_connection
from indexing.page_parsing import get_words

# TODO: Should there be separate idf values for url, title, and body?
# I lean towards separate for body and url where title is treated the same as body.

# Multipliers applied to each field's tf * idf contribution.
_BODY_WEIGHT = 1
_URL_WEIGHT = .25
_TITLE_WEIGHT = .25

# Number of results printed by the command-line entry point.
_TOP_K = 10

_DOCUMENT_COUNT_QUERY = """
    SELECT document_count FROM term
    WHERE name = %s
"""

_TOTAL_DOCUMENTS_QUERY = """
    SELECT num_documents FROM collection
"""

_TITLE_TF_QUERY = """
    SELECT url, tf FROM title_term
    WHERE term = %s
"""

_URL_TF_QUERY = """
    SELECT url, tf FROM url_term
    WHERE term = %s
"""

_BODY_TF_QUERY = """
    SELECT url, tf FROM document_term
    WHERE term = %s
"""


def _fetch_all(conn, query, params=None):
    """Run *query* on a fresh cursor and return every row.

    The cursor is closed even when ``execute`` or ``fetchall`` raises, so a
    failed query does not leak it.
    """
    with closing(conn.cursor()) as cursor:
        if params is None:
            cursor.execute(query)
        else:
            cursor.execute(query, params)
        return cursor.fetchall()


def _fetch_tf_map(conn, query, term):
    """Return ``{url: tf}`` built from the rows *query* yields for *term*."""
    rows = _fetch_all(conn, query, (term,))
    return {row[0]: row[1] for row in rows}


def get_idf(conn, term):
    """Return the smoothed inverse document frequency of *term*.

    Computed as ``log((N + 1) / (df + 1))`` where ``N`` is
    ``collection.num_documents`` and ``df`` is ``term.document_count`` for
    *term* (0 when the term has no row).

    Args:
        conn: Open database connection providing ``cursor()``.
        term: The term to look up.

    Returns:
        The idf value as a float.

    Raises:
        IndexError: If the ``collection`` table returns no rows.
    """
    term_rows = _fetch_all(conn, _DOCUMENT_COUNT_QUERY, (term,))
    total_documents = _fetch_all(conn, _TOTAL_DOCUMENTS_QUERY)[0][0]

    # A term that was never indexed simply occurs in zero documents.
    documents_with_term = term_rows[0][0] if term_rows else 0

    return math.log((total_documents + 1) / (documents_with_term + 1))


def get_title_tfs(conn, term):
    """Return ``{url: tf}`` for *term* from the ``title_term`` table."""
    return _fetch_tf_map(conn, _TITLE_TF_QUERY, term)


def get_url_tfs(conn, term):
    """Return ``{url: tf}`` for *term* from the ``url_term`` table."""
    return _fetch_tf_map(conn, _URL_TF_QUERY, term)


def get_tfs(conn, term):
    """Return ``{url: tf}`` for *term* from the ``document_term`` table."""
    return _fetch_tf_map(conn, _BODY_TF_QUERY, term)


def _add_weighted(scores, tfs, idf, weight):
    """Add ``weight * tf * idf`` to ``scores[url]`` for every url in *tfs*."""
    for url, tf in tfs.items():
        scores[url] = scores.get(url, 0) + weight * tf * idf


def _score_documents(conn, terms):
    """Return ``{url: score}`` summing weighted tf-idf over all *terms*."""
    scores = {}
    for term in terms:
        idf = get_idf(conn, term)
        # Keep the body -> url -> title order so float sums are reproducible.
        _add_weighted(scores, get_tfs(conn, term), idf, _BODY_WEIGHT)
        _add_weighted(scores, get_url_tfs(conn, term), idf, _URL_WEIGHT)
        _add_weighted(scores, get_title_tfs(conn, term), idf, _TITLE_WEIGHT)
    return scores


def main(argv=None):
    """Score the query given on the command line and print the top results.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. Only the first argument is used.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} <query>")

    # The keys of get_words()'s result are used as the query terms.
    terms = list(get_words(args[0]).keys())

    conn = get_indexing_db_connection()
    try:
        scores = _score_documents(conn, terms)
    finally:
        # Release the connection even if a query fails part-way through.
        conn.close()

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for url, score in ranked[:_TOP_K]:
        print(f"{score:.4f} {url}")


if __name__ == "__main__":
    main()