information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

query.py (2615B)


      1 from crawling.spider import get_indexing_db_connection
      2 import math
      3 import sys
      4 from indexing.page_parsing import get_words
      5 
      6 # TODO: Should there be separate idf values for url, title, and body?
      7 # I lean towards separate values for body and url, with title treated the same as body.
      8 
      9 def get_idf(conn, term):
     10     cursor = conn.cursor()
     11 
     12     query = """
     13         SELECT document_count FROM term 
     14         WHERE name = %s
     15     """
     16     cursor.execute(query, (term,))
     17 
     18     result = cursor.fetchall()
     19     cursor.close()
     20 
     21 
     22     cursor = conn.cursor()
     23 
     24     query = """
     25         SELECT num_documents FROM collection
     26     """
     27     cursor.execute(query)
     28     total_documents = cursor.fetchall()[0][0]
     29     cursor.close()
     30 
     31     documents_with_term = 0
     32     if len(result) != 0:
     33         documents_with_term  = result[0][0]
     34 
     35     idf = math.log((total_documents + 1) / (documents_with_term + 1))
     36     return idf
     37 
     38 def get_title_tfs(conn, term):
     39     cursor = conn.cursor()
     40     query = """
     41         SELECT url, tf FROM title_term
     42         WHERE term = %s
     43     """
     44     cursor.execute(query, (term,))
     45     result = cursor.fetchall()
     46     result = {res[0]: res[1] for res in result}
     47     cursor.close()
     48     return result
     49 
     50 
     51 def get_url_tfs(conn, term):
     52     cursor = conn.cursor()
     53     query = """
     54         SELECT url, tf FROM url_term
     55         WHERE term = %s
     56     """
     57     cursor.execute(query, (term,))
     58     result = cursor.fetchall()
     59     result = {res[0]: res[1] for res in result}
     60     cursor.close()
     61     return result
     62 
     63 def get_tfs(conn, term):
     64     cursor = conn.cursor()
     65     query = """
     66         SELECT url, tf FROM document_term
     67         WHERE term = %s
     68     """
     69     cursor.execute(query, (term,))
     70     result = cursor.fetchall()
     71 
     72     result = {res[0] : res[1] for res in result}
     73 
     74     cursor.close()
     75     return result
     76 
     77 if __name__ == "__main__":
     78     user_input = sys.argv[1]
     79 
     80     user_input = list(get_words(user_input).keys())
     81 
     82     conn = get_indexing_db_connection()
     83 
     84     tf_idf = {}
     85     
     86     for term_o in user_input:
     87         idf = get_idf(conn, term_o)
     88         
     89         tfs = get_tfs(conn, term_o)
     90         for url in tfs:
     91             tf_idf[url] = tf_idf.get(url, 0) + tfs[url] * idf
     92         
     93         url_tfs = get_url_tfs(conn, term_o)
     94         title_tfs = get_title_tfs(conn, term_o)
     95 
     96         for url in url_tfs:
     97             tf_idf[url] = tf_idf.get(url, 0) + .25 * url_tfs[url] * idf
     98 
     99         for url in title_tfs:
    100             tf_idf[url] = tf_idf.get(url, 0) + .25 * title_tfs[url] * idf
    101 
    102 
    103 
    104     k = 10
    105 
    106     sorted_results = sorted(tf_idf.items(), key=lambda x: x[1], reverse=True)[:k]
    107     for url, score in sorted_results:
    108         print(f"{score:.4f} {url}")