information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

tf-idf.py (3332B)


      1 import os
      2 import random
      3 import math
      4 import re
      5 import sys
      6 
      7 # get all words
      8 def get_words(filename):
      9     with open(filename, 'r') as f:
     10         lines = f.read()
     11     lines = re.sub('[^0-9a-zA-Z]+', ' ', lines)
     12     lines = lines.split(' ')
     13     final = []
     14     for i in range(0, len(lines)):
     15         if lines[i] != '':
     16             final.append(lines[i].lower())
     17     return final
     18 
     19 def tfs(prefix, filenames, word, to_sample=-1):
     20     tfs = {}
     21 
     22     if to_sample == -1:
     23         for filename in filenames:
     24             tfs[filename] = tf(prefix, filename, word)
     25     else:
     26         for _ in range(0, to_sample):
     27             filename = random.choice(filenames)
     28             tfs[filename] = tf(prefix, filename, word)
     29     return tfs
     30 
     31 def tf(prefix, filename, word):
     32     words = get_words(prefix + filename)
     33     count_of_word = 0
     34     for cw in words:
     35         if cw == word:
     36             count_of_word += 1
     37     if len(words) != 0:
     38         return count_of_word / len(words)
     39     return 0 # empty documents
     40 
     41 
     42 # technically, we might just want the one word and output the value for that
     43 def idf(prefix, filenames):
     44     word_document_frequency = {}
     45     for filename in filenames:
     46         words = get_words(prefix + filename)
     47         for word in words:
     48             if word in word_document_frequency:
     49                 word_document_frequency[word] += 1
     50             else:
     51                 word_document_frequency[word] = 1
     52     idf = word_document_frequency.copy()
     53     for word in idf:
     54         idf[word] = math.log(len(filenames) / idf[word])
     55     return idf
     56 
     57 def idf_word(prefix, filenames, word, to_sample=-1):
     58 
     59     frequency = 0
     60     sampled = 0
     61 
     62     if to_sample == -1:
     63         sampled = len(filenames)
     64         for filename in filenames:
     65             words = get_words(prefix + filename)
     66             if word in words:
     67                 frequency += 1
     68     else:
     69         for _ in range(0,to_sample):
     70             sampled = to_sample
     71             filename = random.choice(filenames)
     72             words = get_words(prefix + filename)
     73             if word in words:
     74                 frequency += 1
     75             if frequency != 0:
     76                 idf = math.log
     77 
     78     if frequency != 0:
     79         idf = math.log(sampled / frequency)
     80     else:
     81         idf = 0.0
     82     return idf
     83 
     84 
     85 
     86 
     87 if __name__ == "__main__":
     88     document_directory = sys.argv[1]
     89     if(document_directory[-1] != '/'):
     90         document_directory += '/'
     91 
     92     user_input = True # continually prompt if the user is in interactive mode
     93     while user_input:
     94         word = ""
     95         top_k = 1
     96         if len(sys.argv) == 4:
     97             word = sys.argv[2]
     98             top_k = int(sys.argv[3])
     99             user_input = False
    100         else:
    101             user_input = True
    102             word = input("Word to find: ")
    103             top_k = int(input("Top k elements to show: "))
    104 
    105         filenames = os.listdir(document_directory)
    106         print('calculating idf')
    107         idf_of_word = idf_word(document_directory, filenames, word, 1000)
    108 
    109         print('calculating tf')
    110         tf_dict = tfs(document_directory, filenames, word, 1000)
    111 
    112         tfidf = {}
    113 
    114         for filename in tf_dict:
    115             tfidf[filename] = idf_of_word * tf_dict[filename]
    116 
    117         sorted_items = sorted(tfidf.items(), key=lambda kv: (kv[1], kv[0]))
    118         sorted_items.reverse()
    119 
    120         for i in range(top_k):
    121             print(sorted_items[i])