tf-idf.py (3332B)
import os
import random
import math
import re
import sys

# One token is a maximal run of ASCII letters and digits.
_WORD_RE = re.compile(r'[0-9a-zA-Z]+')


def get_words(filename):
    """Return all words of a file, lowercased, in order of appearance.

    A word is a maximal run of ASCII letters/digits; every other character
    acts as a separator.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of lowercase tokens (empty for a file with no tokens).
    """
    with open(filename, 'r') as f:
        text = f.read()
    # Lowercase each token after matching so that only ASCII tokens are
    # affected, exactly as if separators were stripped first.
    return [token.lower() for token in _WORD_RE.findall(text)]


def tfs(prefix, filenames, word, to_sample=-1):
    """Compute the term frequency of ``word`` for several documents.

    Args:
        prefix: String prepended to each filename to build its path.
        filenames: Sequence of document names.
        word: Token to look for (compared against lowercased tokens).
        to_sample: ``-1`` to score every document; otherwise the number of
            random draws (with replacement) taken from ``filenames``.

    Returns:
        A dict mapping each scored filename to its term frequency.
    """
    scores = {}

    if to_sample == -1:
        for filename in filenames:
            scores[filename] = tf(prefix, filename, word)
        return scores

    # random.choice raises IndexError on an empty sequence; there is
    # nothing to sample, so the result is simply empty.
    if not filenames:
        return scores

    for _ in range(to_sample):
        filename = random.choice(filenames)
        # Draws are with replacement: skip re-reading a file already scored.
        if filename not in scores:
            scores[filename] = tf(prefix, filename, word)
    return scores


def tf(prefix, filename, word):
    """Return the fraction of tokens in one document that equal ``word``.

    Args:
        prefix: String prepended to ``filename`` to build its path.
        filename: Document name.
        word: Token to count.

    Returns:
        ``occurrences / total tokens``, or ``0`` for a document without
        tokens.
    """
    words = get_words(prefix + filename)
    if not words:
        return 0  # empty documents
    return words.count(word) / len(words)


def idf(prefix, filenames):
    """Compute the inverse document frequency of every word in the corpus.

    Technically, callers might just want the value for a single word; see
    ``idf_word`` for that.

    Args:
        prefix: String prepended to each filename to build its path.
        filenames: Sequence of document names.

    Returns:
        A dict mapping each word to ``log(number of documents / number of
        documents containing the word)``.
    """
    word_document_frequency = {}
    for filename in filenames:
        # Use a set so that each document contributes at most once per
        # word; counting every occurrence would inflate the document
        # frequency and could produce negative IDF values.
        for word in set(get_words(prefix + filename)):
            word_document_frequency[word] = word_document_frequency.get(word, 0) + 1

    document_count = len(filenames)
    return {
        word: math.log(document_count / frequency)
        for word, frequency in word_document_frequency.items()
    }


def idf_word(prefix, filenames, word, to_sample=-1):
    """Compute the inverse document frequency of a single word.

    Args:
        prefix: String prepended to each filename to build its path.
        filenames: Sequence of document names.
        word: Token to look for (compared against lowercased tokens).
        to_sample: ``-1`` to inspect every document; otherwise the number of
            random draws (with replacement) taken from ``filenames``.

    Returns:
        ``log(documents inspected / documents containing the word)``, or
        ``0.0`` when no inspected document contains the word.
    """
    frequency = 0
    sampled = 0

    if to_sample == -1:
        sampled = len(filenames)
        for filename in filenames:
            if word in get_words(prefix + filename):
                frequency += 1
    elif filenames:  # random.choice would raise on an empty sequence
        sampled = to_sample
        contains_word = {}  # per-file result, so repeated draws skip the re-read
        for _ in range(to_sample):
            filename = random.choice(filenames)
            if filename not in contains_word:
                contains_word[filename] = word in get_words(prefix + filename)
            if contains_word[filename]:
                frequency += 1

    if frequency == 0:
        return 0.0
    return math.log(sampled / frequency)


if __name__ == "__main__":
    # Usage: tf-idf.py DIRECTORY [WORD TOP_K]
    # With WORD and TOP_K the script runs once; otherwise it keeps
    # prompting for a word and a result count.
    if len(sys.argv) < 2 or not sys.argv[1]:
        sys.exit("usage: tf-idf.py DIRECTORY [WORD TOP_K]")

    document_directory = sys.argv[1]
    # The helper functions build paths as prefix + filename, so the
    # directory must end with a separator.
    if not document_directory.endswith('/'):
        document_directory += '/'

    user_input = True  # continually prompt if the user is in interactive mode
    while user_input:
        if len(sys.argv) == 4:
            word = sys.argv[2]
            top_k = int(sys.argv[3])
            user_input = False
        else:
            word = input("Word to find: ")
            top_k = int(input("Top k elements to show: "))

        # Document tokens are lowercased when read, so the query must be
        # lowercased too or a capitalised query could never match.
        word = word.lower()

        # Keep regular files only: os.listdir also returns sub-directories,
        # which cannot be opened as documents.
        filenames = [
            name for name in os.listdir(document_directory)
            if os.path.isfile(document_directory + name)
        ]
        if not filenames:
            # Nothing to sample from; report it and go on to the next query.
            print('no documents found in ' + document_directory)
            continue

        print('calculating idf')
        idf_of_word = idf_word(document_directory, filenames, word, 1000)

        print('calculating tf')
        tf_dict = tfs(document_directory, filenames, word, 1000)

        tfidf = {
            filename: idf_of_word * term_frequency
            for filename, term_frequency in tf_dict.items()
        }

        # Highest score first; ties are broken by filename, descending.
        sorted_items = sorted(
            tfidf.items(), key=lambda kv: (kv[1], kv[0]), reverse=True
        )

        # Slice rather than index so that asking for more results than
        # there are scored documents prints what is available.
        for item in sorted_items[:max(top_k, 0)]:
            print(item)