utils.py (4541B)
import glob
import sqlite3
import nltk
from nltk.corpus import stopwords
import subprocess
import math
import tqdm
import os
import re

# A token is a maximal run of ASCII letters/digits; every other character
# (punctuation, whitespace, non-ASCII) acts as a separator.
_TOKEN_RE = re.compile(r'[0-9a-zA-Z]+')
# lynx button representation.
_BUTTON_RE = re.compile(r'\(BUTTON\)\s*')
# Lines that start with whitespace followed by "NN-NN/" are dropped from the
# lynx dump (presumably date-stamped boilerplate -- confirm against real dumps).
_DATE_PREFIX_RE = re.compile(r'^\s+\d{2}-\d{2}/')


def _tokenize(text):
    """Return the lower-cased alphanumeric tokens of *text*, in order."""
    return [token.lower() for token in _TOKEN_RE.findall(text)]


# get term frequencies for file given our list of terms (we only care about
# indexed terms which doesn't include certain terms like stop words)
def get_tfs(filepath, terms):
    """Return ``{term: frequency}`` for the indexed terms found in a file.

    Args:
        filepath: Path of the document, rendered through ``get_plaintext``.
        terms: Container of indexed terms (membership-tested per token).

    Returns:
        A dict mapping each indexed term that occurs in the document to its
        count divided by the number of indexed-term occurrences in that
        document. Empty when no indexed term occurs.
    """
    plaintext = get_plaintext(filepath)
    term_counts, total_terms = get_word_counts_in_included_and_total_words(plaintext, terms)
    # term_counts is empty whenever total_terms is 0, so no division by zero.
    return {term: count / total_terms for term, count in term_counts.items()}


# select all terms from manifest.db
def get_terms(db_path):
    """Return the set of all ``name`` values in the ``term`` table.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A set with one entry per distinct term name.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried. The
            connection is closed in that case as well.
    """
    con = sqlite3.connect(db_path, timeout=60)
    try:
        con.execute('PRAGMA journal_mode=WAL')
        cur = con.cursor()
        try:
            cur.execute("SELECT name FROM term")
            return {row[0] for row in cur.fetchall()}
        finally:
            cur.close()
    finally:
        # Always release the handle, even when the query fails.
        con.close()


# use lynx to render the raw html. This will give us exactly what a user
# of the best browser, lynx, would see if they went to the site.
def get_plaintext(filepath):
    """Render an HTML file with ``lynx --dump`` and return the cleaned text.

    The dump is decoded as UTF-8 (undecodable bytes are replaced), every
    ``(BUTTON)`` marker is removed, and lines containing ``REFRESH`` or
    ``file://`` or matching the leading "NN-NN/" pattern are dropped.

    Args:
        filepath: Path of the HTML file handed to lynx.

    Returns:
        The remaining lines joined with ``'\\n'``.

    Note:
        The ``lynx`` executable must be on PATH. Its exit status is not
        checked; only whatever it wrote to stdout is used.
    """
    result = subprocess.run(
        ['lynx', '--dump', '--force_html', '--nolist', filepath],
        capture_output=True,
    )

    plaintext = result.stdout.decode('utf-8', errors='replace')
    plaintext = _BUTTON_RE.sub('', plaintext)

    lines = [line for line in plaintext.splitlines()
             if 'REFRESH' not in line
             and 'file://' not in line
             and not _DATE_PREFIX_RE.match(line)]
    return '\n'.join(lines)


def get_plaintext_words(filepath):
    """Return the set of distinct lower-cased tokens in a rendered file."""
    plaintext = get_plaintext(filepath)
    return get_words(plaintext)


# NOTE: The total number of terms is based on the inclusion list
def get_word_counts_in_included_and_total_words(text, included):
    """Count occurrences of included terms in *text*.

    Args:
        text: Plain text to tokenize.
        included: Container of terms to count (membership-tested per token).

    Returns:
        A ``(counts, total_terms)`` tuple: ``counts`` is a dict mapping each
        included term present in the text to its number of occurrences, and
        ``total_terms`` is the sum of those occurrences (tokens outside
        *included* are not counted at all).
    """
    counts = {}
    total_terms = 0
    for word in _tokenize(text):
        if word in included:
            counts[word] = counts.get(word, 0) + 1
            total_terms += 1
    return counts, total_terms


def get_words(text):
    """Return the set of distinct lower-cased alphanumeric tokens in *text*."""
    return set(_tokenize(text))


def get_filepaths(document_directory):
    """Return every regular file under *document_directory*, recursively.

    Args:
        document_directory: Root directory, with or without a trailing
            path separator.

    Returns:
        A list of file paths, each prefixed with *document_directory*.
        Names starting with a dot are skipped, as usual for ``glob``.
    """
    # A single recursive '**' joined with os.path.join lists each file exactly
    # once. The previous string concatenation with '**/**' reported nested
    # files once per ancestor directory when a trailing separator was given,
    # and matched sibling directories sharing the prefix when it was not.
    pattern = os.path.join(document_directory, '**')
    return [path for path in glob.iglob(pattern, recursive=True) if os.path.isfile(path)]


# lower bound percentage is kinda dangerous.
# it might be useful to, say, search for a hash online, or to look for a last
# name which occurs very, very infrequently.

# also, upper bound percentage might be too strong in a lot of cases. We are
# probably only interested in removing stop words, but even that is difficult
# because what should happen if someone searches the word 'the' in their
# search engine?

# google replies with wikipedia and dictionary stuff for the word 'the', but if
# we don't search for it that could cause issues. to this end, I suspect we only
# care about removing stop words when querying, and only in cases where it
# doesn't result in searching for no words, or in cases where the term does
# actually matter.

# given all of this, it seems right for now to simply index on each token.
# further considerations can be made later, but this seems reasonable,
# especially if we are only indexing large sites.

def get_log_doc_freqs(filepaths, lower_bound_percentage=0, upper_bound_percentage=1, filter_stop_words=False):
    """Compute ``log(N / document_frequency)`` for the words of a corpus.

    Args:
        filepaths: Sized collection of document paths; ``N`` is its length.
        lower_bound_percentage: A word is kept only if it occurs in strictly
            more than ``int(lower_bound_percentage * N)`` documents.
        upper_bound_percentage: A word is kept only if it occurs in strictly
            fewer than ``int(N * upper_bound_percentage)`` documents.
        filter_stop_words: When true, download the NLTK stop-word corpus and
            drop English stop words.

    Returns:
        A dict mapping each retained word to ``math.log(N / count)``, where
        ``count`` is the number of documents containing the word.
    """
    stop_words = set()
    if filter_stop_words:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    # Each document contributes at most 1 per word: get_plaintext_words
    # returns a set.
    doc_counts = {}
    for filepath in tqdm.tqdm(filepaths):
        for word in get_plaintext_words(filepath):
            doc_counts[word] = doc_counts.get(word, 0) + 1

    total_docs = len(filepaths)
    lower_bound = int(lower_bound_percentage * total_docs)
    upper_bound = int(total_docs * upper_bound_percentage)

    # NOTE(review): both bounds are strict, so with the default
    # upper_bound_percentage=1 a word present in every document is dropped
    # (and a one-document corpus yields an empty result). Kept as-is because
    # the set of indexed terms depends on it -- confirm this is intended.
    # random word or stop word?
    return {
        word: math.log(total_docs / count)
        for word, count in doc_counts.items()
        if lower_bound < count < upper_bound and word not in stop_words
    }