utils.py - information-retrieval - Exploration of information retrieval topics

utils.py (4541B)
      1 import glob
      2 import sqlite3
      3 import nltk
      4 from nltk.corpus import stopwords
      5 import subprocess
      6 import math
      7 import tqdm
      8 import os
      9 import re
     10 
     11 
     12 # get term frequencies for file given our list of terms (we only care about indexed terms which doesn't include certain terms like stop words)
     13 def get_tfs(filepath, terms):
     14     plaintext = get_plaintext(filepath)
     15     term_counts, total_terms = get_word_counts_in_included_and_total_words(plaintext, terms)
     16     tfs = {}
     17     for term in term_counts:
     18         tfs[term] = term_counts[term] / total_terms
     19     return tfs
     20 
     21 # select all terms from manifest.db
     22 def get_terms(db_path):
     23     con = sqlite3.connect(db_path, timeout=60)
     24     con.execute('PRAGMA journal_mode=WAL')
     25     cur = con.cursor()
     26     cur.execute("SELECT name FROM term")
     27     terms = {row[0] for row in cur.fetchall()}
     28     cur.close()
     29     con.close()
     30     return terms
     31 
     32 
     33 # use lynx to render the raw html. This will give us exactly what a user
     34 # of the best browser, lynx, would see if they went to the site.
     35 def get_plaintext(filepath):
     36     result = subprocess.run(
     37         ['lynx', '--dump', '--force_html', '--nolist', filepath],
     38         capture_output=True,
     39     )
     40 
     41     plaintext = result.stdout.decode('utf-8', errors='replace')
     42     plaintext = re.sub(r'\(BUTTON\)\s*', '', plaintext) # lynx button representation.
     43 
     44     lines = [line for line in plaintext.splitlines() 
     45              if 'REFRESH' not in line 
     46              and 'file://' not in line
     47              and not re.match(r'^\s+\d{2}-\d{2}/', line)] 
     48     return '\n'.join(lines)
     49 
     50 def get_plaintext_words(filepath):
     51     plaintext = get_plaintext(filepath)
     52     return get_words(plaintext)
     53 
     54 # NOTE: The total number of terms is based on the inclusion list
     55 def get_word_counts_in_included_and_total_words(text, included):
     56     total_terms = 0
     57     current = {}
     58     lines = text
     59     lines = re.sub('[^0-9a-zA-Z]+', ' ', lines)
     60     lines = lines.split(' ')
     61     for i in range(0, len(lines)):
     62         if lines[i] != '':
     63             cw = lines[i].lower()
     64             if cw in included:
     65                 if cw not in current:
     66                     current[cw] = 1
     67                 else:
     68                     current[cw] += 1
     69                 total_terms += 1
     70     return current, total_terms
     71 
     72 
     73 
     74 def get_words(text):
     75     current = set()
     76     lines = text
     77     lines = re.sub('[^0-9a-zA-Z]+', ' ', lines)
     78     lines = lines.split(' ')
     79     for i in range(0, len(lines)):
     80         if lines[i] != '':
     81             current.add(lines[i].lower())
     82     return current
     83 
     84 def get_filepaths(document_directory):
     85     filepaths = []
     86     for filepath in glob.iglob(document_directory + '**/**', recursive=True):
     87         if os.path.isfile(filepath):
     88              filepaths.append(filepath)
     89     return filepaths
     90 
# lower bound percentage is kinda dangerous.
# it might be useful to, say, search for a hash online, or to look for a last name which
# occurs very, very infrequently.
     94 
# also, upper bound percentage might be too strong in a lot of cases. We are probably interested
# in only filtering stop words, but even that is difficult because what should happen if someone searches for the word
# 'the' in their search engine?
     98 
# Google replies with Wikipedia and dictionary results for the word 'the', but if we don't search for it that could cause issues. To this end, I suspect we only care about removing stop words when querying, and only when doing so neither leaves the query with no words nor drops a term that actually matters.
    100 
# Given all of this, it seems right for now to simply index every token. Further considerations can be made later, but this seems reasonable, especially if we are only indexing large sites.
    102 
    103 def get_log_doc_freqs(filepaths, lower_bound_percentage=0, upper_bound_percentage=1, filter_stop_words=False):
    104     stop_words = set()
    105     if filter_stop_words:
    106         nltk.download('stopwords')
    107         stop_words = set(stopwords.words('english'))
    108 
    109     current = {}
    110     for i in tqdm.tqdm(range(len(filepaths))):
    111         filepath = filepaths[i]
    112         words = get_plaintext_words(filepath)
    113         for word in words:
    114             if word in current:
    115                 current[word] += 1
    116             else:
    117                 current[word] = 1
    118     log_doc_freqs = {}
    119     for word in current:
    120         # random word or stop word?
    121         if current[word] > int(lower_bound_percentage * len(filepaths)) and current[word] < int(len(filepaths) * upper_bound_percentage) and word not in stop_words:
    122             log_doc_freqs[word] = math.log(len(filepaths) / current[word])
    123     return log_doc_freqs
	information-retrieval Exploration of information retrieval topics
	git clone git://git.laack.co/information-retrieval.git
	Log \| Files \| Refs