information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

utils.py (2578B)


      1 from langdetect import detect
      2 from bs4 import BeautifulSoup
      3 import re
      4 from bs4 import BeautifulSoup
      5 from nltk.stem import PorterStemmer
      6 
      7 ps = PorterStemmer()
      8 
      9 # Only include important information that is formatted well.
     10 # I want lots of quality information with minimal overhead.
     11 
     12 def get_plaintext(filepath):
     13 
     14     with open(filepath, 'r') as f:
     15         body = f.read()
     16     soup = BeautifulSoup(body, 'html.parser')
     17 
     18     for selector in ['nav', 'footer', 'header', '[role="navigation"]', 
     19                  '.flash-error', '.js-header-wrapper', '.footer']:
     20         for tag in soup.select(selector):
     21             tag.decompose()
     22 
     23     tags_to_extract = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'blockquote', 'code'])
     24     return u" ".join(tag.get_text(separator=" ", strip=True) for tag in tags_to_extract)
     25 
     26 def get_plaintext_words(filepath):
     27     plaintext = get_plaintext(filepath)
     28     return get_words(plaintext)
     29 
     30 def get_words(text):
     31     current = {}
     32     lines = text
     33     
     34     # There are special characters with diacritics we probably still want.
     35     # only word characters remain (minus underscores because some url stuff I wanted)
     36     # Replaces symbols and such
     37 
     38     lines = re.sub(r"[^0-9A-Za-z]+", " ", lines)
     39     lines = lines.split(' ')
     40     for i in range(0, len(lines)):
     41         if lines[i] != '':
     42             current_word = ps.stem(lines[i].lower())
     43             if current_word not in current:
     44                 current[current_word] = [i]
     45             else:
     46                 current[current_word].append(i)
     47     return current
     48 
     49 def get_html_language(filepath):
     50     try:
     51         with open(filepath, 'r', encoding='utf-8') as f:
     52             soup = BeautifulSoup(f, 'html.parser')
     53             html_tag = soup.find('html')
     54             if html_tag and html_tag.get('lang'):
     55                 return html_tag.get('lang').split('-')[0].lower()
     56     except OSError:
     57         pass
     58 
     59 def detect_language(filepath):
     60     result = get_html_language(filepath)
     61     if result is not None:
     62         return result
     63     try:
     64 
     65         # it might be worth considering more stringent criteria.
     66         # Sometimes a site will be detected as english, but this is only because of other
     67         # on screen artifacts (menus and such).
     68 
     69         # TODO: Should this even exist? If a site doesn't have the path, do we want it?
     70 
     71         detected = detect(get_plaintext(filepath))
     72         return detected
     73     except:
     74 
     75         # Failed to detect
     76         # This can happen if the plaintext is empty or a few other cases.
     77         # Should be rare for quality sites.
     78 
     79         return "zz"