utils.py (2578B)
1 from langdetect import detect 2 from bs4 import BeautifulSoup 3 import re 4 from bs4 import BeautifulSoup 5 from nltk.stem import PorterStemmer 6 7 ps = PorterStemmer() 8 9 # Only include important information that is formatted well. 10 # I want lots of quality information with minimal overhead. 11 12 def get_plaintext(filepath): 13 14 with open(filepath, 'r') as f: 15 body = f.read() 16 soup = BeautifulSoup(body, 'html.parser') 17 18 for selector in ['nav', 'footer', 'header', '[role="navigation"]', 19 '.flash-error', '.js-header-wrapper', '.footer']: 20 for tag in soup.select(selector): 21 tag.decompose() 22 23 tags_to_extract = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'blockquote', 'code']) 24 return u" ".join(tag.get_text(separator=" ", strip=True) for tag in tags_to_extract) 25 26 def get_plaintext_words(filepath): 27 plaintext = get_plaintext(filepath) 28 return get_words(plaintext) 29 30 def get_words(text): 31 current = {} 32 lines = text 33 34 # There are special characters with diacritics we probably still want. 35 # only word characters remain (minus underscores because some url stuff I wanted) 36 # Replaces symbols and such 37 38 lines = re.sub(r"[^0-9A-Za-z]+", " ", lines) 39 lines = lines.split(' ') 40 for i in range(0, len(lines)): 41 if lines[i] != '': 42 current_word = ps.stem(lines[i].lower()) 43 if current_word not in current: 44 current[current_word] = [i] 45 else: 46 current[current_word].append(i) 47 return current 48 49 def get_html_language(filepath): 50 try: 51 with open(filepath, 'r', encoding='utf-8') as f: 52 soup = BeautifulSoup(f, 'html.parser') 53 html_tag = soup.find('html') 54 if html_tag and html_tag.get('lang'): 55 return html_tag.get('lang').split('-')[0].lower() 56 except OSError: 57 pass 58 59 def detect_language(filepath): 60 result = get_html_language(filepath) 61 if result is not None: 62 return result 63 try: 64 65 # it might be worth considering more stringent criteria. 66 # Sometimes a site will be detected as english, but this is only because of other 67 # on screen artifacts (menus and such). 68 69 # TODO: Should this even exist? If a site doesn't have the path, do we want it? 70 71 detected = detect(get_plaintext(filepath)) 72 return detected 73 except: 74 75 # Failed to detect 76 # This can happen if the plaintext is empty or a few other cases. 77 # Should be rare for quality sites. 78 79 return "zz"