lang-detect.py (1274B)
"""Fill in the ``language`` column of ``site`` rows that do not have one yet."""

import sqlite3
from contextlib import closing

from bs4 import BeautifulSoup
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm

from utils import get_plaintext

# Location of the SQLite database holding the ``site`` table.
DB_PATH = 'database/manifest.db'

# Number of processed rows between intermediate commits, so that an
# interrupted run keeps most of the work it has already done.
COMMIT_EVERY = 500


def get_html_language(filepath):
    """Return the language declared on the ``<html>`` tag of a file.

    The file is read as UTF-8 and parsed with BeautifulSoup's
    ``html.parser``.

    Args:
        filepath: Path of the HTML file to inspect.

    Returns:
        The lower-cased text before the first ``-`` of the ``lang``
        attribute (``"en-US"`` -> ``"en"``), or ``None`` when the file
        cannot be opened, is not valid UTF-8, has no ``<html>`` tag, or
        the attribute is missing or blank.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is a ValueError, not an OSError: without it a
        # single non-UTF-8 page would abort the caller instead of falling
        # back to text-based detection.
        return None

    html_tag = soup.find('html')
    if html_tag is None:
        return None

    declared = html_tag.get('lang')
    if not declared:
        return None

    # Keep only the part before the region suffix; strip stray whitespace
    # so values such as " en-US " do not end up stored with padding.
    primary = declared.split('-')[0].strip().lower()
    return primary or None


def detect_language(filepath):
    """Return a language code for the HTML file at ``filepath``.

    The ``lang`` attribute of the ``<html>`` tag is preferred; when
    ``get_html_language`` yields ``None`` the result of running
    ``langdetect.detect`` on ``get_plaintext(filepath)`` is returned.

    Args:
        filepath: Path of the HTML file.

    Returns:
        The detected language code.

    Raises:
        LangDetectException: Propagated from ``langdetect.detect`` when
            it cannot classify the extracted text. Anything raised by
            ``get_plaintext`` propagates unchanged as well.
    """
    declared = get_html_language(filepath)
    if declared is not None:
        return declared
    return detect(get_plaintext(filepath))


def main():
    """Detect and store the language of every ``site`` row lacking one.

    Rows whose text cannot be classified keep ``language`` as NULL, so a
    later run selects them again.
    """
    # sqlite3's own context manager only commits or rolls back; closing()
    # is what actually releases the connection.
    with closing(sqlite3.connect(DB_PATH, timeout=60)) as con_sites:
        con_sites.execute('PRAGMA journal_mode=WAL')
        cur_sites = con_sites.cursor()

        cur_sites.execute("""
            SELECT filepath FROM site WHERE language IS NULL
        """)
        missing_language = cur_sites.fetchall()

        try:
            for done, (filepath,) in enumerate(tqdm(missing_language), start=1):
                try:
                    lang = detect_language(filepath)
                except LangDetectException:
                    # Unclassifiable text (e.g. an empty page) must not
                    # abort the whole batch; skip this row.
                    continue

                cur_sites.execute("""
                    UPDATE site SET language = ? WHERE filepath = ?
                """, (lang, filepath))

                if done % COMMIT_EVERY == 0:
                    con_sites.commit()
        finally:
            # Each UPDATE is independent, so whatever was written before
            # an error or interrupt is still worth keeping.
            con_sites.commit()


if __name__ == "__main__":
    main()