information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

lang-detect.py (1274B)


      1 from langdetect import detect
      2 from tqdm import tqdm
      3 import sqlite3
      4 from utils import get_plaintext
      5 from bs4 import BeautifulSoup
      6 
      7 def get_html_language(filepath):
      8     try:
      9         with open(filepath, 'r', encoding='utf-8') as f:
     10             soup = BeautifulSoup(f, 'html.parser')
     11             html_tag = soup.find('html')
     12             if html_tag and html_tag.get('lang'):
     13                 return html_tag.get('lang').split('-')[0].lower()
     14     except OSError:
     15         pass
     16 
     17 def detect_language(filepath):
     18     result = get_html_language(filepath)
     19     if result is not None:
     20         return result
     21     return detect(get_plaintext(filepath))
     22 
     23 if __name__ == "__main__":
     24     con_sites = sqlite3.connect('database/manifest.db', timeout=60)
     25     con_sites.execute('PRAGMA journal_mode=WAL')
     26     cur_sites = con_sites.cursor()
     27 
     28     cur_sites.execute("""
     29         SELECT filepath FROM site WHERE language IS NULL
     30     """)
     31 
     32     missing_language = cur_sites.fetchall()
     33 
     34     for i in tqdm(range(len(missing_language))):
     35         filepath_tuple = missing_language[i]
     36         filepath = filepath_tuple[0]
     37         lang = detect_language(filepath)
     38         cur_sites.execute("""
     39             UPDATE site SET language = ? WHERE filepath = ?
     40         """, (lang, filepath))
     41         con_sites.commit()