information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

clean.py (893B)


      1 from crawling.spider import get_crawling_db_connection
      2 import shutil
      3 from crawling.spider import get_indexing_db_connection
      4 
      5 
      6 if __name__ == "__main__":
      7     try:
      8         shutil.rmtree("/var/lib/search/crawl_cache")
      9     except FileNotFoundError as e:
     10         print("Crawl cache directory doesn't exist, continuing with cleanup")
     11     crawling_conn = get_crawling_db_connection()
     12     
     13     crawling_cur = crawling_conn.cursor()
     14     crawling_cur.execute("""
     15         DROP TABLE queued_site;
     16     """)
     17     crawling_cur.close()
     18     crawling_conn.commit()
     19     crawling_conn.close()
     20 
     21     print("Crawling datbase cleaned")
     22 
     23     indexing_conn = get_indexing_db_connection()
     24     indexing_cur = indexing_conn.cursor()
     25     indexing_cur.execute("""
     26         DROP TABLE indexing_queue;
     27     """)
     28     indexing_cur.close()
     29     indexing_conn.commit()
     30     indexing_conn.close()
     31 
     32     print("Indexing datbase cleaned")