information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

clean_cache.py (1663B)


      1 from crawling.spider import get_indexing_db_connection
      2 import os
      3 import glob
      4 from crawling.constants import CACHE_DIRECTORY
      5 import time
      6 
      7 def get_queued_cache_files(conn):
      8     cursor = conn.cursor()
      9     query = """
     10         SELECT filepath FROM indexing_queue
     11     """
     12     cursor.execute(query)
     13     results = [res[0] for res in cursor.fetchall()]
     14     return results
     15 
     16 def get_all_cache_files():
     17     document_directory = CACHE_DIRECTORY
     18     filepaths = []
     19     for filepath in glob.iglob(document_directory + '**/**', recursive=True):
     20         if os.path.isfile(filepath):
     21              filepaths.append(filepath)
     22     return filepaths
     23 
     24 def get_old_files(filepaths):
     25 
     26     # This is ample time to ensure the files are not in the process of being inserted into the db.
     27     one_hour_ago = time.time() - 3600
     28     old_files = []
     29     for filepath in filepaths:
     30         if os.stat(filepath).st_ctime < one_hour_ago:
     31             old_files.append(filepath)
     32     return old_files
     33 
     34 
     35 def clean_cache():
     36     conn = get_indexing_db_connection()
     37     all_cache_files = get_all_cache_files()
     38     queued_cache_files = set(get_queued_cache_files(conn))
     39     unqueued_files = []
     40 
     41     for filepath in all_cache_files:
     42         if filepath not in queued_cache_files:
     43             unqueued_files.append(filepath)
     44 
     45     to_delete = get_old_files(unqueued_files)
     46     print(f"Deleting {len(to_delete)} old files.")
     47 
     48     deleted = 0
     49     for filepath in to_delete:
     50         try:
     51             os.remove(filepath)
     52             deleted += 1
     53         except:
     54             print(f"Failed to delete {filepath}, continuing on")
     55     print(f"Deleted {deleted} old files.")
     56 
# Run the cache cleanup only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    clean_cache()