clean_cache.py (1663B)
"""Delete cache files that are not in the indexing queue and are over an hour old."""
import glob
import os
import time

from crawling.constants import CACHE_DIRECTORY
from crawling.spider import get_indexing_db_connection


def get_queued_cache_files(conn):
    """Return the ``filepath`` column of every row in ``indexing_queue``.

    Args:
        conn: Open database connection; only ``conn.cursor()`` is used here.

    Returns:
        list: The first column of each fetched row, in the order the
        database returned them.
    """
    cursor = conn.cursor()
    try:
        query = """
            SELECT filepath FROM indexing_queue
        """
        cursor.execute(query)
        return [row[0] for row in cursor.fetchall()]
    finally:
        # Release the cursor even if the query fails.
        cursor.close()


def get_all_cache_files():
    """Return the path of every regular file found under ``CACHE_DIRECTORY``.

    Returns:
        list: File paths as produced by ``glob.iglob``; directories are
        filtered out with ``os.path.isfile``.
    """
    # Build the pattern with os.path.join so it is correct whether or not
    # CACHE_DIRECTORY ends with a path separator. A single recursive "**"
    # already visits every level; the doubled "**/**" form reported nested
    # files once per ancestor directory.
    pattern = os.path.join(CACHE_DIRECTORY, "**")
    return [
        filepath
        for filepath in glob.iglob(pattern, recursive=True)
        if os.path.isfile(filepath)
    ]


def get_old_files(filepaths, max_age_seconds=3600):
    """Return the paths whose ``st_ctime`` is older than the age threshold.

    Args:
        filepaths: Iterable of file paths to inspect.
        max_age_seconds: Minimum age, in seconds, for a file to count as
            old. The default of one hour is ample time to ensure the files
            are not in the process of being inserted into the db.

    Returns:
        list: The subset of ``filepaths`` with ``st_ctime`` earlier than
        ``time.time() - max_age_seconds``, in input order. Paths that no
        longer exist are skipped.
    """
    cutoff = time.time() - max_age_seconds
    old_files = []
    for filepath in filepaths:
        try:
            changed_at = os.stat(filepath).st_ctime
        except FileNotFoundError:
            # The file vanished after it was listed; nothing left to clean.
            continue
        if changed_at < cutoff:
            old_files.append(filepath)
    return old_files


def clean_cache():
    """Delete old cache files that are absent from the indexing queue.

    Lists the cache files, drops those whose path appears in the
    ``indexing_queue`` table, keeps the ones ``get_old_files`` reports as
    old, and removes them. Individual deletion failures are reported and
    do not stop the run. Progress is printed to stdout.
    """
    conn = get_indexing_db_connection()
    try:
        # List the files before reading the queue, as the original did.
        all_cache_files = get_all_cache_files()
        queued_cache_files = set(get_queued_cache_files(conn))
    finally:
        # NOTE(review): assumes this function owns the connection returned
        # by get_indexing_db_connection() -- confirm it is not shared.
        conn.close()

    unqueued_files = [
        filepath
        for filepath in all_cache_files
        if filepath not in queued_cache_files
    ]

    to_delete = get_old_files(unqueued_files)
    print(f"Deleting {len(to_delete)} old files.")

    deleted = 0
    for filepath in to_delete:
        try:
            os.remove(filepath)
        except OSError:
            # Best effort: report the failure and keep going. Catching only
            # OSError lets KeyboardInterrupt/SystemExit propagate.
            print(f"Failed to delete {filepath}, continuing on")
        else:
            deleted += 1
    print(f"Deleted {deleted} old files.")


if __name__ == "__main__":
    clean_cache()