prune.py (1097B)
# prune documents that aren't useful / we don't want.
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import tqdm

from indexing.utils import get_filepaths
from indexing.utils import get_plaintext

# Documents whose extracted text is shorter than this are treated as
# non-substantive and deleted.
MIN_SUBSTANTIVE_CHARS = 1_000

# Number of worker threads used to scan (and possibly delete) files.
MAX_WORKERS = 50


# this can happen from rate limiting, requiring js, and some other things too.
def non_substantive(plaintext, min_chars=MIN_SUBSTANTIVE_CHARS) -> bool:
    """Return True when *plaintext* is too short to be a useful document.

    Args:
        plaintext: Extracted text of a document; only its ``len()`` is used.
        min_chars: Minimum length a document needs in order to be kept.

    Returns:
        True if ``len(plaintext)`` is strictly below *min_chars*.
    """
    return len(plaintext) < min_chars


def drop(plaintext) -> bool:
    """Return True when a document with this text should be deleted.

    Currently the only criterion is ``non_substantive``; this function is
    the single place where further pruning rules would be combined.
    """
    return non_substantive(plaintext)


def process_file(filepath) -> bool:
    """Delete *filepath* from disk if its text fails the ``drop`` check.

    Args:
        filepath: Path handed to ``get_plaintext`` and, on a drop decision,
            to ``os.remove``.

    Returns:
        True if the file was removed, False if it was kept.

    Raises:
        Whatever ``get_plaintext`` or ``os.remove`` raise (e.g. ``OSError``);
        errors are deliberately not swallowed here so the caller can report
        them per file.
    """
    plaintext = get_plaintext(filepath)
    if not drop(plaintext):
        return False
    os.remove(filepath)
    return True


def main() -> None:
    """Scan every file under ``sites`` concurrently and prune the useless ones.

    Prints the number of files found, shows a progress bar while the files
    are processed, and finishes with a summary of deletions (plus failures,
    if any occurred).
    """
    filepaths = get_filepaths("sites")
    print(f'{len(filepaths)} filepaths found.')
    deleted = 0
    failed = 0

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Map each future back to its path so failures can name the file.
        futures = {executor.submit(process_file, fp): fp for fp in filepaths}
        for future in tqdm.tqdm(as_completed(futures), total=len(filepaths)):
            try:
                was_deleted = future.result()
            except Exception as exc:
                # result() re-raises the worker's exception. Letting it
                # escape would abort this loop while the pool keeps deleting
                # files on exit, so the summary below would never be printed.
                failed += 1
                # tqdm.write keeps the message from garbling the progress bar.
                tqdm.tqdm.write(
                    f'Failed to process {futures[future]}: {exc!r}'
                )
            else:
                if was_deleted:
                    deleted += 1

    print(f'Deleted {deleted} files')
    if failed:
        print(f'Failed to process {failed} files')


if __name__ == "__main__":
    main()