information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

prune.py (1097B)


      1 # prune documents that aren't useful / we don't want.
      2 import os
      3 from indexing.utils import get_filepaths
      4 from indexing.utils import get_plaintext
      5 from concurrent.futures import ThreadPoolExecutor, as_completed
      6 import tqdm
      7 
      8 # this can happen from rate limiting, requiring js, and some other things too.
      9 def non_substantive(plaintext):
     10     if len(plaintext) < 1_000:
     11         return True
     12     return False
     13 
     14 def drop(plaintext):
     15     if non_substantive(plaintext):
     16         return True
     17     return False
     18 def process_file(filepath):
     19     plaintext = get_plaintext(filepath)
     20     if drop(plaintext):
     21         os.remove(filepath)
     22         return True
     23     return False
     24 
     25 if __name__ == "__main__":
     26     filepaths = get_filepaths("sites")
     27     print(f'{len(filepaths)} filepaths found.')
     28     deleted = 0
     29 
     30     with ThreadPoolExecutor(max_workers=50) as executor:
     31         futures = {executor.submit(process_file, fp): fp for fp in filepaths}
     32         for future in tqdm.tqdm(as_completed(futures), total=len(filepaths)):
     33             if future.result():
     34                 deleted += 1
     35     print(f'Deleted {deleted} files')