blog

Personal blog
git clone git://git.laack.co/blog.git
Log | Files | Refs

urls_parsing.py (782B)


      1 import pandas as pd
      2 import matplotlib.pyplot as plt
      3 import os
      4 from collections import Counter
      5 
      6 # Find all CSV files recursively under "results"
      7 files = [os.path.join(dp, f) for dp, _, filenames in os.walk("results") for f in filenames if os.path.splitext(f)[1] == '.csv']
      8 
      9 urls = set()
     10 
     11 for file in files:
     12     print(f'Processing {file}')
     13     df = pd.read_csv(file, low_memory=False)
     14 
     15     if 'url' not in df.columns:
     16         print("Warning: 'url' column not found in this file")
     17         continue
     18 
     19     df = df[~df['url'].isin(urls)]
     20     df = df.drop_duplicates(subset=['url'], keep='first')
     21 
     22     new_urls = df['url'].dropna()
     23     urls.update(new_urls)
     24 
     25     print(f"Added {len(new_urls)} more urls.")
     26 
     27 print(f"Total unique URL processed: {len(urls)}")
     28 
     29 for url in urls:
     30     print(url)