urls_parsing.py (782B)
"""Collect the unique values of the ``url`` column across CSV files.

Walks the ``results`` directory recursively, reads every ``.csv`` file found
with pandas, and prints progress messages, the number of distinct non-null
``url`` values, and finally the values themselves (one per line).
"""

import os
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd


def find_csv_files(root):
    """Return the paths of all ``.csv`` files found recursively under *root*.

    The extension comparison is case-sensitive (only a literal ``.csv``
    suffix matches). Paths are returned sorted so that files are processed,
    and progress messages printed, in a reproducible order; ``os.walk``
    itself yields entries in whatever order the filesystem reports them.
    """
    return sorted(
        os.path.join(dirpath, name)
        for dirpath, _, filenames in os.walk(root)
        for name in filenames
        if os.path.splitext(name)[1] == '.csv'
    )


def collect_unique_urls(paths):
    """Return the set of distinct non-null ``url`` values found in *paths*.

    Each path is read with ``pandas.read_csv``. A file is skipped, with a
    warning printed, when it is completely empty or has no ``url`` column.
    For every other file the number of values not seen in earlier files is
    printed.
    """
    seen = set()

    for path in paths:
        print(f'Processing {path}')
        try:
            df = pd.read_csv(path, low_memory=False)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; skip it rather than
            # abort the whole run because of one empty file.
            print("Warning: file is empty, skipping")
            continue

        if 'url' not in df.columns:
            print("Warning: 'url' column not found in this file")
            continue

        # Keep only non-null values that no earlier file contributed, once each.
        column = df['url'].dropna()
        new_urls = column[~column.isin(seen)].drop_duplicates()
        seen.update(new_urls)

        print(f"Added {len(new_urls)} more urls.")

    return seen


# Find all CSV files recursively under "results"
files = find_csv_files("results")

urls = collect_unique_urls(files)

print(f"Total unique URL processed: {len(urls)}")

# A set has no stable iteration order, so sort for reproducible output.
# key=str keeps the sort well-defined even if values are not all strings.
for url in sorted(urls, key=str):
    print(url)