duration.py (1416B)
"""Summarise ``duration_seconds`` across every CSV file under ``results/``.

Rows whose ``url`` was already seen (earlier in the same file or in a
previously processed file) are dropped, the remaining whole-number
durations are pooled, and the script prints summary statistics and saves a
histogram of the pooled values.
"""
import os

import pandas as pd

RESULTS_DIR = "results"
HISTOGRAM_PATH = "durations_histogram.png"


def find_csv_files(root: str) -> list:
    """Return the path of every ``.csv`` file below *root*.

    The list is sorted because ``os.walk`` yields entries in arbitrary
    order, and the processing order decides which file keeps a URL that
    appears in several files.  A missing *root* yields an empty list.
    """
    return sorted(
        os.path.join(dirpath, name)
        for dirpath, _, filenames in os.walk(root)
        for name in filenames
        if os.path.splitext(name)[1] == ".csv"
    )


def drop_seen_urls(df: pd.DataFrame, seen_urls: set) -> pd.DataFrame:
    """Return *df* without rows whose ``url`` is a repeat.

    A row is a repeat when its URL is already in *seen_urls* or occurs
    earlier in *df*.  The non-missing URLs of the surviving rows are added
    to *seen_urls* in place.  Frames without a ``url`` column are returned
    unchanged after printing a warning.
    """
    if "url" not in df.columns:
        print("Warning: 'url' column not found in this file")
        return df

    initial_count = len(df)
    df = df[~df["url"].isin(seen_urls)]
    # NOTE(review): drop_duplicates treats missing URLs as equal to each
    # other, so at most one row without a URL survives per file.  Kept as in
    # the original script -- confirm this is intended.
    df = df.drop_duplicates(subset=["url"], keep="first")
    seen_urls.update(df["url"].dropna())

    print(f"Removed {initial_count - len(df)} duplicate URLs")
    return df


def extract_valid_durations(df: pd.DataFrame) -> list:
    """Return the whole-number ``duration_seconds`` values of *df* as ints.

    Missing values, non-numeric text, infinities and fractional numbers are
    discarded.  A frame without the column yields an empty list (with a
    warning) instead of raising ``KeyError``.
    """
    if "duration_seconds" not in df.columns:
        print("Warning: 'duration_seconds' column not found in this file")
        return []

    # errors="coerce" turns unparsable entries into NaN, so a single dropna
    # removes both the originally missing and the non-numeric values.
    numeric = pd.to_numeric(df["duration_seconds"], errors="coerce").dropna()
    # Infinities are not NaN and would make astype(int) raise.
    numeric = numeric[~numeric.isin([float("inf"), float("-inf")])]
    # Keep only values that survive an integer round-trip (no fractional part).
    whole = numeric[numeric == numeric.astype(int)]
    return whole.astype(int).to_list()


def collect_durations(files) -> tuple:
    """Read each CSV in *files* and pool its valid durations.

    Returns ``(durations, seen_urls)``: the list of all valid integer
    durations and the set of distinct URLs they were taken from.  Files
    that are completely empty are skipped with a warning.
    """
    durations = []
    seen_urls = set()

    for path in files:
        print(f"Processing {path}")
        try:
            df = pd.read_csv(path, low_memory=False)
        except pd.errors.EmptyDataError:
            # A zero-byte CSV has no header to parse; nothing to contribute.
            print("Warning: file is empty, skipping")
            continue

        df = drop_seen_urls(df, seen_urls)
        valid = extract_valid_durations(df)
        durations.extend(valid)
        print(f"Processed {len(valid)} valid integer rows")

    return durations, seen_urls


def save_histogram(durations, path: str) -> None:
    """Save a 100-bin histogram of *durations* over the range 0-1000 to *path*."""
    # Imported lazily so the data-processing helpers above can be used on
    # machines that have pandas but no plotting stack installed.
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.hist(durations, range=(0, 1000), bins=100)
    fig.savefig(path)
    plt.close(fig)  # release the figure instead of leaving it open


def main(results_dir: str = RESULTS_DIR, histogram_path: str = HISTOGRAM_PATH) -> None:
    """Process every CSV under *results_dir*, print statistics, save the plot."""
    durations, seen_urls = collect_durations(find_csv_files(results_dir))

    print(f"Total unique URLs processed: {len(seen_urls)}")
    if not durations:
        # Avoid dividing by zero when there was nothing to aggregate.
        print("Total valid durations: 0")
        print("No valid durations found; skipping mean and histogram")
        return

    print(f"Mean: {sum(durations) / len(durations)}")
    print(f"Total valid durations: {len(durations)}")
    save_histogram(durations, histogram_path)


if __name__ == "__main__":
    main()