bitrate.py (2027B)
"""Summarise and plot the ``max_bitrate`` values found in result CSV files.

Every ``.csv`` file under ``results/`` is read, rows are de-duplicated by
``url`` across all files (first occurrence wins), placeholder values are
discarded, and the remaining ``max_bitrate`` values are counted, printed and
saved as a bar chart in ``max_bitrate_histogram.png``.
"""
import os
from collections import Counter

import pandas as pd

RESULTS_DIR = "results"
OUTPUT_PNG = "max_bitrate_histogram.png"

# Placeholder strings that appear in the max_bitrate column but carry no data.
INVALID_VALUES = frozenset({"Not getting res", "Not Used In This Run", "N/A"})


def find_csv_files(root):
    """Return the paths of all ``.csv`` files found recursively under *root*.

    The result is sorted: ``os.walk`` yields entries in a filesystem-dependent
    order, and because de-duplication keeps the first occurrence of a URL, an
    unsorted list would make the surviving row vary between runs.
    A missing *root* simply yields an empty list.
    """
    return sorted(
        os.path.join(dirpath, name)
        for dirpath, _, filenames in os.walk(root)
        for name in filenames
        if os.path.splitext(name)[1] == ".csv"
    )


def collect_bitrates(df, seen_urls, invalid_values=INVALID_VALUES):
    """Extract cleaned ``max_bitrate`` strings for URLs not seen before.

    Args:
        df: Frame with at least the ``url`` and ``max_bitrate`` columns.
        seen_urls: Set of URLs already processed; updated in place with the
            new (non-missing) URLs from *df*.
        invalid_values: Placeholder strings to discard after stripping.

    Returns:
        A ``(values, removed)`` tuple: the list of cleaned bitrate strings and
        the number of rows dropped as duplicate URLs.
    """
    fresh = df[~df["url"].isin(seen_urls)]
    fresh = fresh.drop_duplicates(subset=["url"], keep="first")
    # NOTE(review): rows with a missing url are never added to seen_urls, and
    # drop_duplicates appears to treat missing values as equal, so at most one
    # such row per file is kept -- confirm this is intended.
    seen_urls.update(fresh["url"].dropna())

    values = fresh["max_bitrate"].dropna().astype(str).str.strip()
    values = values[~values.isin(invalid_values)]
    values = values[values != ""]
    return values.tolist(), len(df) - len(fresh)


def count_bitrates(values):
    """Return ``(value, count)`` pairs, most frequent first, ties by label."""
    return sorted(Counter(values).items(), key=lambda item: (-item[1], item[0]))


def plot_distribution(sorted_counts, output_path=OUTPUT_PNG):
    """Save a bar chart of *sorted_counts* to *output_path*.

    Returns ``True`` when a chart was written and ``False`` when there was
    nothing to plot (``zip(*[])`` cannot be unpacked into labels and counts,
    so the empty case must be handled before plotting).
    """
    if not sorted_counts:
        print("No valid max_bitrate values found; skipping plot")
        return False

    # Imported lazily so collecting and counting work without a plotting stack.
    import matplotlib.pyplot as plt

    labels, counts = zip(*sorted_counts)
    plt.figure(figsize=(12, 6))
    plt.bar(labels, counts, color="salmon")
    plt.xlabel("Max Bitrate")
    plt.ylabel("Count")
    plt.title("Max Bitrate Distribution")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()  # release the figure once it has been written
    return True


def main():
    """Process every CSV under ``results/``, print a summary and plot it."""
    bitrates = []
    seen_urls = set()

    for file in find_csv_files(RESULTS_DIR):
        print(f"Processing {file}")
        try:
            df = pd.read_csv(file, low_memory=False)
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            # One unreadable file should not abort the whole run.
            print(f"Warning: could not parse this file ({err})")
            continue

        if "url" not in df.columns:
            print("Warning: 'url' column not found in this file")
            continue

        if "max_bitrate" not in df.columns:
            print("Warning: 'max_bitrate' column not found in this file")
            continue

        values, removed = collect_bitrates(df, seen_urls)
        print(f"Removed {removed} duplicate URLs")

        bitrates.extend(values)
        print(f"Processed {len(values)} valid resolution rows")

    sorted_counts = count_bitrates(bitrates)

    # Summary
    print(f"Total unique URLs processed: {len(seen_urls)}")
    print(f"Total valid max_bitrate entries: {len(bitrates)}")
    for value, count in sorted_counts:
        print(f"{value}: {count}")

    plot_distribution(sorted_counts)


if __name__ == "__main__":
    main()