blog

Personal blog
git clone git://git.laack.co/blog.git
Log | Files | Refs

bitrate.py (2027B)


      1 import pandas as pd
      2 import matplotlib.pyplot as plt
      3 import os
      4 from collections import Counter
      5 
      6 # Find all CSV files recursively under "results"
      7 files = [os.path.join(dp, f) for dp, _, filenames in os.walk("results") for f in filenames if os.path.splitext(f)[1] == '.csv']
      8 
      9 resolutions = []
     10 seen_urls = set()
     11 invalid_values = {"Not getting res", "Not Used In This Run", "N/A"}
     12 
     13 
     14 for file in files:
     15     print(f'Processing {file}')
     16     df = pd.read_csv(file, low_memory=False)
     17 
     18     if 'url' not in df.columns:
     19         print("Warning: 'url' column not found in this file")
     20         continue
     21 
     22     if 'max_bitrate' not in df.columns:
     23         print("Warning: 'max_bitrate' column not found in this file")
     24         continue
     25 
     26     initial_count = len(df)
     27 
     28     # Remove duplicates based on 'url'
     29     df = df[~df['url'].isin(seen_urls)]
     30     df = df.drop_duplicates(subset=['url'], keep='first')
     31 
     32     new_urls = df['url'].dropna()
     33     seen_urls.update(new_urls)
     34 
     35     print(f"Removed {initial_count - len(df)} duplicate URLs")
     36 
     37     # Clean and collect max_bitrate values
     38     resolution = df['max_bitrate'].dropna().astype(str).str.strip()
     39     resolution = resolution[~resolution.isin(invalid_values)]
     40     resolution = resolution[resolution != '']
     41     resolution_list = resolution.tolist()
     42     resolutions.extend(resolution_list)
     43 
     44     print(f"Processed {len(resolution_list)} valid resolution rows")
     45 
     46 # Count occurrences
     47 resolution_counts = Counter(resolutions)
     48 sorted_resolutions = sorted(resolution_counts.items(), key=lambda x: (-x[1], x[0]))
     49 
     50 # Summary
     51 print(f"Total unique URLs processed: {len(seen_urls)}")
     52 print(f"Total valid max_bitrate entries: {len(resolutions)}")
     53 for res, count in sorted_resolutions:
     54     print(f"{res}: {count}")
     55 
     56 # Plot
     57 res_labels, res_values = zip(*sorted_resolutions)
     58 plt.figure(figsize=(12, 6))
     59 plt.bar(res_labels, res_values, color='salmon')
     60 plt.xlabel('Max Bitrate')
     61 plt.ylabel('Count')
     62 plt.title('Max Bitrate Distribution')
     63 plt.xticks(rotation=45)
     64 plt.tight_layout()
     65 plt.savefig("max_bitrate_histogram.png")