Added analysis - blog - Unnamed repository; edit this file 'description' to name the repository.

commit 65b0744793d84be90e7f045fad0a7df4c9c1a9c5
parent 7d29a4a799bd92b9228925caeabd0daba88b0b82
Author: andrew.laack <andrew.laack@imbue.com>
Date:   Tue, 16 Sep 2025 17:20:21 -0700

Added analysis

Diffstat:
M python/youtube/analysis.py  | 22 ++++++++++++++++++----
A python/youtube/analysis.txt  | 39 +++++++++++++++++++++++++++++++++++++++
M python/youtube/yt-lots.py  | 2 +-

3 files changed, 58 insertions(+), 5 deletions(-)
diff --git a/python/youtube/analysis.py b/python/youtube/analysis.py
@@ -3,12 +3,27 @@ import matplotlib.pyplot as plt
 import os
 import pickle
 
-files = os.listdir('results')
+files = [os.path.join(dp, f) for dp, _, filenames in os.walk("results") for f in filenames if os.path.splitext(f)[1] == '.csv']
 durations = []
+seen_urls = set()
 
 for file in files:
     print(f'Processing {file}')
-    df = pd.read_csv(f'results/{file}', low_memory=False)
+    df = pd.read_csv(f'{file}', low_memory=False)
+    
+    if 'url' in df.columns:
+        initial_count = len(df)
+        
+        df = df[~df['url'].isin(seen_urls)]
+        
+        df = df.drop_duplicates(subset=['url'], keep='first')
+        
+        new_urls = df['url'].dropna()
+        seen_urls.update(new_urls)
+        
+        print(f"Removed {initial_count - len(df)} duplicate URLs")
+    else:
+        print("Warning: 'url' column not found in this file")
     
     duration = df['duration_seconds']
     
@@ -26,11 +41,10 @@ for file in files:
     durations.extend(duration_list)
     print(f"Processed {len(duration_list)} valid integer rows")
 
-
+print(f"Total unique URLs processed: {len(seen_urls)}")
 print(f"Mean: {sum(durations) / len(durations)}")
 print(f"Total valid durations: {len(durations)}")
 plt.hist(durations, range=(0, 1000), bins=100)
 plt.savefig("durations_histogram.png")
-
 with open('durations.pkl', 'wb') as f:
     pickle.dump(durations, f)
diff --git a/python/youtube/analysis.txt b/python/youtube/analysis.txt
@@ -0,0 +1,39 @@
+Processing results/no-resolution/lots1.csv
+Removed 978780 duplicate URLs
+Processed 1509890 valid integer rows
+Processing results/no-resolution/res1.csv
+Removed 54421 duplicate URLs
+Processed 208935 valid integer rows
+Processing results/no-resolution/res_res.csv
+Removed 243218 duplicate URLs
+Processed 19945 valid integer rows
+Processing results/no-resolution/res983074.csv
+Removed 252218 duplicate URLs
+Processed 4134 valid integer rows
+Processing results/no-resolution/res_8.csv
+Removed 2353544 duplicate URLs
+Processed 121553 valid integer rows
+Processing results/no-resolution/results-only-lengths.csv
+Removed 362964 duplicate URLs
+Processed 615173 valid integer rows
+Processing results/no-resolution/lots.csv
+Removed 6847717 duplicate URLs
+Processed 5111429 valid integer rows
+Processing results/no-resolution/results-comp1.csv
+Removed 2466923 duplicate URLs
+Processed 21657 valid integer rows
+Processing results/no-resolution/results.csv
+Removed 262076 duplicate URLs
+Processed 1382 valid integer rows
+Processing results/max-bitrate/lots-with-bitrate.csv
+Removed 205695 duplicate URLs
+Processed 39729 valid integer rows
+Processing results/maybe-resolution/lots.csv
+Removed 12415 duplicate URLs
+Processed 432 valid integer rows
+Processing results/maybe-resolution/res_some_res.csv
+Removed 6795 duplicate URLs
+Processed 2115 valid integer rows
+Total unique URLs processed: 7656374
+Mean: 666.1375179686886
+Total valid durations: 7656374
diff --git a/python/youtube/yt-lots.py b/python/youtube/yt-lots.py
@@ -50,7 +50,7 @@ def parse_videos(obj):
     return results
 
 def fetch_resolution_and_bitrate(video_id):
-    return "Not including"
+#    return "Not including"
     url = f"https://www.youtube.com/watch?v={video_id}"
     try:
         resp = session.get(url, headers=HEADERS, timeout=10)

	blog Unnamed repository; edit this file 'description' to name the repository.
	Log | Files | Refs

M python/youtube/analysis.py  | 22 ++++++++++++++++++----
A python/youtube/analysis.txt  | 39 +++++++++++++++++++++++++++++++++++++++
M python/youtube/yt-lots.py  | 2 +-