commit 65b0744793d84be90e7f045fad0a7df4c9c1a9c5
parent 7d29a4a799bd92b9228925caeabd0daba88b0b82
Author: andrew.laack <andrew.laack@imbue.com>
Date: Tue, 16 Sep 2025 17:20:21 -0700
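Add cross-file URL de-duplication to duration analysis, record the analysis output, and re-enable resolution/bitrate fetching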
Diffstat:
3 files changed, 58 insertions(+), 5 deletions(-)
diff --git a/python/youtube/analysis.py b/python/youtube/analysis.py
@@ -3,12 +3,27 @@ import matplotlib.pyplot as plt
import os
import pickle
-files = os.listdir('results')
+files = [os.path.join(dp, f) for dp, _, filenames in os.walk("results") for f in filenames if os.path.splitext(f)[1] == '.csv']
durations = []
+seen_urls = set()
for file in files:
print(f'Processing {file}')
- df = pd.read_csv(f'results/{file}', low_memory=False)
+ df = pd.read_csv(f'{file}', low_memory=False)
+
+ if 'url' in df.columns:
+ initial_count = len(df)
+
+ df = df[~df['url'].isin(seen_urls)]
+
+ df = df.drop_duplicates(subset=['url'], keep='first')
+
+ new_urls = df['url'].dropna()
+ seen_urls.update(new_urls)
+
+ print(f"Removed {initial_count - len(df)} duplicate URLs")
+ else:
+ print("Warning: 'url' column not found in this file")
duration = df['duration_seconds']
@@ -26,11 +41,10 @@ for file in files:
durations.extend(duration_list)
print(f"Processed {len(duration_list)} valid integer rows")
-
+print(f"Total unique URLs processed: {len(seen_urls)}")
print(f"Mean: {sum(durations) / len(durations)}")
print(f"Total valid durations: {len(durations)}")
plt.hist(durations, range=(0, 1000), bins=100)
plt.savefig("durations_histogram.png")
-
with open('durations.pkl', 'wb') as f:
pickle.dump(durations, f)
diff --git a/python/youtube/analysis.txt b/python/youtube/analysis.txt
@@ -0,0 +1,39 @@
+Processing results/no-resolution/lots1.csv
+Removed 978780 duplicate URLs
+Processed 1509890 valid integer rows
+Processing results/no-resolution/res1.csv
+Removed 54421 duplicate URLs
+Processed 208935 valid integer rows
+Processing results/no-resolution/res_res.csv
+Removed 243218 duplicate URLs
+Processed 19945 valid integer rows
+Processing results/no-resolution/res983074.csv
+Removed 252218 duplicate URLs
+Processed 4134 valid integer rows
+Processing results/no-resolution/res_8.csv
+Removed 2353544 duplicate URLs
+Processed 121553 valid integer rows
+Processing results/no-resolution/results-only-lengths.csv
+Removed 362964 duplicate URLs
+Processed 615173 valid integer rows
+Processing results/no-resolution/lots.csv
+Removed 6847717 duplicate URLs
+Processed 5111429 valid integer rows
+Processing results/no-resolution/results-comp1.csv
+Removed 2466923 duplicate URLs
+Processed 21657 valid integer rows
+Processing results/no-resolution/results.csv
+Removed 262076 duplicate URLs
+Processed 1382 valid integer rows
+Processing results/max-bitrate/lots-with-bitrate.csv
+Removed 205695 duplicate URLs
+Processed 39729 valid integer rows
+Processing results/maybe-resolution/lots.csv
+Removed 12415 duplicate URLs
+Processed 432 valid integer rows
+Processing results/maybe-resolution/res_some_res.csv
+Removed 6795 duplicate URLs
+Processed 2115 valid integer rows
+Total unique URLs processed: 7656374
+Mean: 666.1375179686886
+Total valid durations: 7656374
diff --git a/python/youtube/yt-lots.py b/python/youtube/yt-lots.py
@@ -50,7 +50,7 @@ def parse_videos(obj):
return results
def fetch_resolution_and_bitrate(video_id):
- return "Not including"
+# return "Not including"
url = f"https://www.youtube.com/watch?v={video_id}"
try:
resp = session.get(url, headers=HEADERS, timeout=10)