commit c4fe5b6d74a01f55abf6ca6240185208fa9b2908
parent 58c7038c2dc8646fecdac690371111e06927fda7
Author: andrew.laack <andrew.laack@imbue.com>
Date: Mon, 15 Sep 2025 17:34:02 -0500
Merge branch 'master' of ssh://brgr.heron-peacock.ts.net/home/shared/git/public-repos/blog
Diffstat:
1 file changed, 34 insertions(+), 0 deletions(-)
diff --git a/python/youtube/analysis.py b/python/youtube/analysis.py
@@ -0,0 +1,34 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+import pickle
+
+files = os.listdir('results')
+durations = []
+
+for file in files:
+ print(f'Processing {file}')
+ df = pd.read_csv(f'results/{file}', low_memory=False)
+
+ duration = df['duration_seconds']
+
+ duration = duration.dropna()
+
+ duration = pd.to_numeric(duration, errors='coerce')
+
+ duration = duration.dropna()
+
+ duration = duration[duration == duration.astype(int)]
+
+ duration = duration.astype(int)
+
+ duration_list = duration.to_list()
+ durations.extend(duration_list)
+ print(f"Processed {len(duration_list)} valid integer rows")
+
+print(f"Total valid durations: {len(durations)}")
+plt.hist(durations, range=(0, 1000), bins=100)
+plt.savefig("durations_histogram.png")
+
+with open('durations.pkl', 'wb') as f:
+ pickle.dump(durations, f)