blog

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit c4fe5b6d74a01f55abf6ca6240185208fa9b2908
parent 58c7038c2dc8646fecdac690371111e06927fda7
Author: andrew.laack <andrew.laack@imbue.com>
Date:   Mon, 15 Sep 2025 17:34:02 -0500

Merge branch 'master' of ssh://brgr.heron-peacock.ts.net/home/shared/git/public-repos/blog

Diffstat:
A python/youtube/analysis.py | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+), 0 deletions(-)

# File: python/youtube/analysis.py
"""Collect integer-valued durations from the CSV files in ``results/``.

For every entry in ``results/`` the ``duration_seconds`` column is read,
values that are missing, non-numeric, non-finite or not whole numbers are
discarded, and the remaining values are accumulated as ints.

Outputs (written to the current working directory):

* ``durations_histogram.png`` -- histogram of the collected values using
  100 bins over the range 0-1000.
* ``durations.pkl`` -- the pickled list of all collected values.
"""
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `durations` (and therefore the pickle contents) reproducible between runs.
files = sorted(os.listdir('results'))
durations = []

for file in files:
    print(f'Processing {file}')
    # Only one column is needed, so skip parsing everything else.
    df = pd.read_csv(
        f'results/{file}',
        usecols=['duration_seconds'],
        low_memory=False,
    )

    # Non-numeric cells are coerced to NaN and dropped together with cells
    # that were missing to begin with.
    duration = pd.to_numeric(df['duration_seconds'], errors='coerce').dropna()

    # "inf"/"-inf" parse as floats and survive dropna(), but astype(int)
    # raises on non-finite values -- filter them out before casting.
    duration = duration[~duration.isin([float('inf'), float('-inf')])]

    # Keep only whole-number values, then store them as ints.
    duration = duration[duration == duration.astype(int)]
    duration_list = duration.astype(int).to_list()

    durations.extend(duration_list)
    print(f"Processed {len(duration_list)} valid integer rows")

print(f"Total valid durations: {len(durations)}")
plt.hist(durations, range=(0, 1000), bins=100)
plt.savefig("durations_histogram.png")

with open('durations.pkl', 'wb') as f:
    pickle.dump(durations, f)