blog

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 65b0744793d84be90e7f045fad0a7df4c9c1a9c5
parent 7d29a4a799bd92b9228925caeabd0daba88b0b82
Author: andrew.laack <andrew.laack@imbue.com>
Date:   Tue, 16 Sep 2025 17:20:21 -0700

Added analysis

Diffstat:
M python/youtube/analysis.py  | 22 ++++++++++++++++++----
A python/youtube/analysis.txt | 39 +++++++++++++++++++++++++++++++++++++++
M python/youtube/yt-lots.py   |  2 +-
3 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/python/youtube/analysis.py b/python/youtube/analysis.py @@ -3,12 +3,27 @@ import matplotlib.pyplot as plt import os import pickle -files = os.listdir('results') +files = [os.path.join(dp, f) for dp, _, filenames in os.walk("results") for f in filenames if os.path.splitext(f)[1] == '.csv'] durations = [] +seen_urls = set() for file in files: print(f'Processing {file}') - df = pd.read_csv(f'results/{file}', low_memory=False) + df = pd.read_csv(f'{file}', low_memory=False) + + if 'url' in df.columns: + initial_count = len(df) + + df = df[~df['url'].isin(seen_urls)] + + df = df.drop_duplicates(subset=['url'], keep='first') + + new_urls = df['url'].dropna() + seen_urls.update(new_urls) + + print(f"Removed {initial_count - len(df)} duplicate URLs") + else: + print("Warning: 'url' column not found in this file") duration = df['duration_seconds'] @@ -26,11 +41,10 @@ for file in files: durations.extend(duration_list) print(f"Processed {len(duration_list)} valid integer rows") - +print(f"Total unique URLs processed: {len(seen_urls)}") print(f"Mean: {sum(durations) / len(durations)}") print(f"Total valid durations: {len(durations)}") plt.hist(durations, range=(0, 1000), bins=100) plt.savefig("durations_histogram.png") - with open('durations.pkl', 'wb') as f: pickle.dump(durations, f) diff --git a/python/youtube/analysis.txt b/python/youtube/analysis.txt @@ -0,0 +1,39 @@ +Processing results/no-resolution/lots1.csv +Removed 978780 duplicate URLs +Processed 1509890 valid integer rows +Processing results/no-resolution/res1.csv +Removed 54421 duplicate URLs +Processed 208935 valid integer rows +Processing results/no-resolution/res_res.csv +Removed 243218 duplicate URLs +Processed 19945 valid integer rows +Processing results/no-resolution/res983074.csv +Removed 252218 duplicate URLs +Processed 4134 valid integer rows +Processing results/no-resolution/res_8.csv +Removed 2353544 duplicate URLs +Processed 121553 valid integer rows +Processing 
results/no-resolution/results-only-lengths.csv +Removed 362964 duplicate URLs +Processed 615173 valid integer rows +Processing results/no-resolution/lots.csv +Removed 6847717 duplicate URLs +Processed 5111429 valid integer rows +Processing results/no-resolution/results-comp1.csv +Removed 2466923 duplicate URLs +Processed 21657 valid integer rows +Processing results/no-resolution/results.csv +Removed 262076 duplicate URLs +Processed 1382 valid integer rows +Processing results/max-bitrate/lots-with-bitrate.csv +Removed 205695 duplicate URLs +Processed 39729 valid integer rows +Processing results/maybe-resolution/lots.csv +Removed 12415 duplicate URLs +Processed 432 valid integer rows +Processing results/maybe-resolution/res_some_res.csv +Removed 6795 duplicate URLs +Processed 2115 valid integer rows +Total unique URLs processed: 7656374 +Mean: 666.1375179686886 +Total valid durations: 7656374 diff --git a/python/youtube/yt-lots.py b/python/youtube/yt-lots.py @@ -50,7 +50,7 @@ def parse_videos(obj): return results def fetch_resolution_and_bitrate(video_id): - return "Not including" +# return "Not including" url = f"https://www.youtube.com/watch?v={video_id}" try: resp = session.get(url, headers=HEADERS, timeout=10)