blog

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit ad5e0e80020882c546421478a39761c0e115a515
parent d4ba2fb57c92450400e753c64235fb5e0ec3896c
Author: Andrew Laack <andrew@laack.co>
Date:   Fri, 19 Sep 2025 01:43:27 -0500

Created more comprehensive results

Diffstat:
Mpython/youtube/analysis/size_analysis.py | 95+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
Apython/youtube/results/size_analysis_results.txt | 8++++++++
2 files changed, 69 insertions(+), 34 deletions(-)

diff --git a/python/youtube/analysis/size_analysis.py b/python/youtube/analysis/size_analysis.py @@ -3,27 +3,6 @@ import matplotlib.pyplot as plt from pathlib import Path import pandas as pd -URLS_DIR = Path("../urls") - - -if URLS_DIR.exists(): - files = os.listdir(URLS_DIR) -else: - raise Exception(str(URLS_DIR) + "is empty...") - -filtered = [] - -for file in files: - if file.endswith(".csv") and file.__contains__("with_size"): - filtered.append(URLS_DIR / file) - -if not filtered: - raise Exception("No applicable files found") - -for file in filtered: - df = pd.read_csv(file) - print(df.describe()) - def convert_to_bytes(str_size : str): split = str_size.split("M") @@ -43,9 +22,61 @@ def convert_to_bytes(str_size : str): return size -sizes = df['size'].apply(convert_to_bytes) -urls = df['url'] -sizes = sizes.to_list() +URLS_DIR = Path("../urls") + + +if URLS_DIR.exists(): + files = os.listdir(URLS_DIR) +else: + raise Exception(str(URLS_DIR) + "is empty...") + +filtered = [] + +for file in files: + if file.endswith(".csv") and file.__contains__("with_size"): + filtered.append(URLS_DIR / file) + +if not filtered: + raise Exception("No applicable files found") + +all = [] + +for file in filtered: + ls = pd.read_csv(file).values.tolist() + all.extend(ls) + +last = "" +unique = {} +dupes = 0 +count = 0 + +for ele in all: + if ele[0] == last: + unique[last].append(convert_to_bytes(ele[1])) + count += 1 + continue + + # dupe + if ele[0] in unique: + last = "" + dupes += 1 + continue + + last = ele[0] + unique[last] = [convert_to_bytes(ele[1])] + count += 1 + +print("URLS Parsed: " + str(len(unique))) +print("Duplicate Video Options Removed: " + str(dupes)) +print("Non-Duplicate Videos: " + str(count)) + +all = [] +for key in unique: + ls = unique[key] + all.extend(ls) + +sizes = all +urls = unique.keys() def bytes_to_larger(count_bytes : float): cuts = 0 @@ -72,17 +103,13 @@ def bytes_to_larger(count_bytes : float): largest_size_by_url = {} -for index in 
range(len(sizes)): - current_largest = largest_size_by_url.get(urls[index], -1) - current = sizes[index] - if current > current_largest: - largest_size_by_url[urls[index]] = current +for key in unique: + largest = max(unique[key]) + largest_size_by_url[key] = largest sum_size_by_url = {} -for index in range(len(sizes)): - current = sum_size_by_url.get(urls[index], 0) - current += sizes[index] - sum_size_by_url[urls[index]] = current +for key in unique: + sum_size_by_url[key] = sum(unique[key]) print(len(largest_size_by_url)) @@ -122,4 +149,4 @@ for size in sizes: itr += 1 plt.plot(itrs, running_means) -#plt.show() +plt.show() diff --git a/python/youtube/results/size_analysis_results.txt b/python/youtube/results/size_analysis_results.txt @@ -0,0 +1,8 @@ +URLS Parsed: 615222 +Duplicate Video Options Removed: 1997786 +Non-Duplicate Videos: 8259177 +615222 +Average video size across all resolutions for each video: 107.61747504914014 MB +Average largest video size across videos: 396.17368260710157 MB +Average sum size across all videos and formats: 1.4447334047285894 GB +Sum of all videos queried: 888.8317747239322 TB