commit ad5e0e80020882c546421478a39761c0e115a515
parent d4ba2fb57c92450400e753c64235fb5e0ec3896c
Author: Andrew Laack <andrew@laack.co>
Date: Fri, 19 Sep 2025 01:43:27 -0500
Created more comprehensive results
Diffstat:
2 files changed, 69 insertions(+), 34 deletions(-)
diff --git a/python/youtube/analysis/size_analysis.py b/python/youtube/analysis/size_analysis.py
@@ -3,27 +3,6 @@ import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
-URLS_DIR = Path("../urls")
-
-
-if URLS_DIR.exists():
- files = os.listdir(URLS_DIR)
-else:
- raise Exception(str(URLS_DIR) + "is empty...")
-
-filtered = []
-
-for file in files:
- if file.endswith(".csv") and file.__contains__("with_size"):
- filtered.append(URLS_DIR / file)
-
-if not filtered:
- raise Exception("No applicable files found")
-
-for file in filtered:
- df = pd.read_csv(file)
- print(df.describe())
-
def convert_to_bytes(str_size : str):
split = str_size.split("M")
@@ -43,9 +22,61 @@ def convert_to_bytes(str_size : str):
return size
-sizes = df['size'].apply(convert_to_bytes)
-urls = df['url']
-sizes = sizes.to_list()
+URLS_DIR = Path("../urls")
+
+
+if URLS_DIR.exists():
+ files = os.listdir(URLS_DIR)
+else:
+ raise Exception(str(URLS_DIR) + "is empty...")
+
+filtered = []
+
+for file in files:
+ if file.endswith(".csv") and file.__contains__("with_size"):
+ filtered.append(URLS_DIR / file)
+
+if not filtered:
+ raise Exception("No applicable files found")
+
+all = []
+
+for file in filtered:
+ ls = pd.read_csv(file).values.tolist()
+ all.extend(ls)
+
+last = ""
+unique = {}
+dupes = 0
+count = 0
+
+for ele in all:
+ if ele[0] == last:
+ unique[last].append(convert_to_bytes(ele[1]))
+ count += 1
+ continue
+
+ # dupe
+ if ele[0] in unique:
+ last = ""
+ dupes += 1
+ continue
+
+ last = ele[0]
+ unique[last] = [convert_to_bytes(ele[1])]
+ count += 1
+
+print("URLS Parsed: " + str(len(unique)))
+print("Duplicate Video Options Removed: " + str(dupes))
+print("Non-Duplicate Videos: " + str(count))
+
+all = []
+for key in unique:
+ ls = unique[key]
+ all.extend(ls)
+
+sizes = all
+urls = unique.keys()
def bytes_to_larger(count_bytes : float):
cuts = 0
@@ -72,17 +103,13 @@ def bytes_to_larger(count_bytes : float):
largest_size_by_url = {}
-for index in range(len(sizes)):
- current_largest = largest_size_by_url.get(urls[index], -1)
- current = sizes[index]
- if current > current_largest:
- largest_size_by_url[urls[index]] = current
+for key in unique:
+ largest = max(unique[key])
+ largest_size_by_url[key] = largest
sum_size_by_url = {}
-for index in range(len(sizes)):
- current = sum_size_by_url.get(urls[index], 0)
- current += sizes[index]
- sum_size_by_url[urls[index]] = current
+for key in unique:
+ sum_size_by_url[key] = sum(unique[key])
print(len(largest_size_by_url))
@@ -122,4 +149,4 @@ for size in sizes:
itr += 1
plt.plot(itrs, running_means)
-#plt.show()
+plt.show()
diff --git a/python/youtube/results/size_analysis_results.txt b/python/youtube/results/size_analysis_results.txt
@@ -0,0 +1,8 @@
+URLS Parsed: 615222
+Duplicate Video Options Removed: 1997786
+Non-Duplicate Videos: 8259177
+615222
+Average video size across all resolutions for each video: 107.61747504914014 MB
+Average largest video size across videos: 396.17368260710157 MB
+Average sum size across all videos and formats: 1.4447334047285894 GB
+Sum of all videos queried: 888.8317747239322 TB