commit 00097e7b854af12fc4eca332cf5662a76f13a37f parent dbe00cdbbbfb74d00c60c19fb938f55a73888ba9 Author: Andrew Laack <andrew@laack.co> Date: Thu, 18 Sep 2025 23:51:16 -0500 Merge branch 'master' of ssh://brgr:/home/shared/git/public-repos/blog Diffstat:
| A | python/youtube/analysis/size_analysis.py | | | 105 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| R | python/youtube/analysis/urls.py -> python/youtube/analysis/urls_parsing.py | | | 0 |
2 files changed, 105 insertions(+), 0 deletions(-)
"""Summarise the video sizes recorded in the ``with_size`` CSV files.

Source file: python/youtube/analysis/size_analysis.py

When run as a script this module:

1. collects every ``*.csv`` file in ``URLS_DIR`` whose name contains
   ``with_size``;
2. prints ``DataFrame.describe()`` for each of them;
3. converts the textual ``size`` column (e.g. ``"12.3MiB"``) to bytes;
4. prints the mean and the total size in a human-readable form; and
5. plots the running mean of the sizes.

All work happens in :func:`main`, behind an ``if __name__ == "__main__"``
guard, so the helper functions can be imported without touching the disk or
opening a plot window.
"""

import os
import re
from pathlib import Path

import pandas as pd

# NOTE: relative path, so it is resolved against the current working
# directory, not against the location of this file.
URLS_DIR = Path("../urls")

# Multipliers for the binary (IEC) prefixes; "" is a plain byte count.
_BINARY_MULTIPLIERS = {
    "": 1,
    "Ki": 1024,
    "Mi": 1024 ** 2,
    "Gi": 1024 ** 3,
    "Ti": 1024 ** 4,
}

# A decimal number immediately followed by an optional IEC prefix and "B",
# e.g. "512KiB", "1.5 GiB" or "100B".
_SIZE_PATTERN = re.compile(r"(\d+(?:\.\d*)?|\.\d+)\s*(Ki|Mi|Gi|Ti)?B")

# Decimal (power-of-1000) suffixes used when pretty-printing byte counts.
_DECIMAL_SUFFIXES = ("B", "KB", "MB", "GB", "TB", "PB", "EB")


def convert_to_bytes(str_size: str) -> float:
    """Convert a textual size such as ``"1.5GiB"`` into a number of bytes.

    Recognised units are ``B``, ``KiB``, ``MiB``, ``GiB`` and ``TiB``
    (powers of 1024).

    Args:
        str_size: Text containing a number directly followed by a unit.

    Returns:
        The size in bytes as a float, or ``0`` when no recognised
        ``<number><unit>`` pair is found in *str_size*.
    """
    match = _SIZE_PATTERN.search(str_size)
    if match is None:
        # Unrecognised text contributes nothing rather than aborting the run.
        return 0

    number, prefix = match.groups()
    return float(number) * _BINARY_MULTIPLIERS[prefix or ""]


def bytes_to_larger(count_bytes: float) -> str:
    """Render a byte count using the largest fitting decimal unit.

    The value is divided by 1000 until its magnitude is below 1000 or the
    largest known suffix (``EB``) is reached.

    Args:
        count_bytes: Number of bytes.

    Returns:
        ``"<value> <suffix>"``, e.g. ``"1.5 KB"``. The value is not rounded.
    """
    index = 0
    largest = len(_DECIMAL_SUFFIXES) - 1

    # ">=" so that exactly 1000 bytes becomes "1.0 KB"; the index bound keeps
    # huge values on the largest suffix instead of running off the table.
    while abs(count_bytes) >= 1000 and index < largest:
        count_bytes /= 1000
        index += 1

    return f"{count_bytes} {_DECIMAL_SUFFIXES[index]}"


def mean(sizes):
    """Return the arithmetic mean of *sizes*.

    Args:
        sizes: A sized collection of numbers.

    Raises:
        ValueError: If *sizes* is empty.
    """
    # len() instead of truthiness so array-like inputs work as well.
    if len(sizes) == 0:
        raise ValueError("cannot compute the mean of an empty collection")
    return sum(sizes) / len(sizes)


def _find_size_files(directory):
    """Return the sorted ``with_size`` CSV paths found in *directory*."""
    if not directory.exists():
        raise FileNotFoundError(str(directory) + " does not exist")

    # Sorted so the processing order does not depend on os.listdir().
    matches = sorted(
        directory / name
        for name in os.listdir(directory)
        if name.endswith(".csv") and "with_size" in name
    )
    if not matches:
        raise Exception("No applicable files found")
    return matches


def _load_sizes(paths):
    """Print a summary of each CSV and return all of their sizes in bytes."""
    sizes = []
    for path in paths:
        frame = pd.read_csv(path)
        print(frame.describe())
        # Rows without a size are skipped; sizes from every file are kept.
        sizes.extend(frame["size"].dropna().apply(convert_to_bytes).to_list())
    return sizes


def _running_means(sizes):
    """Return the running mean after each element of *sizes*."""
    means = []
    current = 0
    for count, size in enumerate(sizes, start=1):
        # Incremental update; avoids re-summing the prefix on every step.
        current += (size - current) / count
        means.append(current)
    return means


def main() -> None:
    """Load the sizes, print the mean and total, and plot the running mean."""
    sizes = _load_sizes(_find_size_files(URLS_DIR))

    print("Average video size: " + bytes_to_larger(mean(sizes)))
    print("Sum of all videos queried: " + bytes_to_larger(sum(sizes)))

    # Imported here so the helpers above can be used without matplotlib.
    import matplotlib.pyplot as plt

    means = _running_means(sizes)
    plt.plot(list(range(1, len(means) + 1)), means)
    plt.show()


if __name__ == "__main__":
    main()
a/python/youtube/analysis/urls.py b/python/youtube/analysis/urls_parsing.py