size_analysis.py (3092B)
# Summarise the sizes recorded in the "*with_size*.csv" files under URLS_DIR:
# group size rows by URL, drop repeated URLs, print aggregate statistics and
# plot the running mean of all kept sizes.
import os
from pathlib import Path

import pandas as pd

URLS_DIR = Path("../urls")

# Multipliers for the binary-unit suffixes understood by convert_to_bytes.
_BINARY_UNITS = (
    ("KiB", 1024),
    ("MiB", 1048576),
    ("GiB", 1073741824),
    ("TiB", 1099511627776),
)

# Suffixes used by bytes_to_larger, indexed by the number of /1000 steps.
_DECIMAL_SUFFIXES = ("B", "KB", "MB", "GB", "TB", "PB", "EB")


def convert_to_bytes(str_size: str) -> float:
    """Convert a size string such as ``"12.5MiB"`` to a number of bytes.

    Args:
        str_size: A number immediately followed by ``KiB``, ``MiB``, ``GiB``
            or ``TiB``.

    Returns:
        The size in bytes, or ``0.0`` when no known unit is present (the
        historical behaviour for unrecognised input).

    Raises:
        ValueError: If the text in front of the unit is not a valid number.
    """
    for unit, factor in _BINARY_UNITS:
        number, found, _ = str_size.partition(unit)
        if found:
            return float(number) * factor
    return 0.0


def bytes_to_larger(count_bytes: float) -> str:
    """Format a byte count using the largest fitting decimal (1000-based) unit.

    Args:
        count_bytes: Number of bytes.

    Returns:
        ``"<value> <suffix>"`` where the suffix is one of B, KB, ... EB.
    """
    cuts = 0
    # ">=" so that exactly 1000 B becomes "1.0 KB"; the index cap keeps values
    # beyond the exabyte range labelled "EB" instead of running off the table.
    while count_bytes >= 1000 and cuts < len(_DECIMAL_SUFFIXES) - 1:
        count_bytes /= 1000
        cuts += 1
    return f"{count_bytes} {_DECIMAL_SUFFIXES[cuts]}"


def mean(sizes) -> float:
    """Return the arithmetic mean of a sized collection of numbers.

    Raises:
        ZeroDivisionError: If ``sizes`` is empty.
    """
    return sum(sizes) / len(sizes)


def _find_size_csvs(urls_dir):
    """Return the paths of the ``*with_size*.csv`` files directly in urls_dir."""
    if not urls_dir.exists():
        raise FileNotFoundError(f"{urls_dir} does not exist")

    # os.listdir order is arbitrary; sorting makes the de-duplication (which
    # depends on row order) reproducible from run to run.
    paths = [
        urls_dir / name
        for name in sorted(os.listdir(urls_dir))
        if name.endswith(".csv") and "with_size" in name
    ]
    if not paths:
        raise FileNotFoundError("No applicable files found")
    return paths


def _load_rows(paths):
    """Read every CSV and return all of their rows as one list of lists."""
    rows = []
    for path in paths:
        rows.extend(pd.read_csv(path).values.tolist())
    return rows


def _group_sizes_by_url(rows):
    """Group the size column (index 1) by the URL column (index 0).

    Consecutive rows with the same URL form one group. When a URL that already
    has a group shows up again later, those rows are counted as duplicates and
    skipped.

    Returns:
        ``(sizes_by_url, duplicate_rows, kept_rows)``.
    """
    sizes_by_url = {}
    current = None  # URL of the group being extended; None right after a dupe
    duplicate_rows = 0
    kept_rows = 0

    for row in rows:
        url, size_text = row[0], row[1]
        if current is not None and url == current:
            sizes_by_url[current].append(convert_to_bytes(size_text))
            kept_rows += 1
        elif url in sizes_by_url:
            # Seen in an earlier, already closed group: drop the row.
            current = None
            duplicate_rows += 1
        else:
            current = url
            sizes_by_url[url] = [convert_to_bytes(size_text)]
            kept_rows += 1

    return sizes_by_url, duplicate_rows, kept_rows


def _running_means(values):
    """Return the mean of the first k values for every k = 1..len(values)."""
    means = []
    current = 0
    for index, value in enumerate(values, start=1):
        # Incremental update: avoids re-summing the prefix at every step.
        current += (value - current) / index
        means.append(current)
    return means


def _plot_running_mean(values):
    """Show a line plot of the running mean of values."""
    # Imported here so the helpers above stay importable without matplotlib.
    import matplotlib.pyplot as plt

    means = _running_means(values)
    plt.plot(range(1, len(means) + 1), means)
    plt.show()


def main():
    """Load the size CSVs, print the summary statistics and show the plot."""
    rows = _load_rows(_find_size_csvs(URLS_DIR))
    sizes_by_url, duplicate_rows, kept_rows = _group_sizes_by_url(rows)

    print("URLS Parsed: " + str(len(sizes_by_url)))
    print("Duplicate Video Options Removed: " + str(duplicate_rows))
    print("Non-Duplicate Videos: " + str(kept_rows))

    if not sizes_by_url:
        # Fail with a clear message instead of a ZeroDivisionError in mean().
        raise ValueError("No size rows found in the applicable files")

    sizes = [size for group in sizes_by_url.values() for size in group]
    largest_size_by_url = {url: max(group) for url, group in sizes_by_url.items()}
    sum_size_by_url = {url: sum(group) for url, group in sizes_by_url.items()}

    print(len(largest_size_by_url))

    sum_str = bytes_to_larger(sum(sizes))
    mean_str = bytes_to_larger(mean(sizes))
    mean_str_largest = bytes_to_larger(mean(largest_size_by_url.values()))
    mean_sum_str = bytes_to_larger(mean(sum_size_by_url.values()))

    print("Average video size across all resolutions for each video: " + mean_str)
    print("Average largest video size across videos: " + mean_str_largest)

    print("Average sum size across all videos and formats: " + mean_sum_str)
    print("Sum of all videos queried: " + sum_str)

    _plot_running_mean(sizes)


if __name__ == "__main__":
    main()