blog

Personal blog
git clone git://git.laack.co/blog.git
Log | Files | Refs

size_analysis.py (3092B)


      1 import os
      2 import matplotlib.pyplot as plt
      3 from pathlib import Path
      4 import pandas as pd
      5 
      6 def convert_to_bytes(str_size : str):
      7 
      8     split = str_size.split("M")
      9 
     10     if len(split) == 1:
     11         split = str_size.split("G")
     12 
     13     size = 0
     14 
     15     if str_size.__contains__("GiB"):
     16         val = float(split[0])
     17         size = val * 1073741824
     18 
     19     if str_size.__contains__("MiB"):
     20         val = float(split[0])
     21         size = val * 1048576
     22 
     23     return size
     24 
     25 URLS_DIR = Path("../urls")
     26 
     27 
     28 if URLS_DIR.exists():
     29     files = os.listdir(URLS_DIR)
     30 else:
     31     raise Exception(str(URLS_DIR) + "is empty...")
     32 
     33 filtered = []
     34 
     35 for file in files:
     36     if file.endswith(".csv") and file.__contains__("with_size"):
     37         filtered.append(URLS_DIR / file)
     38 
     39 if not filtered:
     40     raise Exception("No applicable files found")
     41 
     42 all = []
     43 
     44 for file in filtered:
     45     ls = pd.read_csv(file).values.tolist()
     46     all.extend(ls)
     47 
     48 last = ""
     49 unique = {}
     50 dupes = 0
     51 count = 0
     52 
     53 for ele in all:
     54     if ele[0] == last:
     55         unique[last].append(convert_to_bytes(ele[1]))
     56         count += 1
     57         continue
     58 
     59     # dupe
     60     if ele[0] in unique:
     61         last = ""
     62         dupes += 1
     63         continue
     64 
     65     last = ele[0]
     66     unique[last] = [convert_to_bytes(ele[1])]
     67     count += 1
     68 
     69 print("URLS Parsed: " + str(len(unique)))
     70 print("Duplicate Video Options Removed: " + str(dupes))
     71 print("Non-Duplicate Videos: " + str(count))
     72 
     73 all = []
     74 for key in unique:
     75     ls = unique[key]
     76     all.extend(ls)
     77 
     78 sizes = all
     79 urls = unique.keys()
     80 
     81 def bytes_to_larger(count_bytes : float):
     82     cuts = 0
     83     while count_bytes > 1000:
     84         count_bytes /= 1000
     85         cuts += 1
     86 
     87     suffix = "B"
     88 
     89     if cuts == 1:
     90         suffix = 'KB'
     91     elif cuts == 2:
     92         suffix = 'MB'
     93     elif cuts == 3:
     94         suffix = "GB"
     95     elif cuts == 4:
     96         suffix = "TB"
     97     elif cuts == 5:
     98         suffix = "PB"
     99     elif cuts == 6:
    100         suffix = "EB"
    101 
    102     return str(count_bytes) + " " + suffix
    103 
    104 largest_size_by_url = {}
    105 
    106 for key in unique:
    107     largest = max(unique[key])
    108     largest_size_by_url[key] = largest
    109 
    110 sum_size_by_url = {}
    111 for key in unique:
    112     sum_size_by_url[key] = sum(unique[key])
    113 
    114 print(len(largest_size_by_url))
    115 
    116 def mean(sizes):
    117     mean_val = sum(sizes) / len(sizes)
    118     return mean_val
    119 
    120 sum_bytes = sum(sizes)
    121 sum_str = bytes_to_larger(sum_bytes)
    122 
    123 mean_val = mean(sizes)
    124 mean_str = bytes_to_larger(mean_val)
    125 
    126 mean_sum_size = mean(sum_size_by_url.values())
    127 
    128 mean_val_largest = mean(largest_size_by_url.values())
    129 mean_str_largest = bytes_to_larger(mean_val_largest)
    130 mean_sum_str = bytes_to_larger(mean_sum_size)
    131 
    132 
    133 print("Average video size across all resolutions for each video: " + mean_str)
    134 print("Average largest video size across videos: " + mean_str_largest)
    135 
    136 print("Average sum size across all videos and formats: " + mean_sum_str)
    137 print("Sum of all videos queried: " + sum_str)
    138 
    139 
    140 running_means = []
    141 running_mean = 0
    142 itrs = []
    143 itr = 1
    144 
    145 for size in sizes:
    146     running_mean = running_mean + ((size - running_mean) / itr)
    147     running_means.append(running_mean)
    148     itrs.append(itr)
    149     itr += 1
    150 
    151 plt.plot(itrs, running_means)
    152 plt.show()