commit 751210ca6a7e034224677b112802e1fddd5d54c4 parent 65b0744793d84be90e7f045fad0a7df4c9c1a9c5 Author: Andrew Laack <andrew@laack.co> Date: Tue, 16 Sep 2025 19:52:11 -0500 Finished basic scraping Diffstat:
11 files changed, 1022 insertions(+), 89 deletions(-)
diff --git a/python/youtube/analysis.py b/python/youtube/analysis.py @@ -1,50 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt -import os -import pickle - -files = [os.path.join(dp, f) for dp, _, filenames in os.walk("results") for f in filenames if os.path.splitext(f)[1] == '.csv'] -durations = [] -seen_urls = set() - -for file in files: - print(f'Processing {file}') - df = pd.read_csv(f'{file}', low_memory=False) - - if 'url' in df.columns: - initial_count = len(df) - - df = df[~df['url'].isin(seen_urls)] - - df = df.drop_duplicates(subset=['url'], keep='first') - - new_urls = df['url'].dropna() - seen_urls.update(new_urls) - - print(f"Removed {initial_count - len(df)} duplicate URLs") - else: - print("Warning: 'url' column not found in this file") - - duration = df['duration_seconds'] - - duration = duration.dropna() - - duration = pd.to_numeric(duration, errors='coerce') - - duration = duration.dropna() - - duration = duration[duration == duration.astype(int)] - - duration = duration.astype(int) - - duration_list = duration.to_list() - durations.extend(duration_list) - print(f"Processed {len(duration_list)} valid integer rows") - -print(f"Total unique URLs processed: {len(seen_urls)}") -print(f"Mean: {sum(durations) / len(durations)}") -print(f"Total valid durations: {len(durations)}") -plt.hist(durations, range=(0, 1000), bins=100) -plt.savefig("durations_histogram.png") -with open('durations.pkl', 'wb') as f: - pickle.dump(durations, f) diff --git a/python/youtube/analysis.txt b/python/youtube/analysis.txt @@ -1,39 +0,0 @@ -Processing results/no-resolution/lots1.csv -Removed 978780 duplicate URLs -Processed 1509890 valid integer rows -Processing results/no-resolution/res1.csv -Removed 54421 duplicate URLs -Processed 208935 valid integer rows -Processing results/no-resolution/res_res.csv -Removed 243218 duplicate URLs -Processed 19945 valid integer rows -Processing results/no-resolution/res983074.csv -Removed 252218 duplicate URLs -Processed 4134 valid integer rows -Processing results/no-resolution/res_8.csv -Removed 2353544 duplicate URLs -Processed 121553 valid integer rows -Processing results/no-resolution/results-only-lengths.csv -Removed 362964 duplicate URLs -Processed 615173 valid integer rows -Processing results/no-resolution/lots.csv -Removed 6847717 duplicate URLs -Processed 5111429 valid integer rows -Processing results/no-resolution/results-comp1.csv -Removed 2466923 duplicate URLs -Processed 21657 valid integer rows -Processing results/no-resolution/results.csv -Removed 262076 duplicate URLs -Processed 1382 valid integer rows -Processing results/max-bitrate/lots-with-bitrate.csv -Removed 205695 duplicate URLs -Processed 39729 valid integer rows -Processing results/maybe-resolution/lots.csv -Removed 12415 duplicate URLs -Processed 432 valid integer rows -Processing results/maybe-resolution/res_some_res.csv -Removed 6795 duplicate URLs -Processed 2115 valid integer rows -Total unique URLs processed: 7656374 -Mean: 666.1375179686886 -Total valid durations: 7656374 diff --git a/python/youtube/analysis/bitrate.py b/python/youtube/analysis/bitrate.py @@ -0,0 +1,65 @@ +import pandas as pd +import matplotlib.pyplot as plt +import os +from collections import Counter + +# Find all CSV files recursively under "results" +files = [os.path.join(dp, f) for dp, _, filenames in os.walk("results") for f in filenames if os.path.splitext(f)[1] == '.csv'] + +resolutions = [] +seen_urls = set() +invalid_values = {"Not getting res", "Not Used In This Run", "N/A"} + + +for file in files: + print(f'Processing {file}') + df = pd.read_csv(file, low_memory=False) + + if 'url' not in df.columns: + print("Warning: 'url' column not found in this file") + continue + + if 'max_bitrate' not in df.columns: + print("Warning: 'max_bitrate' column not found in this file") + continue + + initial_count = len(df) + + # Remove duplicates based on 'url' + df = df[~df['url'].isin(seen_urls)] + df = df.drop_duplicates(subset=['url'], keep='first') + + new_urls = df['url'].dropna() + seen_urls.update(new_urls) + + print(f"Removed {initial_count - len(df)} duplicate URLs") + + # Clean and collect max_bitrate values + resolution = df['max_bitrate'].dropna().astype(str).str.strip() + resolution = resolution[~resolution.isin(invalid_values)] + resolution = resolution[resolution != ''] + resolution_list = resolution.tolist() + resolutions.extend(resolution_list) + + print(f"Processed {len(resolution_list)} valid resolution rows") + +# Count occurrences +resolution_counts = Counter(resolutions) +sorted_resolutions = sorted(resolution_counts.items(), key=lambda x: (-x[1], x[0])) + +# Summary +print(f"Total unique URLs processed: {len(seen_urls)}") +print(f"Total valid max_bitrate entries: {len(resolutions)}") +for res, count in sorted_resolutions: + print(f"{res}: {count}") + +# Plot +res_labels, res_values = zip(*sorted_resolutions) +plt.figure(figsize=(12, 6)) +plt.bar(res_labels, res_values, color='salmon') +plt.xlabel('Max Bitrate') +plt.ylabel('Count') +plt.title('Max Bitrate Distribution') +plt.xticks(rotation=45) +plt.tight_layout() +plt.savefig("max_bitrate_histogram.png") diff --git a/python/youtube/analysis/duration.py b/python/youtube/analysis/duration.py @@ -0,0 +1,47 @@ +import pandas as pd +import matplotlib.pyplot as plt +import os + +files = [os.path.join(dp, f) for dp, _, filenames in os.walk("results") for f in filenames if os.path.splitext(f)[1] == '.csv'] +durations = [] +seen_urls = set() + +for file in files: + print(f'Processing {file}') + df = pd.read_csv(f'{file}', low_memory=False) + + if 'url' in df.columns: + initial_count = len(df) + + df = df[~df['url'].isin(seen_urls)] + + df = df.drop_duplicates(subset=['url'], keep='first') + + new_urls = df['url'].dropna() + seen_urls.update(new_urls) + + print(f"Removed {initial_count - len(df)} duplicate URLs") + else: + print("Warning: 'url' column not found in this file") + + duration = df['duration_seconds'] + + duration = duration.dropna() + + duration = pd.to_numeric(duration, errors='coerce') + + duration = duration.dropna() + + duration = duration[duration == duration.astype(int)] + + duration = duration.astype(int) + + duration_list = duration.to_list() + durations.extend(duration_list) + print(f"Processed {len(duration_list)} valid integer rows") + +print(f"Total unique URLs processed: {len(seen_urls)}") +print(f"Mean: {sum(durations) / len(durations)}") +print(f"Total valid durations: {len(durations)}") +plt.hist(durations, range=(0, 1000), bins=100) +plt.savefig("durations_histogram.png") diff --git a/python/youtube/bitrate.txt b/python/youtube/bitrate.txt @@ -0,0 +1,853 @@ +Processing results/max-bitrate/wores.csv +Removed 635 duplicate URLs +Processed 908 valid resolution rows +Processing results/max-bitrate/lots-without-res2.csv +Removed 1080 duplicate URLs +Processed 366 valid resolution rows +Processing results/max-bitrate/lots-without-res1.csv +Removed 53185 duplicate URLs +Processed 566 valid resolution rows +Processing results/max-bitrate/lots-with-bitrate.csv +Removed 92281 duplicate URLs +Processed 785 valid resolution rows +Processing results/max-bitrate/lots-without-res.csv +Removed 243018 duplicate URLs +Processed 2431 valid resolution rows +Processing results/max-bitrate/9-16-2025-B.csv +Warning: 'max_bitrate' column not found in this file +Processing results/max-bitrate/9-16-2025-A.csv +Warning: 'max_bitrate' column not found in this file +Processing results/no-resolution/results.csv +Warning: 'max_bitrate' column not found in this file +Processing results/no-resolution/res_res.csv +Warning: 'max_bitrate' column not found in this file +Processing results/no-resolution/lots.csv +Warning: 'max_bitrate' column not found in this file +Processing results/no-resolution/lots1.csv +Warning: 'max_bitrate' column not found in this file +Processing results/no-resolution/results-comp1.csv +Warning: 'max_bitrate' column not found in this file +Processing results/no-resolution/results-only-lengths.csv +Warning: 'max_bitrate' column not found in this file +Processing results/no-resolution/res1.csv +Warning: 'max_bitrate' column not found in this file +Processing results/no-resolution/res_8.csv +Warning: 'max_bitrate' column not found in this file +Processing results/no-resolution/res983074.csv +Warning: 'max_bitrate' column not found in this file +Processing results/maybe-resolution/lots.csv +Warning: 'max_bitrate' column not found in this file +Processing results/maybe-resolution/res_some_res.csv +Warning: 'max_bitrate' column not found in this file +Total unique URLs processed: 602691 +Total valid max_bitrate entries: 5056 +597 kbps: 162 +598 kbps: 154 +596 kbps: 143 +599 kbps: 140 +600 kbps: 121 +601 kbps: 110 +595 kbps: 108 +602 kbps: 98 +594 kbps: 93 +603 kbps: 63 +604 kbps: 46 +593 kbps: 45 +605 kbps: 38 +606 kbps: 38 +592 kbps: 27 +607 kbps: 27 +608 kbps: 27 +610 kbps: 26 +591 kbps: 24 +609 kbps: 19 +611 kbps: 18 +383 kbps: 16 +585 kbps: 16 +309 kbps: 15 +590 kbps: 14 +207 kbps: 13 +310 kbps: 13 +373 kbps: 13 +484 kbps: 13 +588 kbps: 13 +612 kbps: 13 +399 kbps: 12 +423 kbps: 12 +536 kbps: 12 +575 kbps: 12 +738 kbps: 12 +321 kbps: 11 +350 kbps: 11 +371 kbps: 11 +384 kbps: 11 +415 kbps: 11 +419 kbps: 11 +422 kbps: 11 +476 kbps: 11 +526 kbps: 11 +541 kbps: 11 +550 kbps: 11 +574 kbps: 11 +587 kbps: 11 +613 kbps: 11 +617 kbps: 11 +620 kbps: 11 +624 kbps: 11 +285 kbps: 10 +322 kbps: 10 +327 kbps: 10 +336 kbps: 10 +358 kbps: 10 +380 kbps: 10 +388 kbps: 10 +393 kbps: 10 +397 kbps: 10 +429 kbps: 10 +441 kbps: 10 +456 kbps: 10 +497 kbps: 10 +507 kbps: 10 +513 kbps: 10 +514 kbps: 10 +551 kbps: 10 +553 kbps: 10 +554 kbps: 10 +564 kbps: 10 +580 kbps: 10 +650 kbps: 10 +209 kbps: 9 +296 kbps: 9 +325 kbps: 9 +328 kbps: 9 +349 kbps: 9 +357 kbps: 9 +363 kbps: 9 +366 kbps: 9 +394 kbps: 9 +425 kbps: 9 +427 kbps: 9 +431 kbps: 9 +433 kbps: 9 +437 kbps: 9 +463 kbps: 9 +485 kbps: 9 +491 kbps: 9 +503 kbps: 9 +505 kbps: 9 +508 kbps: 9 +511 kbps: 9 +523 kbps: 9 +543 kbps: 9 +545 kbps: 9 +548 kbps: 9 +569 kbps: 9 +589 kbps: 9 +618 kbps: 9 +621 kbps: 9 +629 kbps: 9 +631 kbps: 9 +646 kbps: 9 +688 kbps: 9 +208 kbps: 8 +256 kbps: 8 +257 kbps: 8 +286 kbps: 8 +287 kbps: 8 +295 kbps: 8 +306 kbps: 8 +334 kbps: 8 +335 kbps: 8 +337 kbps: 8 +338 kbps: 8 +356 kbps: 8 +360 kbps: 8 +362 kbps: 8 +387 kbps: 8 +389 kbps: 8 +391 kbps: 8 +400 kbps: 8 +403 kbps: 8 +411 kbps: 8 +421 kbps: 8 +426 kbps: 8 +430 kbps: 8 +444 kbps: 8 +445 kbps: 8 +455 kbps: 8 +487 kbps: 8 +490 kbps: 8 +493 kbps: 8 +498 kbps: 8 +517 kbps: 8 +531 kbps: 8 +542 kbps: 8 +544 kbps: 8 +546 kbps: 8 +547 kbps: 8 +568 kbps: 8 +622 kbps: 8 +626 kbps: 8 +632 kbps: 8 +644 kbps: 8 +671 kbps: 8 +692 kbps: 8 +733 kbps: 8 +734 kbps: 8 +248 kbps: 7 +260 kbps: 7 +273 kbps: 7 +294 kbps: 7 +300 kbps: 7 +302 kbps: 7 +307 kbps: 7 +313 kbps: 7 +314 kbps: 7 +316 kbps: 7 +317 kbps: 7 +323 kbps: 7 +326 kbps: 7 +342 kbps: 7 +367 kbps: 7 +376 kbps: 7 +392 kbps: 7 +398 kbps: 7 +407 kbps: 7 +409 kbps: 7 +428 kbps: 7 +436 kbps: 7 +442 kbps: 7 +443 kbps: 7 +448 kbps: 7 +452 kbps: 7 +458 kbps: 7 +465 kbps: 7 +467 kbps: 7 +470 kbps: 7 +471 kbps: 7 +473 kbps: 7 +474 kbps: 7 +479 kbps: 7 +480 kbps: 7 +481 kbps: 7 +494 kbps: 7 +504 kbps: 7 +510 kbps: 7 +516 kbps: 7 +518 kbps: 7 +521 kbps: 7 +527 kbps: 7 +528 kbps: 7 +538 kbps: 7 +559 kbps: 7 +560 kbps: 7 +582 kbps: 7 +615 kbps: 7 +623 kbps: 7 +628 kbps: 7 +634 kbps: 7 +640 kbps: 7 +647 kbps: 7 +649 kbps: 7 +677 kbps: 7 +687 kbps: 7 +843 kbps: 7 +204 kbps: 6 +245 kbps: 6 +279 kbps: 6 +298 kbps: 6 +315 kbps: 6 +329 kbps: 6 +330 kbps: 6 +333 kbps: 6 +340 kbps: 6 +344 kbps: 6 +346 kbps: 6 +348 kbps: 6 +355 kbps: 6 +361 kbps: 6 +364 kbps: 6 +370 kbps: 6 +379 kbps: 6 +382 kbps: 6 +395 kbps: 6 +401 kbps: 6 +402 kbps: 6 +405 kbps: 6 +408 kbps: 6 +412 kbps: 6 +414 kbps: 6 +416 kbps: 6 +420 kbps: 6 +424 kbps: 6 +438 kbps: 6 +439 kbps: 6 +440 kbps: 6 +446 kbps: 6 +453 kbps: 6 +462 kbps: 6 +466 kbps: 6 +468 kbps: 6 +478 kbps: 6 +486 kbps: 6 +496 kbps: 6 +499 kbps: 6 +500 kbps: 6 +501 kbps: 6 +506 kbps: 6 +524 kbps: 6 +535 kbps: 6 +537 kbps: 6 +539 kbps: 6 +552 kbps: 6 +557 kbps: 6 +561 kbps: 6 +562 kbps: 6 +571 kbps: 6 +572 kbps: 6 +581 kbps: 6 +583 kbps: 6 +584 kbps: 6 +586 kbps: 6 +619 kbps: 6 +625 kbps: 6 +638 kbps: 6 +642 kbps: 6 +654 kbps: 6 +656 kbps: 6 +673 kbps: 6 +680 kbps: 6 +691 kbps: 6 +698 kbps: 6 +701 kbps: 6 +703 kbps: 6 +714 kbps: 6 +717 kbps: 6 +728 kbps: 6 +729 kbps: 6 +730 kbps: 6 +737 kbps: 6 +739 kbps: 6 +746 kbps: 6 +761 kbps: 6 +772 kbps: 6 +845 kbps: 6 +103 kbps: 5 +146 kbps: 5 +159 kbps: 5 +182 kbps: 5 +199 kbps: 5 +201 kbps: 5 +202 kbps: 5 +205 kbps: 5 +222 kbps: 5 +231 kbps: 5 +243 kbps: 5 +247 kbps: 5 +255 kbps: 5 +266 kbps: 5 +269 kbps: 5 +274 kbps: 5 +275 kbps: 5 +280 kbps: 5 +293 kbps: 5 +301 kbps: 5 +303 kbps: 5 +304 kbps: 5 +311 kbps: 5 +319 kbps: 5 +324 kbps: 5 +347 kbps: 5 +353 kbps: 5 +365 kbps: 5 +368 kbps: 5 +369 kbps: 5 +372 kbps: 5 +381 kbps: 5 +390 kbps: 5 +396 kbps: 5 +404 kbps: 5 +410 kbps: 5 +418 kbps: 5 +432 kbps: 5 +434 kbps: 5 +435 kbps: 5 +447 kbps: 5 +449 kbps: 5 +450 kbps: 5 +454 kbps: 5 +460 kbps: 5 +464 kbps: 5 +469 kbps: 5 +472 kbps: 5 +475 kbps: 5 +482 kbps: 5 +492 kbps: 5 +509 kbps: 5 +525 kbps: 5 +532 kbps: 5 +549 kbps: 5 +556 kbps: 5 +565 kbps: 5 +567 kbps: 5 +573 kbps: 5 +579 kbps: 5 +616 kbps: 5 +627 kbps: 5 +633 kbps: 5 +636 kbps: 5 +637 kbps: 5 +648 kbps: 5 +651 kbps: 5 +655 kbps: 5 +658 kbps: 5 +670 kbps: 5 +672 kbps: 5 +674 kbps: 5 +696 kbps: 5 +699 kbps: 5 +713 kbps: 5 +719 kbps: 5 +723 kbps: 5 +732 kbps: 5 +836 kbps: 5 +839 kbps: 5 +844 kbps: 5 +854 kbps: 5 +856 kbps: 5 +108 kbps: 4 +123 kbps: 4 +138 kbps: 4 +163 kbps: 4 +175 kbps: 4 +188 kbps: 4 +211 kbps: 4 +213 kbps: 4 +215 kbps: 4 +220 kbps: 4 +226 kbps: 4 +229 kbps: 4 +235 kbps: 4 +236 kbps: 4 +237 kbps: 4 +242 kbps: 4 +254 kbps: 4 +258 kbps: 4 +264 kbps: 4 +271 kbps: 4 +283 kbps: 4 +291 kbps: 4 +312 kbps: 4 +331 kbps: 4 +341 kbps: 4 +352 kbps: 4 +359 kbps: 4 +377 kbps: 4 +406 kbps: 4 +417 kbps: 4 +451 kbps: 4 +457 kbps: 4 +459 kbps: 4 +489 kbps: 4 +502 kbps: 4 +512 kbps: 4 +515 kbps: 4 +522 kbps: 4 +555 kbps: 4 +558 kbps: 4 +566 kbps: 4 +570 kbps: 4 +576 kbps: 4 +577 kbps: 4 +614 kbps: 4 +630 kbps: 4 +635 kbps: 4 +643 kbps: 4 +653 kbps: 4 +660 kbps: 4 +661 kbps: 4 +669 kbps: 4 +675 kbps: 4 +678 kbps: 4 +693 kbps: 4 +694 kbps: 4 +695 kbps: 4 +706 kbps: 4 +708 kbps: 4 +711 kbps: 4 +731 kbps: 4 +757 kbps: 4 +758 kbps: 4 +762 kbps: 4 +774 kbps: 4 +788 kbps: 4 +803 kbps: 4 +811 kbps: 4 +830 kbps: 4 +847 kbps: 4 +850 kbps: 4 +871 kbps: 4 +874 kbps: 4 +109 kbps: 3 +114 kbps: 3 +129 kbps: 3 +136 kbps: 3 +139 kbps: 3 +140 kbps: 3 +141 kbps: 3 +164 kbps: 3 +165 kbps: 3 +167 kbps: 3 +173 kbps: 3 +174 kbps: 3 +179 kbps: 3 +180 kbps: 3 +181 kbps: 3 +185 kbps: 3 +192 kbps: 3 +193 kbps: 3 +195 kbps: 3 +216 kbps: 3 +224 kbps: 3 +233 kbps: 3 +234 kbps: 3 +240 kbps: 3 +241 kbps: 3 +251 kbps: 3 +259 kbps: 3 +261 kbps: 3 +263 kbps: 3 +270 kbps: 3 +277 kbps: 3 +281 kbps: 3 +282 kbps: 3 +288 kbps: 3 +305 kbps: 3 +308 kbps: 3 +318 kbps: 3 +339 kbps: 3 +343 kbps: 3 +354 kbps: 3 +375 kbps: 3 +378 kbps: 3 +413 kbps: 3 +477 kbps: 3 +483 kbps: 3 +519 kbps: 3 +520 kbps: 3 +530 kbps: 3 +533 kbps: 3 +563 kbps: 3 +578 kbps: 3 +645 kbps: 3 +663 kbps: 3 +665 kbps: 3 +667 kbps: 3 +668 kbps: 3 +679 kbps: 3 +681 kbps: 3 +682 kbps: 3 +684 kbps: 3 +685 kbps: 3 +690 kbps: 3 +700 kbps: 3 +704 kbps: 3 +705 kbps: 3 +71 kbps: 3 +710 kbps: 3 +712 kbps: 3 +718 kbps: 3 +727 kbps: 3 +735 kbps: 3 +736 kbps: 3 +743 kbps: 3 +755 kbps: 3 +756 kbps: 3 +769 kbps: 3 +777 kbps: 3 +780 kbps: 3 +787 kbps: 3 +792 kbps: 3 +800 kbps: 3 +801 kbps: 3 +808 kbps: 3 +823 kbps: 3 +827 kbps: 3 +834 kbps: 3 +840 kbps: 3 +841 kbps: 3 +848 kbps: 3 +849 kbps: 3 +853 kbps: 3 +861 kbps: 3 +867 kbps: 3 +869 kbps: 3 +894 kbps: 3 +113 kbps: 2 +116 kbps: 2 +119 kbps: 2 +134 kbps: 2 +135 kbps: 2 +1375 kbps: 2 +142 kbps: 2 +144 kbps: 2 +149 kbps: 2 +153 kbps: 2 +155 kbps: 2 +156 kbps: 2 +157 kbps: 2 +166 kbps: 2 +177 kbps: 2 +183 kbps: 2 +186 kbps: 2 +189 kbps: 2 +194 kbps: 2 +200 kbps: 2 +203 kbps: 2 +206 kbps: 2 +210 kbps: 2 +212 kbps: 2 +214 kbps: 2 +217 kbps: 2 +225 kbps: 2 +228 kbps: 2 +238 kbps: 2 +239 kbps: 2 +246 kbps: 2 +249 kbps: 2 +250 kbps: 2 +262 kbps: 2 +267 kbps: 2 +268 kbps: 2 +272 kbps: 2 +276 kbps: 2 +278 kbps: 2 +289 kbps: 2 +292 kbps: 2 +297 kbps: 2 +299 kbps: 2 +320 kbps: 2 +351 kbps: 2 +374 kbps: 2 +385 kbps: 2 +386 kbps: 2 +461 kbps: 2 +488 kbps: 2 +495 kbps: 2 +529 kbps: 2 +534 kbps: 2 +540 kbps: 2 +641 kbps: 2 +652 kbps: 2 +659 kbps: 2 +666 kbps: 2 +676 kbps: 2 +686 kbps: 2 +689 kbps: 2 +707 kbps: 2 +709 kbps: 2 +715 kbps: 2 +721 kbps: 2 +724 kbps: 2 +725 kbps: 2 +726 kbps: 2 +741 kbps: 2 +742 kbps: 2 +748 kbps: 2 +750 kbps: 2 +754 kbps: 2 +759 kbps: 2 +763 kbps: 2 +765 kbps: 2 +766 kbps: 2 +768 kbps: 2 +771 kbps: 2 +775 kbps: 2 +779 kbps: 2 +786 kbps: 2 +789 kbps: 2 +790 kbps: 2 +793 kbps: 2 +794 kbps: 2 +799 kbps: 2 +804 kbps: 2 +805 kbps: 2 +806 kbps: 2 +807 kbps: 2 +81 kbps: 2 +815 kbps: 2 +816 kbps: 2 +817 kbps: 2 +818 kbps: 2 +824 kbps: 2 +825 kbps: 2 +828 kbps: 2 +837 kbps: 2 +838 kbps: 2 +842 kbps: 2 +846 kbps: 2 +851 kbps: 2 +852 kbps: 2 +855 kbps: 2 +86 kbps: 2 +863 kbps: 2 +870 kbps: 2 +872 kbps: 2 +884 kbps: 2 +898 kbps: 2 +902 kbps: 2 +904 kbps: 2 +907 kbps: 2 +922 kbps: 2 +927 kbps: 2 +93 kbps: 2 +933 kbps: 2 +953 kbps: 2 +101 kbps: 1 +1018 kbps: 1 +1031 kbps: 1 +104 kbps: 1 +106 kbps: 1 +1060 kbps: 1 +1076 kbps: 1 +1090 kbps: 1 +112 kbps: 1 +1147 kbps: 1 +115 kbps: 1 +117 kbps: 1 +1183 kbps: 1 +120 kbps: 1 +122 kbps: 1 +124 kbps: 1 +1257 kbps: 1 +126 kbps: 1 +1269 kbps: 1 +127 kbps: 1 +1298 kbps: 1 +130 kbps: 1 +1306 kbps: 1 +1319 kbps: 1 +137 kbps: 1 +1371 kbps: 1 +1376 kbps: 1 +143 kbps: 1 +147 kbps: 1 +151 kbps: 1 +152 kbps: 1 +154 kbps: 1 +158 kbps: 1 +160 kbps: 1 +162 kbps: 1 +168 kbps: 1 +172 kbps: 1 +176 kbps: 1 +178 kbps: 1 +184 kbps: 1 +187 kbps: 1 +190 kbps: 1 +196 kbps: 1 +197 kbps: 1 +198 kbps: 1 +218 kbps: 1 +223 kbps: 1 +227 kbps: 1 +230 kbps: 1 +232 kbps: 1 +244 kbps: 1 +252 kbps: 1 +253 kbps: 1 +284 kbps: 1 +290 kbps: 1 +332 kbps: 1 +345 kbps: 1 +62 kbps: 1 +639 kbps: 1 +657 kbps: 1 +66 kbps: 1 +662 kbps: 1 +664 kbps: 1 +683 kbps: 1 +69 kbps: 1 +697 kbps: 1 +70 kbps: 1 +702 kbps: 1 +716 kbps: 1 +720 kbps: 1 +722 kbps: 1 +740 kbps: 1 +747 kbps: 1 +749 kbps: 1 +751 kbps: 1 +752 kbps: 1 +753 kbps: 1 +767 kbps: 1 +770 kbps: 1 +773 kbps: 1 +776 kbps: 1 +78 kbps: 1 +781 kbps: 1 +782 kbps: 1 +783 kbps: 1 +785 kbps: 1 +791 kbps: 1 +796 kbps: 1 +797 kbps: 1 +798 kbps: 1 +80 kbps: 1 +802 kbps: 1 +809 kbps: 1 +810 kbps: 1 +814 kbps: 1 +819 kbps: 1 +820 kbps: 1 +821 kbps: 1 +822 kbps: 1 +826 kbps: 1 +829 kbps: 1 +831 kbps: 1 +833 kbps: 1 +835 kbps: 1 +85 kbps: 1 +857 kbps: 1 +858 kbps: 1 +859 kbps: 1 +860 kbps: 1 +862 kbps: 1 +864 kbps: 1 +866 kbps: 1 +87 kbps: 1 +875 kbps: 1 +876 kbps: 1 +877 kbps: 1 +88 kbps: 1 +880 kbps: 1 +883 kbps: 1 +89 kbps: 1 +891 kbps: 1 +892 kbps: 1 +893 kbps: 1 +895 kbps: 1 +897 kbps: 1 +90 kbps: 1 +900 kbps: 1 +903 kbps: 1 +905 kbps: 1 +906 kbps: 1 +909 kbps: 1 +910 kbps: 1 +912 kbps: 1 +913 kbps: 1 +917 kbps: 1 +918 kbps: 1 +921 kbps: 1 +925 kbps: 1 +930 kbps: 1 +937 kbps: 1 +941 kbps: 1 +947 kbps: 1 +955 kbps: 1 +963 kbps: 1 +966 kbps: 1 +971 kbps: 1 +981 kbps: 1 +994 kbps: 1 diff --git a/python/youtube/duration.txt b/python/youtube/duration.txt @@ -0,0 +1,57 @@ +Processing results/max-bitrate/wores.csv +Removed 635 duplicate URLs +Processed 8236 valid integer rows +Processing results/max-bitrate/lots-without-res2.csv +Removed 1080 duplicate URLs +Processed 4764 valid integer rows +Processing results/max-bitrate/lots-without-res1.csv +Removed 53185 duplicate URLs +Processed 113645 valid integer rows +Processing results/max-bitrate/lots-with-bitrate.csv +Removed 92281 duplicate URLs +Processed 153143 valid integer rows +Processing results/max-bitrate/lots-without-res.csv +Removed 243018 duplicate URLs +Processed 322903 valid integer rows +Processing results/max-bitrate/9-16-2025-B.csv +Removed 35677 duplicate URLs +Processed 203894 valid integer rows +Processing results/max-bitrate/9-16-2025-A.csv +Removed 209832 duplicate URLs +Processed 28994 valid integer rows +Processing results/no-resolution/results.csv +Removed 168732 duplicate URLs +Processed 94726 valid integer rows +Processing results/no-resolution/res_res.csv +Removed 256549 duplicate URLs +Processed 6614 valid integer rows +Processing results/no-resolution/lots.csv +Removed 5772634 duplicate URLs +Processed 6186512 valid integer rows +Processing results/no-resolution/lots1.csv +Removed 2427588 duplicate URLs +Processed 61082 valid integer rows +Processing results/no-resolution/results-comp1.csv +Removed 2420972 duplicate URLs +Processed 67608 valid integer rows +Processing results/no-resolution/results-only-lengths.csv +Removed 428185 duplicate URLs +Processed 549952 valid integer rows +Processing results/no-resolution/res1.csv +Removed 252845 duplicate URLs +Processed 10511 valid integer rows +Processing results/no-resolution/res_8.csv +Removed 2458952 duplicate URLs +Processed 16145 valid integer rows +Processing results/no-resolution/res983074.csv +Removed 254962 duplicate URLs +Processed 1390 valid integer rows +Processing results/maybe-resolution/lots.csv +Removed 12554 duplicate URLs +Processed 293 valid integer rows +Processing results/maybe-resolution/res_some_res.csv +Removed 7894 duplicate URLs +Processed 1016 valid integer rows +Total unique URLs processed: 7831428 +Mean: 669.6658929891203 +Total valid durations: 7831428 diff --git a/python/youtube/total.py b/python/youtube/scraping/total.py diff --git a/python/youtube/youtube-scraping-only-lengths-comprehensive.py b/python/youtube/scraping/youtube-scraping-only-lengths-comprehensive.py diff --git a/python/youtube/youtube-scraping-only-lengths.py b/python/youtube/scraping/youtube-scraping-only-lengths.py diff --git a/python/youtube/youtube-scraping.py b/python/youtube/scraping/youtube-scraping.py diff --git a/python/youtube/yt-lots.py b/python/youtube/scraping/yt-lots.py