blog

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 58c7038c2dc8646fecdac690371111e06927fda7
parent 4bb1e36ba5c744ddef03412ab629926e969d7a4b
Author: andrew.laack <andrew.laack@imbue.com>
Date:   Mon, 15 Sep 2025 17:33:36 -0500

Added more scripts

Diffstat:
M python/youtube/youtube-scraping.py | 4 ++--
M python/youtube/yt-lots.py | 31 +++++++++++++++++--------------
2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/python/youtube/youtube-scraping.py b/python/youtube/youtube-scraping.py @@ -2833,7 +2833,7 @@ def get_recent_videos_for_term(term, max_videos=100): print(f" Got {len(videos)} so far for '{term}'...") final_videos = [] - with ThreadPoolExecutor(max_workers=20) as executor: + with ThreadPoolExecutor(max_workers=250) as executor: future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs) for title, url, secs, vid_id in videos[:max_videos]} for future in as_completed(future_to_vid): @@ -2844,7 +2844,7 @@ def get_recent_videos_for_term(term, max_videos=100): return final_videos all_rows = [] -MAX_CONCURRENT_TERMS = 25 +MAX_CONCURRENT_TERMS = 250 with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor: future_to_term = {executor.submit(get_recent_videos_for_term, term, 100): term for term in search_terms} diff --git a/python/youtube/yt-lots.py b/python/youtube/yt-lots.py @@ -7,7 +7,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import pandas as pd df = pd.read_csv('trends.csv') - search_terms = df['query'].to_list() print(len(search_terms)) @@ -50,24 +49,28 @@ def parse_videos(obj): results.extend(parse_videos(v)) return results -def fetch_highest_resolution(video_id): - return "Not getting res" +def fetch_resolution_and_bitrate(video_id): + return "Not including" url = f"https://www.youtube.com/watch?v={video_id}" try: resp = session.get(url, headers=HEADERS, timeout=10) resp.raise_for_status() match = re.search(r'ytInitialPlayerResponse\s*=\s*({.*?});', resp.text) if not match: - return "N/A" + return "N/A", "N/A" data = json.loads(match.group(1)) formats = data.get("streamingData", {}).get("formats", []) max_res = 0 + max_bitrate = 0 for f in formats: if "height" in f and f["height"] > max_res: max_res = f["height"] - return f"{max_res}p" if max_res else "N/A" + if "bitrate" in f and f["bitrate"] > max_bitrate: + max_bitrate = f["bitrate"] + return (f"{max_res}p" if max_res else "N/A", + 
f"{round(max_bitrate/1000)} kbps" if max_bitrate else "N/A") except Exception: - return "N/A" + return "N/A", "N/A" def fetch_more_results(continuation, api_key): try: @@ -127,20 +130,20 @@ def get_recent_videos_for_term(term, max_videos=10000): final_videos = [] with ThreadPoolExecutor(max_workers=200) as executor: - future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs) + future_to_vid = {executor.submit(fetch_resolution_and_bitrate, vid_id): (title, url, secs) for title, url, secs, vid_id in videos[:max_videos]} for future in as_completed(future_to_vid): title, url, secs = future_to_vid[future] - res = future.result() - final_videos.append((title, url, secs, res)) + res, bitrate = future.result() + final_videos.append((title, url, secs, res, bitrate)) return final_videos MAX_CONCURRENT_TERMS = 200 -with open("lots.csv", "w", newline="", encoding="utf-8") as f: +with open("lots-without-res.csv", "w", newline="", encoding="utf-8") as f: writer = csv.writer(f) - writer.writerow(["term", "title", "url", "duration_seconds", "highest_resolution"]) + writer.writerow(["term", "title", "url", "duration_seconds", "highest_resolution", "max_bitrate"]) with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor: future_to_term = { @@ -156,9 +159,9 @@ with open("lots.csv", "w", newline="", encoding="utf-8") as f: avg = sum(v[2] for v in vids) / len(vids) print(f"Average for '{term}': {avg/60:.2f} minutes over {len(vids)} videos") - rows = [(term, title, url, secs, res) for title, url, secs, res in vids] - writer.writerows(rows) # ✅ write in batch per term - f.flush() # 💾 ensure data is written to disk + rows = [(term, title, url, secs, res, bitrate) for title, url, secs, res, bitrate in vids] + writer.writerows(rows) + f.flush() else: print(f"No videos found for '{term}'.") except Exception as e: