commit 58c7038c2dc8646fecdac690371111e06927fda7
parent 4bb1e36ba5c744ddef03412ab629926e969d7a4b
Author: andrew.laack <andrew.laack@imbue.com>
Date: Mon, 15 Sep 2025 17:33:36 -0500
Raise scraper thread-pool limits to 250; add max-bitrate column to yt-lots output
Diffstat:
2 files changed, 19 insertions(+), 16 deletions(-)
diff --git a/python/youtube/youtube-scraping.py b/python/youtube/youtube-scraping.py
@@ -2833,7 +2833,7 @@ def get_recent_videos_for_term(term, max_videos=100):
print(f" Got {len(videos)} so far for '{term}'...")
final_videos = []
- with ThreadPoolExecutor(max_workers=20) as executor:
+ with ThreadPoolExecutor(max_workers=250) as executor:
future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs)
for title, url, secs, vid_id in videos[:max_videos]}
for future in as_completed(future_to_vid):
@@ -2844,7 +2844,7 @@ def get_recent_videos_for_term(term, max_videos=100):
return final_videos
all_rows = []
-MAX_CONCURRENT_TERMS = 25
+MAX_CONCURRENT_TERMS = 250
with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor:
future_to_term = {executor.submit(get_recent_videos_for_term, term, 100): term for term in search_terms}
diff --git a/python/youtube/yt-lots.py b/python/youtube/yt-lots.py
@@ -7,7 +7,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
df = pd.read_csv('trends.csv')
-
search_terms = df['query'].to_list()
print(len(search_terms))
@@ -50,24 +49,28 @@ def parse_videos(obj):
results.extend(parse_videos(v))
return results
-def fetch_highest_resolution(video_id):
- return "Not getting res"
+def fetch_resolution_and_bitrate(video_id):
+ return "Not including", "Not including"  # stub must be a (resolution, bitrate) pair: callers unpack two values; the fetch code below is unreachable while this return is in place
url = f"https://www.youtube.com/watch?v={video_id}"
try:
resp = session.get(url, headers=HEADERS, timeout=10)
resp.raise_for_status()
match = re.search(r'ytInitialPlayerResponse\s*=\s*({.*?});', resp.text)
if not match:
- return "N/A"
+ return "N/A", "N/A"
data = json.loads(match.group(1))
formats = data.get("streamingData", {}).get("formats", [])
max_res = 0
+ max_bitrate = 0
for f in formats:
if "height" in f and f["height"] > max_res:
max_res = f["height"]
- return f"{max_res}p" if max_res else "N/A"
+ if "bitrate" in f and f["bitrate"] > max_bitrate:
+ max_bitrate = f["bitrate"]
+ return (f"{max_res}p" if max_res else "N/A",
+ f"{round(max_bitrate/1000)} kbps" if max_bitrate else "N/A")
except Exception:
- return "N/A"
+ return "N/A", "N/A"
def fetch_more_results(continuation, api_key):
try:
@@ -127,20 +130,20 @@ def get_recent_videos_for_term(term, max_videos=10000):
final_videos = []
with ThreadPoolExecutor(max_workers=200) as executor:
- future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs)
+ future_to_vid = {executor.submit(fetch_resolution_and_bitrate, vid_id): (title, url, secs)
for title, url, secs, vid_id in videos[:max_videos]}
for future in as_completed(future_to_vid):
title, url, secs = future_to_vid[future]
- res = future.result()
- final_videos.append((title, url, secs, res))
+ res, bitrate = future.result()
+ final_videos.append((title, url, secs, res, bitrate))
return final_videos
MAX_CONCURRENT_TERMS = 200
-with open("lots.csv", "w", newline="", encoding="utf-8") as f:
+with open("lots-without-res.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
- writer.writerow(["term", "title", "url", "duration_seconds", "highest_resolution"])
+ writer.writerow(["term", "title", "url", "duration_seconds", "highest_resolution", "max_bitrate"])
with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor:
future_to_term = {
@@ -156,9 +159,9 @@ with open("lots.csv", "w", newline="", encoding="utf-8") as f:
avg = sum(v[2] for v in vids) / len(vids)
print(f"Average for '{term}': {avg/60:.2f} minutes over {len(vids)} videos")
- rows = [(term, title, url, secs, res) for title, url, secs, res in vids]
- writer.writerows(rows) # ✅ write in batch per term
- f.flush() # 💾 ensure data is written to disk
+ rows = [(term, title, url, secs, res, bitrate) for title, url, secs, res, bitrate in vids]
+ writer.writerows(rows)
+ f.flush()
else:
print(f"No videos found for '{term}'.")
except Exception as e: