commit 4bb1e36ba5c744ddef03412ab629926e969d7a4b
parent 196d6ceab3231d6bbc58fa02ab9504bb1821fccc
Author: andrew.laack <andrew.laack@imbue.com>
Date: Sun, 14 Sep 2025 14:39:57 -0700
More tests
Diffstat:
3 files changed, 170 insertions(+), 3 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.csv
diff --git a/python/youtube/youtube-scraping-only-lengths-comprehensive.py b/python/youtube/youtube-scraping-only-lengths-comprehensive.py
@@ -1,3 +1,4 @@
+# https://www.kaggle.com/datasets/muhammedtausif/data-science-trends-on-google
import re
import json
import csv
@@ -74,7 +75,7 @@ def fetch_more_results(continuation, api_key):
except Exception:
return None, []
-def get_recent_videos_for_term(term, max_videos=100):
+def get_recent_videos_for_term(term, max_videos=1000):
url = f"https://www.youtube.com/results?search_query={term.replace(' ', '+')}&sp=CAI%253D"
print(f"\nFetching for term '{term}': {url}")
resp = session.get(url, headers=HEADERS)
@@ -109,7 +110,7 @@ def get_recent_videos_for_term(term, max_videos=100):
print(f" Got {len(videos)} so far for '{term}'...")
final_videos = []
- with ThreadPoolExecutor(max_workers=20) as executor:
+ with ThreadPoolExecutor(max_workers=200) as executor:
future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs)
for title, url, secs, vid_id in videos[:max_videos]}
for future in as_completed(future_to_vid):
@@ -123,7 +124,7 @@ all_rows = []
MAX_CONCURRENT_TERMS = 200
with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor:
- future_to_term = {executor.submit(get_recent_videos_for_term, term, 100): term for term in search_terms}
+ future_to_term = {executor.submit(get_recent_videos_for_term, term, 1000): term for term in search_terms}
for future in as_completed(future_to_term):
term = future_to_term[future]
try:
diff --git a/python/youtube/yt-lots.py b/python/youtube/yt-lots.py
@@ -0,0 +1,165 @@
+# https://www.kaggle.com/datasets/muhammedtausif/data-science-trends-on-google
+import re
+import json
+import csv
+import requests
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import pandas as pd
+
# Search terms are taken from the "query" column of the local trends CSV.
df = pd.read_csv('trends.csv')
search_terms = df['query'].to_list()
print(len(search_terms))

# A single requests session shared by every function in this script.
session = requests.Session()

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
    ),
}

# Endpoint used for continuation (pagination) requests, and the pattern that
# pulls the API key for it out of the search results page.
API_URL = "https://www.youtube.com/youtubei/v1/search"
API_KEY_RE = re.compile(r'"INNERTUBE_API_KEY":"(.*?)"')

# Client description merged into the JSON body of every continuation request.
CLIENT_CONTEXT = {
    "context": {
        "client": {"clientName": "WEB", "clientVersion": "2.20240722.00.00"},
    },
}
+
def _duration_to_seconds(length_text):
    """Convert a clock string such as "1:02:03" to seconds; 0 if unparseable."""
    if not isinstance(length_text, str):
        return 0
    secs = 0
    try:
        for part in length_text.split(':'):
            secs = secs * 60 + int(part)
    except ValueError:
        # Non-numeric text (anything that is not H:MM:SS / M:SS) has no
        # usable duration; report 0 instead of aborting the whole parse.
        return 0
    return secs


def _first_run_text(title_obj):
    """Return the text of the first title run, or None when there is none."""
    runs = title_obj.get('runs') if isinstance(title_obj, dict) else None
    first = runs[0] if isinstance(runs, list) and runs else {}
    return first.get('text') if isinstance(first, dict) else None


def parse_videos(obj):
    """Recursively collect video entries from a decoded JSON structure.

    Every dict that carries a ``videoRenderer`` or ``shortsRenderer`` value
    contributes one entry; all nested dicts and lists are searched as well.

    Args:
        obj: Any decoded JSON value (dict, list or scalar).

    Returns:
        A list of ``(title, url, duration_seconds, video_id)`` tuples in
        traversal order.  ``title`` is None when the renderer has no title
        run, and ``duration_seconds`` is 0 when the length is missing or
        cannot be parsed.  Scalars yield an empty list.
    """
    results = []
    if isinstance(obj, dict):
        renderer = obj.get('videoRenderer') or obj.get('shortsRenderer')
        # A renderer key holding None or a non-dict value is skipped rather
        # than crashing, so one malformed entry cannot lose the whole page.
        if isinstance(renderer, dict):
            video_id = renderer.get('videoId')
            length_obj = renderer.get('lengthText')
            length = length_obj.get('simpleText') if isinstance(length_obj, dict) else None
            results.append((
                _first_run_text(renderer.get('title')),
                f"https://www.youtube.com/watch?v={video_id}",
                _duration_to_seconds(length),
                video_id,
            ))
        for child in obj.values():
            results.extend(parse_videos(child))
    elif isinstance(obj, list):
        for child in obj:
            results.extend(parse_videos(child))
    return results
+
def fetch_highest_resolution(video_id, *, enabled=False):
    """Return the tallest listed stream height for a video, e.g. ``"720p"``.

    The lookup needs one extra watch-page request per video, so it is off by
    default and the placeholder string is returned without any network
    access.  Pass ``enabled=True`` to perform the real lookup.

    Args:
        video_id: Video identifier appended to the watch URL.
        enabled: When False (default) skip the lookup entirely.

    Returns:
        ``"Not getting res"`` when disabled; otherwise ``"<height>p"`` for
        the largest ``height`` found in ``streamingData.formats``, or
        ``"N/A"`` when the page cannot be fetched or parsed or lists no
        heights.
    """
    if not enabled:
        return "Not getting res"

    url = f"https://www.youtube.com/watch?v={video_id}"
    try:
        resp = session.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
        # Locate the start of the embedded object and let the JSON decoder
        # find its end; a non-greedy "{...};" regex would stop at the first
        # "};" inside the payload and hand json a truncated document.
        marker = re.search(r'ytInitialPlayerResponse\s*=\s*(?={)', resp.text)
        if not marker:
            return "N/A"
        data, _ = json.JSONDecoder().raw_decode(resp.text, marker.end())
        formats = data.get("streamingData", {}).get("formats", [])
        max_res = max((f["height"] for f in formats if "height" in f), default=0)
        return f"{max_res}p" if max_res else "N/A"
    except Exception:
        # Best-effort worker: any fetch or parse failure maps to "N/A" so a
        # single bad video never fails the surrounding batch.
        return "N/A"
+
def fetch_more_results(continuation, api_key):
    """Fetch one continuation page of search results.

    Args:
        continuation: Continuation token taken from the previous page.
        api_key: Key appended to the search endpoint URL as ``?key=``.

    Returns:
        A ``(next_token, videos)`` pair.  ``videos`` is the list produced by
        ``parse_videos`` for this page; ``next_token`` is the token for the
        following page, or None when the response does not carry one at the
        expected location.  Any request or parsing failure yields
        ``(None, [])``.
    """
    try:
        resp = session.post(
            f"{API_URL}?key={api_key}",
            headers=HEADERS,
            json={**CLIENT_CONTEXT, "continuation": continuation},
            timeout=10,
        )
        resp.raise_for_status()
        data = resp.json()
        videos = parse_videos(data)
    except Exception as exc:
        # Still best-effort (pagination simply stops), but report the cause
        # so a failed request is distinguishable from the end of results.
        print(f"  Continuation request failed: {exc}")
        return None, []

    try:
        cont = data['onResponseReceivedCommands'][0]['appendContinuationItemsAction'] \
            ['continuationItems'][-1]['continuationItemRenderer'] \
            ['continuationEndpoint']['continuationCommand']['token']
    except (KeyError, IndexError, TypeError):
        # No token at the expected path: treat this as the last page.
        cont = None
    return cont, videos
+
def get_recent_videos_for_term(term, max_videos=10000):
    """Collect up to ``max_videos`` search results for one search term.

    Loads the search results page for ``term``, parses the videos embedded
    in it, then follows continuation tokens until the limit is reached or no
    further token is returned.  Each video is finally passed through
    ``fetch_highest_resolution``.

    Args:
        term: Search phrase; it is URL-encoded before being sent.
        max_videos: Maximum number of videos to return.

    Returns:
        A list of ``(title, url, duration_seconds, resolution)`` tuples in
        search-result order.  Empty when the page has no API key, no
        ``ytInitialData`` payload, or no videos.

    Raises:
        requests.RequestException: If the initial page request fails, times
            out or returns an error status.
        ValueError: If the embedded ``ytInitialData`` is not valid JSON.
    """
    from urllib.parse import quote_plus

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&" or "#" that would otherwise corrupt the query.
    url = f"https://www.youtube.com/results?search_query={quote_plus(term)}&sp=CAI%253D"
    print(f"\nFetching for term '{term}': {url}")
    # Same timeout and status check as the other requests in this file, so a
    # stalled connection cannot block a worker thread forever.
    resp = session.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    html = resp.text

    api_match = API_KEY_RE.search(html)
    if not api_match:
        print("Could not find API key for term:", term)
        return []
    api_key = api_match.group(1)

    data_match = re.search(r'var ytInitialData = ({.*?});</script>', html, re.DOTALL)
    if not data_match:
        print("No ytInitialData found for term:", term)
        return []
    data = json.loads(data_match.group(1))

    videos = parse_videos(data)

    try:
        cont = data['contents']['twoColumnSearchResultsRenderer']['primaryContents'] \
            ['sectionListRenderer']['contents'][-1]['continuationItemRenderer'] \
            ['continuationEndpoint']['continuationCommand']['token']
    except (KeyError, IndexError, TypeError):
        cont = None

    while cont and len(videos) < max_videos:
        cont, more = fetch_more_results(cont, api_key)
        # Take only as many entries as still fit under the limit.
        videos.extend(more[:max_videos - len(videos)])
        print(f"  Got {len(videos)} so far for '{term}'...")

    selected = videos[:max_videos]
    if not selected:
        return []

    # Size the pool to the work available (never more than 200 threads) and
    # use map() so results keep search order instead of completion order.
    with ThreadPoolExecutor(max_workers=min(200, len(selected))) as executor:
        resolutions = executor.map(
            fetch_highest_resolution,
            (vid_id for _, _, _, vid_id in selected),
        )
        return [
            (title, video_url, secs, res)
            for (title, video_url, secs, _), res in zip(selected, resolutions)
        ]
+
# Number of search terms processed at the same time.
MAX_CONCURRENT_TERMS = 200

# Stream results into lots.csv one term at a time, as each term finishes.
with open("lots.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["term", "title", "url", "duration_seconds", "highest_resolution"])

    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor:
        # Map every submitted job back to the term it was started for.
        future_to_term = {}
        for term in search_terms:
            job = executor.submit(get_recent_videos_for_term, term, 10000)
            future_to_term[job] = term

        for future in as_completed(future_to_term):
            term = future_to_term[future]
            try:
                vids = future.result()
                if not vids:
                    print(f"No videos found for '{term}'.")
                    continue

                total_secs = sum(v[2] for v in vids)
                avg = total_secs / len(vids)
                print(f"Average for '{term}': {avg/60:.2f} minutes over {len(vids)} videos")

                # One batch of rows per term, flushed immediately so the
                # data already collected is on disk if a later term fails.
                rows = [(term, title, url, secs, res) for title, url, secs, res in vids]
                writer.writerows(rows)
                f.flush()
            except Exception as e:
                print(f"Error fetching term '{term}': {e}")