blog

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 4bb1e36ba5c744ddef03412ab629926e969d7a4b
parent 196d6ceab3231d6bbc58fa02ab9504bb1821fccc
Author: andrew.laack <andrew.laack@imbue.com>
Date:   Sun, 14 Sep 2025 14:39:57 -0700

More tests

Diffstat:
A .gitignore | 1 +
M python/youtube/youtube-scraping-only-lengths-comprehensive.py | 7 ++++---
A python/youtube/yt-lots.py | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 170 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.csv
diff --git a/python/youtube/youtube-scraping-only-lengths-comprehensive.py b/python/youtube/youtube-scraping-only-lengths-comprehensive.py
@@ -1,3 +1,4 @@
+# https://www.kaggle.com/datasets/muhammedtausif/data-science-trends-on-google
 import re
 import json
 import csv
@@ -74,7 +75,7 @@ def fetch_more_results(continuation, api_key):
     except Exception:
         return None, []
 
-def get_recent_videos_for_term(term, max_videos=100):
+def get_recent_videos_for_term(term, max_videos=1000):
     url = f"https://www.youtube.com/results?search_query={term.replace(' ', '+')}&sp=CAI%253D"
     print(f"\nFetching for term '{term}': {url}")
     resp = session.get(url, headers=HEADERS)
@@ -109,7 +110,7 @@ def get_recent_videos_for_term(term, max_videos=100):
         print(f" Got {len(videos)} so far for '{term}'...")
 
     final_videos = []
-    with ThreadPoolExecutor(max_workers=20) as executor:
+    with ThreadPoolExecutor(max_workers=200) as executor:
         future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs)
                          for title, url, secs, vid_id in videos[:max_videos]}
         for future in as_completed(future_to_vid):
@@ -123,7 +124,7 @@ all_rows = []
 MAX_CONCURRENT_TERMS = 200
 
 with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor:
-    future_to_term = {executor.submit(get_recent_videos_for_term, term, 100): term for term in search_terms}
+    future_to_term = {executor.submit(get_recent_videos_for_term, term, 1000): term for term in search_terms}
     for future in as_completed(future_to_term):
         term = future_to_term[future]
         try:
diff --git a/python/youtube/yt-lots.py b/python/youtube/yt-lots.py
@@ -0,0 +1,165 @@
+# https://www.kaggle.com/datasets/muhammedtausif/data-science-trends-on-google
+import re
+import json
+import csv
+import requests
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import pandas as pd
+
+df = pd.read_csv('trends.csv')
+
+search_terms = df['query'].to_list()
+print(len(search_terms))
+
+session = requests.Session()
+
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+                  "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
+}
+
+API_URL = "https://www.youtube.com/youtubei/v1/search"
+API_KEY_RE = re.compile(r'"INNERTUBE_API_KEY":"(.*?)"')
+CLIENT_CONTEXT = {
+    "context": {
+        "client": {
+            "clientName": "WEB",
+            "clientVersion": "2.20240722.00.00"
+        }
+    }
+}
+
+def parse_videos(obj):
+    results = []
+    if isinstance(obj, dict):
+        if 'videoRenderer' in obj or 'shortsRenderer' in obj:
+            v = obj.get('videoRenderer') or obj.get('shortsRenderer')
+            title = v.get('title', {}).get('runs', [{}])[0].get('text')
+            video_id = v.get('videoId')
+            length = v.get('lengthText', {}).get('simpleText') if 'lengthText' in v else None
+            secs = 0
+            if length:
+                parts = length.split(':')
+                for p in parts:
+                    secs = secs * 60 + int(p)
+            results.append((title, f"https://www.youtube.com/watch?v={video_id}", secs, video_id))
+        for v in obj.values():
+            results.extend(parse_videos(v))
+    elif isinstance(obj, list):
+        for v in obj:
+            results.extend(parse_videos(v))
+    return results
+
+def fetch_highest_resolution(video_id):
+    return "Not getting res"
+    url = f"https://www.youtube.com/watch?v={video_id}"
+    try:
+        resp = session.get(url, headers=HEADERS, timeout=10)
+        resp.raise_for_status()
+        match = re.search(r'ytInitialPlayerResponse\s*=\s*({.*?});', resp.text)
+        if not match:
+            return "N/A"
+        data = json.loads(match.group(1))
+        formats = data.get("streamingData", {}).get("formats", [])
+        max_res = 0
+        for f in formats:
+            if "height" in f and f["height"] > max_res:
+                max_res = f["height"]
+        return f"{max_res}p" if max_res else "N/A"
+    except Exception:
+        return "N/A"
+
+def fetch_more_results(continuation, api_key):
+    try:
+        resp = session.post(
+            f"{API_URL}?key={api_key}",
+            headers=HEADERS,
+            json={**CLIENT_CONTEXT, "continuation": continuation},
+            timeout=10
+        )
+        resp.raise_for_status()
+        data = resp.json()
+        videos = parse_videos(data)
+        cont = None
+        try:
+            cont = data['onResponseReceivedCommands'][0]['appendContinuationItemsAction'] \
+                ['continuationItems'][-1]['continuationItemRenderer'] \
+                ['continuationEndpoint']['continuationCommand']['token']
+        except Exception:
+            pass
+        return cont, videos
+    except Exception:
+        return None, []
+
+def get_recent_videos_for_term(term, max_videos=10000):
+    url = f"https://www.youtube.com/results?search_query={term.replace(' ', '+')}&sp=CAI%253D"
+    print(f"\nFetching for term '{term}': {url}")
+    resp = session.get(url, headers=HEADERS)
+    html = resp.text
+
+    api_match = API_KEY_RE.search(html)
+    if not api_match:
+        print("Could not find API key for term:", term)
+        return []
+    api_key = api_match.group(1)
+
+    data_match = re.search(r'var ytInitialData = ({.*?});</script>', html, re.DOTALL)
+    if not data_match:
+        print("No ytInitialData found for term:", term)
+        return []
+    data = json.loads(data_match.group(1))
+
+    videos = parse_videos(data)
+
+    try:
+        cont = data['contents']['twoColumnSearchResultsRenderer']['primaryContents'] \
+            ['sectionListRenderer']['contents'][-1]['continuationItemRenderer'] \
+            ['continuationEndpoint']['continuationCommand']['token']
+    except Exception:
+        cont = None
+
+    while cont and len(videos) < max_videos:
+        cont, more = fetch_more_results(cont, api_key)
+        for vid in more:
+            if len(videos) < max_videos:
+                videos.append(vid)
+        print(f" Got {len(videos)} so far for '{term}'...")
+
+    final_videos = []
+    with ThreadPoolExecutor(max_workers=200) as executor:
+        future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs)
+                         for title, url, secs, vid_id in videos[:max_videos]}
+        for future in as_completed(future_to_vid):
+            title, url, secs = future_to_vid[future]
+            res = future.result()
+            final_videos.append((title, url, secs, res))
+
+    return final_videos
+
+MAX_CONCURRENT_TERMS = 200
+
+with open("lots.csv", "w", newline="", encoding="utf-8") as f:
+    writer = csv.writer(f)
+    writer.writerow(["term", "title", "url", "duration_seconds", "highest_resolution"])
+
+    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor:
+        future_to_term = {
+            executor.submit(get_recent_videos_for_term, term, 10000): term
+            for term in search_terms
+        }
+
+        for future in as_completed(future_to_term):
+            term = future_to_term[future]
+            try:
+                vids = future.result()
+                if vids:
+                    avg = sum(v[2] for v in vids) / len(vids)
+                    print(f"Average for '{term}': {avg/60:.2f} minutes over {len(vids)} videos")
+
+                    rows = [(term, title, url, secs, res) for title, url, secs, res in vids]
+                    writer.writerows(rows) # ✅ write in batch per term
+                    f.flush() # 💾 ensure data is written to disk
+                else:
+                    print(f"No videos found for '{term}'.")
+            except Exception as e:
+                print(f"Error fetching term '{term}': {e}")