More tests - blog - Unnamed repository; edit this file 'description' to name the repository.

commit 4bb1e36ba5c744ddef03412ab629926e969d7a4b
parent 196d6ceab3231d6bbc58fa02ab9504bb1821fccc
Author: andrew.laack <andrew.laack@imbue.com>
Date:   Sun, 14 Sep 2025 14:39:57 -0700

More tests

Diffstat:
A .gitignore  | 1 +
M python/youtube/youtube-scraping-only-lengths-comprehensive.py  | 7 ++++---
A python/youtube/yt-lots.py  | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 170 insertions(+), 3 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.csv
diff --git a/python/youtube/youtube-scraping-only-lengths-comprehensive.py b/python/youtube/youtube-scraping-only-lengths-comprehensive.py
@@ -1,3 +1,4 @@
+# https://www.kaggle.com/datasets/muhammedtausif/data-science-trends-on-google
 import re
 import json
 import csv
@@ -74,7 +75,7 @@ def fetch_more_results(continuation, api_key):
     except Exception:
         return None, []
 
-def get_recent_videos_for_term(term, max_videos=100):
+def get_recent_videos_for_term(term, max_videos=1000):
     url = f"https://www.youtube.com/results?search_query={term.replace(' ', '+')}&sp=CAI%253D"
     print(f"\nFetching for term '{term}': {url}")
     resp = session.get(url, headers=HEADERS)
@@ -109,7 +110,7 @@ def get_recent_videos_for_term(term, max_videos=100):
         print(f"  Got {len(videos)} so far for '{term}'...")
 
     final_videos = []
-    with ThreadPoolExecutor(max_workers=20) as executor:  
+    with ThreadPoolExecutor(max_workers=200) as executor:  
         future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs)
                          for title, url, secs, vid_id in videos[:max_videos]}
         for future in as_completed(future_to_vid):
@@ -123,7 +124,7 @@ all_rows = []
 MAX_CONCURRENT_TERMS = 200
 
 with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor:
-    future_to_term = {executor.submit(get_recent_videos_for_term, term, 100): term for term in search_terms}
+    future_to_term = {executor.submit(get_recent_videos_for_term, term, 1000): term for term in search_terms}
     for future in as_completed(future_to_term):
         term = future_to_term[future]
         try:
diff --git a/python/youtube/yt-lots.py b/python/youtube/yt-lots.py
@@ -0,0 +1,165 @@
+# https://www.kaggle.com/datasets/muhammedtausif/data-science-trends-on-google
+import re
+import json
+import csv
+import requests
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import pandas as pd
+
+df = pd.read_csv('trends.csv')
+
+search_terms = df['query'].to_list()
+print(len(search_terms))
+
+session = requests.Session()
+
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+                  "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
+}
+
+API_URL = "https://www.youtube.com/youtubei/v1/search"
+API_KEY_RE = re.compile(r'"INNERTUBE_API_KEY":"(.*?)"')
+CLIENT_CONTEXT = {
+    "context": {
+        "client": {
+            "clientName": "WEB",
+            "clientVersion": "2.20240722.00.00"
+        }
+    }
+}
+
+def parse_videos(obj):
+    results = []
+    if isinstance(obj, dict):
+        if 'videoRenderer' in obj or 'shortsRenderer' in obj:
+            v = obj.get('videoRenderer') or obj.get('shortsRenderer')
+            title = v.get('title', {}).get('runs', [{}])[0].get('text')
+            video_id = v.get('videoId')
+            length = v.get('lengthText', {}).get('simpleText') if 'lengthText' in v else None
+            secs = 0
+            if length:
+                parts = length.split(':')
+                for p in parts:
+                    secs = secs * 60 + int(p)
+            results.append((title, f"https://www.youtube.com/watch?v={video_id}", secs, video_id))
+        for v in obj.values():
+            results.extend(parse_videos(v))
+    elif isinstance(obj, list):
+        for v in obj:
+            results.extend(parse_videos(v))
+    return results
+
+def fetch_highest_resolution(video_id):
+    return "Not getting res"
+    url = f"https://www.youtube.com/watch?v={video_id}"
+    try:
+        resp = session.get(url, headers=HEADERS, timeout=10)
+        resp.raise_for_status()
+        match = re.search(r'ytInitialPlayerResponse\s*=\s*({.*?});', resp.text)
+        if not match:
+            return "N/A"
+        data = json.loads(match.group(1))
+        formats = data.get("streamingData", {}).get("formats", [])
+        max_res = 0
+        for f in formats:
+            if "height" in f and f["height"] > max_res:
+                max_res = f["height"]
+        return f"{max_res}p" if max_res else "N/A"
+    except Exception:
+        return "N/A"
+
+def fetch_more_results(continuation, api_key):
+    try:
+        resp = session.post(
+            f"{API_URL}?key={api_key}",
+            headers=HEADERS,
+            json={**CLIENT_CONTEXT, "continuation": continuation},
+            timeout=10
+        )
+        resp.raise_for_status()
+        data = resp.json()
+        videos = parse_videos(data)
+        cont = None
+        try:
+            cont = data['onResponseReceivedCommands'][0]['appendContinuationItemsAction'] \
+                ['continuationItems'][-1]['continuationItemRenderer'] \
+                ['continuationEndpoint']['continuationCommand']['token']
+        except Exception:
+            pass
+        return cont, videos
+    except Exception:
+        return None, []
+
+def get_recent_videos_for_term(term, max_videos=10000):
+    url = f"https://www.youtube.com/results?search_query={term.replace(' ', '+')}&sp=CAI%253D"
+    print(f"\nFetching for term '{term}': {url}")
+    resp = session.get(url, headers=HEADERS)
+    html = resp.text
+
+    api_match = API_KEY_RE.search(html)
+    if not api_match:
+        print("Could not find API key for term:", term)
+        return []
+    api_key = api_match.group(1)
+
+    data_match = re.search(r'var ytInitialData = ({.*?});</script>', html, re.DOTALL)
+    if not data_match:
+        print("No ytInitialData found for term:", term)
+        return []
+    data = json.loads(data_match.group(1))
+
+    videos = parse_videos(data)
+
+    try:
+        cont = data['contents']['twoColumnSearchResultsRenderer']['primaryContents'] \
+            ['sectionListRenderer']['contents'][-1]['continuationItemRenderer'] \
+            ['continuationEndpoint']['continuationCommand']['token']
+    except Exception:
+        cont = None
+
+    while cont and len(videos) < max_videos:
+        cont, more = fetch_more_results(cont, api_key)
+        for vid in more:
+            if len(videos) < max_videos:
+                videos.append(vid)
+        print(f"  Got {len(videos)} so far for '{term}'...")
+
+    final_videos = []
+    with ThreadPoolExecutor(max_workers=200) as executor:  
+        future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs)
+                         for title, url, secs, vid_id in videos[:max_videos]}
+        for future in as_completed(future_to_vid):
+            title, url, secs = future_to_vid[future]
+            res = future.result()
+            final_videos.append((title, url, secs, res))
+
+    return final_videos
+
+MAX_CONCURRENT_TERMS = 200
+
+with open("lots.csv", "w", newline="", encoding="utf-8") as f:
+    writer = csv.writer(f)
+    writer.writerow(["term", "title", "url", "duration_seconds", "highest_resolution"])
+
+    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor:
+        future_to_term = {
+            executor.submit(get_recent_videos_for_term, term, 10000): term
+            for term in search_terms
+        }
+
+        for future in as_completed(future_to_term):
+            term = future_to_term[future]
+            try:
+                vids = future.result()
+                if vids:
+                    avg = sum(v[2] for v in vids) / len(vids)
+                    print(f"Average for '{term}': {avg/60:.2f} minutes over {len(vids)} videos")
+
+                    rows = [(term, title, url, secs, res) for title, url, secs, res in vids]
+                    writer.writerows(rows)  # ✅ write in batch per term
+                    f.flush()               # 💾 ensure data is written to disk
+                else:
+                    print(f"No videos found for '{term}'.")
+            except Exception as e:
+                print(f"Error fetching term '{term}': {e}")

	blog Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

A	.gitignore	\|	1	+
M	python/youtube/youtube-scraping-only-lengths-comprehensive.py	\|	7	++++---
A	python/youtube/yt-lots.py	\|	165	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++