Added more scripts - blog - Unnamed repository; edit this file 'description' to name the repository.

commit 58c7038c2dc8646fecdac690371111e06927fda7
parent 4bb1e36ba5c744ddef03412ab629926e969d7a4b
Author: andrew.laack <andrew.laack@imbue.com>
Date:   Mon, 15 Sep 2025 17:33:36 -0500

Added more scripts

Diffstat:
M python/youtube/youtube-scraping.py  | 4 ++--
M python/youtube/yt-lots.py  | 31 +++++++++++++++++--------------

2 files changed, 19 insertions(+), 16 deletions(-)
diff --git a/python/youtube/youtube-scraping.py b/python/youtube/youtube-scraping.py
@@ -2833,7 +2833,7 @@ def get_recent_videos_for_term(term, max_videos=100):
         print(f"  Got {len(videos)} so far for '{term}'...")
 
     final_videos = []
-    with ThreadPoolExecutor(max_workers=20) as executor:  
+    with ThreadPoolExecutor(max_workers=250) as executor:  
         future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs)
                          for title, url, secs, vid_id in videos[:max_videos]}
         for future in as_completed(future_to_vid):
@@ -2844,7 +2844,7 @@ def get_recent_videos_for_term(term, max_videos=100):
     return final_videos
 
 all_rows = []
-MAX_CONCURRENT_TERMS = 25
+MAX_CONCURRENT_TERMS = 250
 
 with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor:
     future_to_term = {executor.submit(get_recent_videos_for_term, term, 100): term for term in search_terms}
diff --git a/python/youtube/yt-lots.py b/python/youtube/yt-lots.py
@@ -7,7 +7,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 import pandas as pd
 
 df = pd.read_csv('trends.csv')
-
 search_terms = df['query'].to_list()
 print(len(search_terms))
 
@@ -50,24 +49,28 @@ def parse_videos(obj):
             results.extend(parse_videos(v))
     return results
 
-def fetch_highest_resolution(video_id):
-    return "Not getting res"
+def fetch_resolution_and_bitrate(video_id):
+    return "Not including"
     url = f"https://www.youtube.com/watch?v={video_id}"
     try:
         resp = session.get(url, headers=HEADERS, timeout=10)
         resp.raise_for_status()
         match = re.search(r'ytInitialPlayerResponse\s*=\s*({.*?});', resp.text)
         if not match:
-            return "N/A"
+            return "N/A", "N/A"
         data = json.loads(match.group(1))
         formats = data.get("streamingData", {}).get("formats", [])
         max_res = 0
+        max_bitrate = 0
         for f in formats:
             if "height" in f and f["height"] > max_res:
                 max_res = f["height"]
-        return f"{max_res}p" if max_res else "N/A"
+            if "bitrate" in f and f["bitrate"] > max_bitrate:
+                max_bitrate = f["bitrate"]
+        return (f"{max_res}p" if max_res else "N/A",
+                f"{round(max_bitrate/1000)} kbps" if max_bitrate else "N/A")
     except Exception:
-        return "N/A"
+        return "N/A", "N/A"
 
 def fetch_more_results(continuation, api_key):
     try:
@@ -127,20 +130,20 @@ def get_recent_videos_for_term(term, max_videos=10000):
 
     final_videos = []
     with ThreadPoolExecutor(max_workers=200) as executor:  
-        future_to_vid = {executor.submit(fetch_highest_resolution, vid_id): (title, url, secs)
+        future_to_vid = {executor.submit(fetch_resolution_and_bitrate, vid_id): (title, url, secs)
                          for title, url, secs, vid_id in videos[:max_videos]}
         for future in as_completed(future_to_vid):
             title, url, secs = future_to_vid[future]
-            res = future.result()
-            final_videos.append((title, url, secs, res))
+            res, bitrate = future.result()
+            final_videos.append((title, url, secs, res, bitrate))
 
     return final_videos
 
 MAX_CONCURRENT_TERMS = 200
 
-with open("lots.csv", "w", newline="", encoding="utf-8") as f:
+with open("lots-without-res.csv", "w", newline="", encoding="utf-8") as f:
     writer = csv.writer(f)
-    writer.writerow(["term", "title", "url", "duration_seconds", "highest_resolution"])
+    writer.writerow(["term", "title", "url", "duration_seconds", "highest_resolution", "max_bitrate"])
 
     with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TERMS) as executor:
         future_to_term = {
@@ -156,9 +159,9 @@ with open("lots.csv", "w", newline="", encoding="utf-8") as f:
                     avg = sum(v[2] for v in vids) / len(vids)
                     print(f"Average for '{term}': {avg/60:.2f} minutes over {len(vids)} videos")
 
-                    rows = [(term, title, url, secs, res) for title, url, secs, res in vids]
-                    writer.writerows(rows)  # ✅ write in batch per term
-                    f.flush()               # 💾 ensure data is written to disk
+                    rows = [(term, title, url, secs, res, bitrate) for title, url, secs, res, bitrate in vids]
+                    writer.writerows(rows)
+                    f.flush()
                 else:
                     print(f"No videos found for '{term}'.")
             except Exception as e:

	blog Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

M	python/youtube/youtube-scraping.py	\|	4	++--
M	python/youtube/yt-lots.py	\|	31	+++++++++++++++++--------------