Incremental - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit 863992bcb89baa8aa852607e1aa6cc07d891c47d
parent 49cfa7a25d876018bef98fba7b0e516820560740
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Wed,  7 Jan 2026 16:42:13 -0600

Incremental

Diffstat:
M crawling/spider.py  | 96 +++++++++++++++++++++++++++++++++++++++++++------------------------------------

1 file changed, 52 insertions(+), 44 deletions(-)
diff --git a/crawling/spider.py b/crawling/spider.py
@@ -22,7 +22,7 @@ import urllib.request
 import requests
 import os
 from urllib.parse import urljoin, urlparse
-from bs4 import BeautifulSoup
+from lxml import html
 import sys
 import psycopg2
 from crawling.constants import CRAWLING_DB 
@@ -37,11 +37,11 @@ import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 # this is the number of links we take out of the queue
-LINK_SELECTION_COUNT = 500
+LINK_SELECTION_COUNT = 50000
 MAX_SITE_SIZE = 2_000_000
 MAX_URLS_PER_SITE = 100
 # number of concurrent workers for thread pool executor
-MAX_WORKERS = 50
+MAX_WORKERS = 200
 
 # TODO: Only queue if we haven't already indexed it recently, or some other logic here.
 
@@ -68,15 +68,28 @@ def crawl_url(url, filepath):
     written_to_fs = False
 
     user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'
-    if not is_allowed(url,user_agent):
-        print(f"Can't crawl {url} due to robots.txt violation")
-        return written_to_fs, links
-
 
     headers = {
         'User-agent': user_agent,
     }
 
+
+    # TODO: Remove the English-only Content-Language filter below to support more languages.
+    # TODO: Add a Content-Length check on the HEAD response (the header is not always sent, e.g. by JavaScript-heavy sites).
+
+    try:
+        head_response = requests.head(url, headers=headers, timeout=1)
+        content_lang = head_response.headers.get('Content-Language', '')
+        if content_lang:
+            if not content_lang.split(',')[0].strip().lower().startswith('en'):
+                print(f'site not english {url}')
+                return written_to_fs, links
+    except Exception:
+        return written_to_fs, links
+
+    if not is_allowed(url, user_agent):
+        print(f"Can't crawl {url} due to robots.txt violation")
+        return written_to_fs, links
     try:
         source_code = requests.get(url, headers=headers, timeout=1)
         if not source_code.ok:
@@ -86,11 +99,10 @@ def crawl_url(url, filepath):
         content_type = source_code.headers.get('Content-Type', '')
         if 'text/html' not in content_type:
             print(f'Content type for {url} not html, returning.')
-            return False, links
-
-        soup = BeautifulSoup(source_code.content, 'html.parser')
-        content = soup.prettify()
-
+            return written_to_fs, links
+        doc = html.document_fromstring(source_code.content)
+        doc.make_links_absolute(url)
+        content = html.tostring(doc, pretty_print=True, encoding='unicode')
         if len(content.encode('utf-8')) < MAX_SITE_SIZE:
             with open(filepath, 'w') as f:
                 f.write(content)
@@ -102,33 +114,22 @@ def crawl_url(url, filepath):
     except Exception as e:
         print(e)
         return written_to_fs, links
-
     current_url_without_fragment = urlparse(url)._replace(fragment='').geturl()
-
-    # find all links < max_urls_per_site that direct to a different page.
-    for link in soup.find_all('a', href=True):
-            href = link.get('href')
-
-            if href.startswith('#'):
-                continue
-
-            absolute_url = urljoin(url, href)
-            parsed = urlparse(absolute_url)
-
-            url_without_fragment = parsed._replace(fragment='').geturl()
-            
-            if url_without_fragment == current_url_without_fragment:
-                continue
-            
-            # TODO: Rank domains / subdomains or something like that
-            # there are some really long domains that seem to trap my crawlers.
-            if len(absolute_url) > 50:
-                continue
-
-            if parsed.scheme in ('http', 'https'):
-                if len(links) < MAX_URLS_PER_SITE:
-                    links.add(absolute_url)
-
+    for el in doc.iter('a'):
+        href = el.get('href')
+        if not href:
+            continue
+        
+        parsed = urlparse(href)
+        url_without_fragment = parsed._replace(fragment='').geturl()
+        
+        if url_without_fragment == current_url_without_fragment:
+            continue
+        
+        if len(href) > 50:
+            continue
+        if parsed.scheme in ('http', 'https') and len(links) < MAX_URLS_PER_SITE:
+            links.add(href)
     assert written_to_fs == True
     return written_to_fs, links
 
@@ -272,6 +273,8 @@ def ensure_queued_sites_and_get_connection(db_name, db_user, db_password, db_hos
             crawl_requests INTEGER DEFAULT 1 NOT NULL CHECK (crawl_requests >= 1),
             depth INTEGER NOT NULL CHECK (depth >= 0)
         );
+        CREATE INDEX IF NOT EXISTS idx_queued_url ON queued_site (url);
+        CREATE INDEX IF NOT EXISTS idx_queued_depth ON queued_site (depth);
     """
 
     cursor.execute(create_table_query)
@@ -455,12 +458,17 @@ if __name__ == "__main__":
         with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
             futures = {executor.submit(crawl, url): url for url in urls_dict}
             for future in as_completed(futures):
-                url, success, filepath, links = future.result()
-                # success means the html was written to the filepath
-                # if not success, just delete from the db
-                move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing_queue)
-                current_urls_depth = urls_dict[url]
-                queue_urls_for_crawling(conn, links, current_urls_depth)
+                try:
+
+                    url, success, filepath, links = future.result(timeout=20)
+                    # success means the html was written to the filepath
+                    # if not success, just delete from the db
+                    move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing_queue)
+                    current_urls_depth = urls_dict[url]
+                    queue_urls_for_crawling(conn, links, current_urls_depth)
+                except TimeoutError:
+                    print(f"Timeout, cancelling")
+                    future.cancel()
 
     conn.close()
     conn_indexing_queue.close()

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs