commit 863992bcb89baa8aa852607e1aa6cc07d891c47d
parent 49cfa7a25d876018bef98fba7b0e516820560740
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Wed, 7 Jan 2026 16:42:13 -0600
Incremental
Diffstat:
| M | crawling/spider.py | | | 96 | +++++++++++++++++++++++++++++++++++++++++++------------------------------------ |
1 file changed, 52 insertions(+), 44 deletions(-)
diff --git a/crawling/spider.py b/crawling/spider.py
@@ -22,7 +22,7 @@ import urllib.request
import requests
import os
from urllib.parse import urljoin, urlparse
-from bs4 import BeautifulSoup
+from lxml import html
import sys
import psycopg2
from crawling.constants import CRAWLING_DB
@@ -37,11 +37,11 @@ import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
# this is the number of links we take out of the queue
-LINK_SELECTION_COUNT = 500
+LINK_SELECTION_COUNT = 50000
MAX_SITE_SIZE = 2_000_000
MAX_URLS_PER_SITE = 100
# number of concurrent workers for thread pool executor
-MAX_WORKERS = 50
+MAX_WORKERS = 200
# TODO: Only queue if we haven't already indexed it recently, or some other logic here.
@@ -68,15 +68,28 @@ def crawl_url(url, filepath):
written_to_fs = False
user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'
- if not is_allowed(url,user_agent):
- print(f"Can't crawl {url} due to robots.txt violation")
- return written_to_fs, links
-
headers = {
'User-agent': user_agent,
}
+
+ # TODO: Remove the English-only Content-Language filter below to support more languages.
+ # TODO: Add a Content-Length check on the HEAD response (the header is not always sent, e.g. by JavaScript-heavy sites).
+
+ try:
+ head_response = requests.head(url, headers=headers, timeout=1)
+ content_lang = head_response.headers.get('Content-Language', '')
+ if content_lang:
+ if not content_lang.split(',')[0].strip().lower().startswith('en'):
+ print(f'site not english {url}')
+ return written_to_fs, links
+ except Exception:
+ return written_to_fs, links
+
+ if not is_allowed(url, user_agent):
+ print(f"Can't crawl {url} due to robots.txt violation")
+ return written_to_fs, links
try:
source_code = requests.get(url, headers=headers, timeout=1)
if not source_code.ok:
@@ -86,11 +99,10 @@ def crawl_url(url, filepath):
content_type = source_code.headers.get('Content-Type', '')
if 'text/html' not in content_type:
print(f'Content type for {url} not html, returning.')
- return False, links
-
- soup = BeautifulSoup(source_code.content, 'html.parser')
- content = soup.prettify()
-
+ return written_to_fs, links
+ doc = html.document_fromstring(source_code.content)
+ doc.make_links_absolute(url)
+ content = html.tostring(doc, pretty_print=True, encoding='unicode')
if len(content.encode('utf-8')) < MAX_SITE_SIZE:
with open(filepath, 'w') as f:
f.write(content)
@@ -102,33 +114,22 @@ def crawl_url(url, filepath):
except Exception as e:
print(e)
return written_to_fs, links
-
current_url_without_fragment = urlparse(url)._replace(fragment='').geturl()
-
- # find all links < max_urls_per_site that direct to a different page.
- for link in soup.find_all('a', href=True):
- href = link.get('href')
-
- if href.startswith('#'):
- continue
-
- absolute_url = urljoin(url, href)
- parsed = urlparse(absolute_url)
-
- url_without_fragment = parsed._replace(fragment='').geturl()
-
- if url_without_fragment == current_url_without_fragment:
- continue
-
- # TODO: Rank domains / subdomains or something like that
- # there are some really long domains that seem to trap my crawlers.
- if len(absolute_url) > 50:
- continue
-
- if parsed.scheme in ('http', 'https'):
- if len(links) < MAX_URLS_PER_SITE:
- links.add(absolute_url)
-
+ for el in doc.iter('a'):
+ href = el.get('href')
+ if not href:
+ continue
+
+ parsed = urlparse(href)
+ url_without_fragment = parsed._replace(fragment='').geturl()
+
+ if url_without_fragment == current_url_without_fragment:
+ continue
+
+ if len(href) > 50:
+ continue
+ if parsed.scheme in ('http', 'https') and len(links) < MAX_URLS_PER_SITE:
+ links.add(href)
assert written_to_fs == True
return written_to_fs, links
@@ -272,6 +273,8 @@ def ensure_queued_sites_and_get_connection(db_name, db_user, db_password, db_hos
crawl_requests INTEGER DEFAULT 1 NOT NULL CHECK (crawl_requests >= 1),
depth INTEGER NOT NULL CHECK (depth >= 0)
);
+ CREATE INDEX IF NOT EXISTS idx_queued_url ON queued_site (url);
+ CREATE INDEX IF NOT EXISTS idx_queued_depth ON queued_site (depth);
"""
cursor.execute(create_table_query)
@@ -455,12 +458,17 @@ if __name__ == "__main__":
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
futures = {executor.submit(crawl, url): url for url in urls_dict}
for future in as_completed(futures):
- url, success, filepath, links = future.result()
- # success means the html was written to the filepath
- # if not success, just delete from the db
- move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing_queue)
- current_urls_depth = urls_dict[url]
- queue_urls_for_crawling(conn, links, current_urls_depth)
+ try:
+
+ url, success, filepath, links = future.result(timeout=20)
+ # success means the html was written to the filepath
+ # if not success, just delete from the db
+ move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing_queue)
+ current_urls_depth = urls_dict[url]
+ queue_urls_for_crawling(conn, links, current_urls_depth)
+ except TimeoutError:
+ print(f"Timeout, cancelling")
+ future.cancel()
conn.close()
conn_indexing_queue.close()