information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 863992bcb89baa8aa852607e1aa6cc07d891c47d
parent 49cfa7a25d876018bef98fba7b0e516820560740
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Wed,  7 Jan 2026 16:42:13 -0600

Incremental

Diffstat:
M crawling/spider.py | 96 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
1 file changed, 52 insertions(+), 44 deletions(-)

diff --git a/crawling/spider.py b/crawling/spider.py
@@ -22,7 +22,7 @@ import urllib.request
 import requests
 import os
 from urllib.parse import urljoin, urlparse
-from bs4 import BeautifulSoup
+from lxml import html
 import sys
 import psycopg2
 from crawling.constants import CRAWLING_DB
@@ -37,11 +37,11 @@ import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed

 # this is the number of links we take out of the queue
-LINK_SELECTION_COUNT = 500
+LINK_SELECTION_COUNT = 50000
 MAX_SITE_SIZE = 2_000_000
 MAX_URLS_PER_SITE = 100
 # number of concurrent workers for thread pool executor
-MAX_WORKERS = 50
+MAX_WORKERS = 200

 # TODO: Only queue if we haven't already indexed it recently, or some other logic here.

@@ -68,15 +68,28 @@ def crawl_url(url, filepath):
     written_to_fs = False
     user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'

-    if not is_allowed(url,user_agent):
-        print(f"Can't crawl {url} due to robots.txt violation")
-        return written_to_fs, links
-
-
     headers = {
         'User-agent': user_agent,
     }
+
+    # TODO: Remove to support more languages.
+    # TODO: Add Content-Length check for head (not always supported for shitty sites, like js stuff)
+
+    try:
+        head_response = requests.head(url, headers=headers, timeout=1)
+        content_lang = head_response.headers.get('Content-Language', '')
+        if content_lang:
+            if not content_lang.split(',')[0].strip().lower().startswith('en'):
+                print(f'site not english {url}')
+                return written_to_fs, links
+    except Exception:
+        return written_to_fs, links
+
+    if not is_allowed(url, user_agent):
+        print(f"Can't crawl {url} due to robots.txt violation")
+        return written_to_fs, links
     try:
         source_code = requests.get(url, headers=headers, timeout=1)
         if not source_code.ok:
@@ -86,11 +99,10 @@ def crawl_url(url, filepath):
         content_type = source_code.headers.get('Content-Type', '')
         if 'text/html' not in content_type:
             print(f'Content type for {url} not html, returning.')
-            return False, links
-
-        soup = BeautifulSoup(source_code.content, 'html.parser')
-        content = soup.prettify()
-
+            return written_to_fs, links
+        doc = html.document_fromstring(source_code.content)
+        doc.make_links_absolute(url)
+        content = html.tostring(doc, pretty_print=True, encoding='unicode')
         if len(content.encode('utf-8')) < MAX_SITE_SIZE:
             with open(filepath, 'w') as f:
                 f.write(content)
@@ -102,33 +114,22 @@ def crawl_url(url, filepath):
     except Exception as e:
         print(e)
         return written_to_fs, links
-
     current_url_without_fragment = urlparse(url)._replace(fragment='').geturl()
-
-    # find all links < max_urls_per_site that direct to a different page.
-    for link in soup.find_all('a', href=True):
-        href = link.get('href')
-
-        if href.startswith('#'):
-            continue
-
-        absolute_url = urljoin(url, href)
-        parsed = urlparse(absolute_url)
-
-        url_without_fragment = parsed._replace(fragment='').geturl()
-
-        if url_without_fragment == current_url_without_fragment:
-            continue
-
-        # TODO: Rank domains / subdomains or something like that
-        # there are some really long domains that seem to trap my crawlers.
-        if len(absolute_url) > 50:
-            continue
-
-        if parsed.scheme in ('http', 'https'):
-            if len(links) < MAX_URLS_PER_SITE:
-                links.add(absolute_url)
-
+    for el in doc.iter('a'):
+        href = el.get('href')
+        if not href:
+            continue
+
+        parsed = urlparse(href)
+        url_without_fragment = parsed._replace(fragment='').geturl()
+
+        if url_without_fragment == current_url_without_fragment:
+            continue
+
+        if len(href) > 50:
+            continue
+        if parsed.scheme in ('http', 'https') and len(links) < MAX_URLS_PER_SITE:
+            links.add(href)
     assert written_to_fs == True
     return written_to_fs, links

@@ -272,6 +273,8 @@ def ensure_queued_sites_and_get_connection(db_name, db_user, db_password, db_hos
         crawl_requests INTEGER DEFAULT 1 NOT NULL CHECK (crawl_requests >= 1),
         depth INTEGER NOT NULL CHECK (depth >= 0)
     );
+    CREATE INDEX IF NOT EXISTS idx_queued_url ON queued_site (url);
+    CREATE INDEX IF NOT EXISTS idx_queued_depth ON queued_site (depth);
     """

     cursor.execute(create_table_query)
@@ -455,12 +458,17 @@ if __name__ == "__main__":
     with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
         futures = {executor.submit(crawl, url): url for url in urls_dict}
         for future in as_completed(futures):
-            url, success, filepath, links = future.result()
-            # success means the html was written to the filepath
-            # if not success, just delete from the db
-            move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing_queue)
-            current_urls_depth = urls_dict[url]
-            queue_urls_for_crawling(conn, links, current_urls_depth)
+            try:
+
+                url, success, filepath, links = future.result(timeout=20)
+                # success means the html was written to the filepath
+                # if not success, just delete from the db
+                move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing_queue)
+                current_urls_depth = urls_dict[url]
+                queue_urls_for_crawling(conn, links, current_urls_depth)
+            except TimeoutError:
+                print(f"Timeout, cancelling")
+                future.cancel()

     conn.close()
     conn_indexing_queue.close()