commit 6c374cfe6d87c097ed0f3fe655d4b5ce0cc68837
parent 6654bdb0e73dbb6f1328cd341aed8f23936dcb96
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Thu, 1 Jan 2026 16:56:57 -0600
Respect robots.txt
Diffstat:
2 files changed, 40 insertions(+), 4 deletions(-)
diff --git a/collection/spider.py b/collection/spider.py
@@ -9,6 +9,9 @@
# using a queue to remove this additional logic. We can reuse that to populate the queue, but that is
# unrelated.
+import urllib.robotparser
+from urllib.parse import urlparse
+import urllib.request
import requests
import os
import datetime
@@ -43,7 +46,7 @@ import time
# bytes
MAX_SIZE = 2_000_000
-MAX_WORKERS = 5
+MAX_WORKERS = 250
MAX_URLS_PER_SITE = 100
NOT_INDEXED = 0
INDEXED = 1
@@ -56,15 +59,45 @@ def should_queue(url, cur):
""", (url, cutoff))
return cur.fetchone() is None
-def search_url(url, filepath):
+def is_allowed(url, user_agent, timeout=.1):
+ try:
+ parsed = urlparse(url)
+ robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+ rp = urllib.robotparser.RobotFileParser()
+ rp.set_url(robots_url)
+ with urllib.request.urlopen(robots_url, timeout=timeout) as response:
+ rp.parse(response.read().decode('utf-8').splitlines())
+ return rp.can_fetch(user_agent, url)
+ except Exception:
+ return True
+
+# you should always respect robots.txt, but if you are trying to do something with this spider I guess you can
+# disable it. please don't do this en masse though, that's naughty.
+
+def search_url(url, filepath, respect_robots_txt=True):
+ user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'
+ if respect_robots_txt:
+ if not is_allowed(url,user_agent):
+ print(f"Can't crawl {url} due to robots.txt violation")
+ return "", "", []
+
links = set()
+
headers = {
- 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0',
+ 'User-Agent': user_agent,
}
+
try:
source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory
- if source_code.status_code != 200:
+ if not source_code.ok:
+ print(f'Status code not 2xx for {url}, returning.')
+ return "", "", []
+
+ content_type = source_code.headers.get('Content-Type', '')
+ if 'text/html' not in content_type:
+ print(f'Content type for {url} not html, returning.')
return "", "", []
+
soup = BeautifulSoup(source_code.content, 'html.parser')
content = soup.prettify()
diff --git a/indexing/tf.py b/indexing/tf.py
@@ -29,6 +29,9 @@ if __name__ == "__main__":
# considerations can be made for the necessity of this, but I think this is safe for now.
# TODO: Possibly add idf imputation during for searches on terms that don't exist in the term table.
+
+ # TODO: update indexed status for sites and only use that status for indexing, not every file with get_filepaths
+
for i in tqdm(range(0,len(filepaths))):
filepath = filepaths[i]
# this returns the tfs for all terms from terms that exist in the current document (filepath)