Respect robots.txt - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit 6c374cfe6d87c097ed0f3fe655d4b5ce0cc68837
parent 6654bdb0e73dbb6f1328cd341aed8f23936dcb96
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Thu,  1 Jan 2026 16:56:57 -0600

Respect robots.txt

Diffstat:
M collection/spider.py  | 41 +++++++++++++++++++++++++++++++++++++----
M indexing/tf.py  | 3 +++

2 files changed, 40 insertions(+), 4 deletions(-)
diff --git a/collection/spider.py b/collection/spider.py
@@ -9,6 +9,9 @@
 # using a queue to remove this additional logic. We can reuse that to populate the queue, but that is
 # unrelated.
 
+import urllib.robotparser
+from urllib.parse import urlparse
+import urllib.request
 import requests
 import os
 import datetime
@@ -43,7 +46,7 @@ import time
 
 # bytes
 MAX_SIZE = 2_000_000
-MAX_WORKERS = 5
+MAX_WORKERS = 250
 MAX_URLS_PER_SITE = 100
 NOT_INDEXED = 0
 INDEXED = 1
@@ -56,15 +59,45 @@ def should_queue(url, cur):
     """, (url, cutoff))
     return cur.fetchone() is None
 
-def search_url(url, filepath):
+def is_allowed(url, user_agent, timeout=.1):
+    try:
+        parsed = urlparse(url)
+        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+        rp = urllib.robotparser.RobotFileParser()
+        rp.set_url(robots_url)
+        with urllib.request.urlopen(robots_url, timeout=timeout) as response:
+            rp.parse(response.read().decode('utf-8').splitlines())
+        return rp.can_fetch(user_agent, url)
+    except Exception:
+        return True
+
+# you should always respect robots.txt, but if you are trying to do something with this spider I guess you can
+# disable it. please don't do this en masse though, that's naughty.
+
+def search_url(url, filepath, respect_robots_txt=True):
+    user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'
+    if respect_robots_txt:
+        if not is_allowed(url,user_agent):
+            print(f"Can't crawl {url} due to robots.txt violation")
+            return "", "", []
+
     links = set()
+
     headers = {
-        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0',
+        'User-Agent': user_agent,
     }
+
     try:
         source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory
-        if source_code.status_code != 200:
+        if not source_code.ok:
+            print(f'Status code not 2xx for {url}, returning.')
+            return "", "", []
+        
+        content_type = source_code.headers.get('Content-Type', '')
+        if 'text/html' not in content_type:
+            print(f'Content type for {url} not html, returning.')
             return "", "", []
+
         soup = BeautifulSoup(source_code.content, 'html.parser')
         content = soup.prettify()
 
diff --git a/indexing/tf.py b/indexing/tf.py
@@ -29,6 +29,9 @@ if __name__ == "__main__":
     # considerations can be made for the necessity of this, but I think this is safe for now.
     # TODO: Possibly add idf imputation during for searches on terms that don't exist in the term table.
 
+
+    # TODO: update indexed status for sites and only use that status for indexing, not every file with get_filepaths
+
     for i in tqdm(range(0,len(filepaths))):
         filepath = filepaths[i]
         # this returns the tfs for all terms from terms that exist in the current document (filepath)

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

M	collection/spider.py	\|	41	+++++++++++++++++++++++++++++++++++++----
M	indexing/tf.py	\|	3	+++