information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 6c374cfe6d87c097ed0f3fe655d4b5ce0cc68837
parent 6654bdb0e73dbb6f1328cd341aed8f23936dcb96
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Thu,  1 Jan 2026 16:56:57 -0600

Respect robots.txt

Diffstat:
M collection/spider.py | 41 +++++++++++++++++++++++++++++++++++++----
M indexing/tf.py | 3 +++
2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/collection/spider.py b/collection/spider.py @@ -9,6 +9,9 @@ # using a queue to remove this additional logic. We can reuse that to populate the queue, but that is # unrelated. +import urllib.robotparser +from urllib.parse import urlparse +import urllib.request import requests import os import datetime @@ -43,7 +46,7 @@ import time # bytes MAX_SIZE = 2_000_000 -MAX_WORKERS = 5 +MAX_WORKERS = 250 MAX_URLS_PER_SITE = 100 NOT_INDEXED = 0 INDEXED = 1 @@ -56,15 +59,45 @@ def should_queue(url, cur): """, (url, cutoff)) return cur.fetchone() is None -def search_url(url, filepath): +def is_allowed(url, user_agent, timeout=.1): + try: + parsed = urlparse(url) + robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt" + rp = urllib.robotparser.RobotFileParser() + rp.set_url(robots_url) + with urllib.request.urlopen(robots_url, timeout=timeout) as response: + rp.parse(response.read().decode('utf-8').splitlines()) + return rp.can_fetch(user_agent, url) + except Exception: + return True + +# you should always repect robots.txt, but if you are trying to do something with this spider I guess you can +# disable it. please don't do this enmasse though, that's naughty. 
+ +def search_url(url, filepath, respect_robots_txt=True): + user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0' + if respect_robots_txt: + if not is_allowed(url,user_agent): + print(f"Can't crawl {url} due to robots.txt violation") + return "", "", [] + links = set() + headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0', + 'User-Agent': user_agent, } + try: source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory - if source_code.status_code != 200: + if not source_code.ok: + print(f'Status code not 2xx for {url}, returning.') + return "", "", [] + + content_type = source_code.headers.get('Content-Type', '') + if 'text/html' not in content_type: + print(f'Content type for {url} not html, returning.') return "", "", [] + soup = BeautifulSoup(source_code.content, 'html.parser') content = soup.prettify() diff --git a/indexing/tf.py b/indexing/tf.py @@ -29,6 +29,9 @@ if __name__ == "__main__": # considerations can be made for the necessity of this, but I think this is safe for now. # TODO: Possibly add idf imputation during for searches on terms that don't exist in the term table. + + # TODO: update indexed status for sites and only use that status for indexing, not every file with get_filepaths + for i in tqdm(range(0,len(filepaths))): filepath = filepaths[i] # this returns the tfs for all terms from terms that exist in the current document (filepath)