commit 7067745bb56e6e07eabb2574447937fbc0ec3779
parent 0547119084bf9b2df319d703a9e30f116e940c66
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Sat, 3 Jan 2026 02:18:03 -0600
Started making this better. No sqlite, better code, all that. Still needs some refactoring though....
Diffstat:
13 files changed, 400 insertions(+), 0 deletions(-)
diff --git a/Makefile b/Makefile
@@ -0,0 +1,2 @@
+# `clean` is a command, not a file: declare it phony so a stray file named
+# "clean" in the repository root can never make this target look up to date.
+.PHONY: clean
+
+# Remove the crawl cache and drop the crawling/indexing queue tables.
+clean:
+	python3 crawling/clean.py
diff --git a/crawling/README.md b/crawling/README.md
@@ -0,0 +1,13 @@
+# Crawling
+
+The purpose of this directory is to facilitate web crawling. It should contain everything needed to set up your crawler and its queueing database.
+
+## DB
+
+The backing database for queueing is PostgreSQL, chosen for consistency. The database name is 'crawling'.
+
+
+### Schema
+
+crawling:
+ - queued_site(url, creation_timestamp, status, claimed_at, crawl_requests, depth)
diff --git a/crawling/clean.py b/crawling/clean.py
@@ -0,0 +1,32 @@
+from spider import get_crawling_db_connection
+import os
+from spider import get_indexing_db_connection
+
+
+if __name__ == "__main__":
+ try:
+ os.rmdir("/var/lib/search/crawl_cache")
+ except FileNotFoundError as e:
+ print("Crawl cache directory doesn't exist, continuing with cleanup")
+ crawling_conn = get_crawling_db_connection()
+
+ crawling_cur = crawling_conn.cursor()
+ crawling_cur.execute("""
+ DROP TABLE queued_site;
+ """)
+ crawling_cur.close()
+ crawling_conn.commit()
+ crawling_conn.close()
+
+ print("Crawling datbase cleaned")
+
+ indexing_conn = get_indexing_db_connection()
+ indexing_cur = indexing_conn.cursor()
+ indexing_cur.execute("""
+ DROP TABLE indexing_queue;
+ """)
+ indexing_cur.close()
+ indexing_conn.commit()
+ indexing_conn.close()
+
+ print("Indexing datbase cleaned")
diff --git a/crawling/constants.py b/crawling/constants.py
@@ -0,0 +1,8 @@
+# CONSTANTS FOR CRAWLING, INDEXING, and SEARCHING
+# Directory in which the spider stores the HTML of every crawled page
+# (one file per page).
+CACHE_DIRECTORY = "/var/lib/search/crawl_cache"
+# Names of the PostgreSQL databases holding the crawl queue and the
+# indexing queue respectively.
+CRAWLING_DB = "crawling"
+INDEXING_DB = "indexing"
+# Name of the environment variable the database password is read from
+# (the same credentials are used for both databases).
+DB_PASSWORD_ENV_VAR = "CRAWLING_DB_PASSWORD"
+# Host and port of the PostgreSQL server.
+DB_PORT = 5432
+DB_HOST = 'localhost'
+# Name of the environment variable the database user name is read from.
+DB_USER_ENV_VAR = 'CRAWLING_DB_USER'
diff --git a/crawling/spider.py b/crawling/spider.py
@@ -0,0 +1,290 @@
+import urllib.robotparser
+from urllib.parse import urlparse
+import urllib.request
+import requests
+import os
+from urllib.parse import urljoin, urlparse
+from bs4 import BeautifulSoup
+import sys
+import psycopg2
+from constants import CRAWLING_DB
+from constants import INDEXING_DB
+from constants import CACHE_DIRECTORY
+from constants import DB_PASSWORD_ENV_VAR
+from constants import DB_USER_ENV_VAR
+from constants import DB_HOST
+from constants import DB_PORT
+import urllib
+import uuid
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# Number of pending URLs claimed from the queue per batch of the main loop.
+LINK_SELECTION_COUNT = 500
+# Upper bound, in bytes of UTF-8 encoded prettified HTML, for a page to be
+# written to the cache; pages at or above this size are skipped.
+MAX_SITE_SIZE = 2_000_000
+# Maximum number of outgoing links collected from a single crawled page.
+MAX_URLS_PER_SITE = 100
+# Number of concurrent workers for the thread pool executor that fetches pages.
+MAX_WORKERS = 50
+
+# TODO: Only queue if we haven't already indexed it recently.
+def queue_urls_for_crawling(conn, urls, prior_depth):
+ current_depth = prior_depth + 1
+ for url in urls:
+ insert_or_increment_urls(conn, {url: current_depth})
+
+def is_allowed(url, user_agent, timeout=1):
+ try:
+ parsed = urlparse(url)
+ robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+ rp = urllib.robotparser.RobotFileParser()
+ rp.set_url(robots_url)
+ with urllib.request.urlopen(robots_url, timeout=timeout) as response:
+ rp.parse(response.read().decode('utf-8').splitlines())
+ return rp.can_fetch(user_agent, url)
+ except Exception:
+ return True
+
+# TODO: How can we limit the request size prior to loading it into memory?
+def crawl_url(url, filepath):
+ links = set()
+ written_to_fs = False
+
+ user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'
+ if not is_allowed(url,user_agent):
+ print(f"Can't crawl {url} due to robots.txt violation")
+ return written_to_fs, links
+
+
+ headers = {
+ 'User-Agent': user_agent,
+ }
+
+ try:
+ source_code = requests.get(url, headers=headers, timeout=1)
+ if not source_code.ok:
+ print(f'Status code not 2xx for {url}, returning.')
+ return written_to_fs, links
+
+ content_type = source_code.headers.get('Content-Type', '')
+ if 'text/html' not in content_type:
+ print(f'Content type for {url} not html, returning.')
+ return False, links
+
+ soup = BeautifulSoup(source_code.content, 'html.parser')
+ content = soup.prettify()
+
+ if len(content.encode('utf-8')) < MAX_SITE_SIZE:
+ with open(filepath, 'w') as f:
+ f.write(content)
+ written_to_fs = True
+ print(f'Wrote {url} to {filepath}')
+ else:
+ print(f'skipping fs write for {url}, too large')
+ return written_to_fs, links
+ except Exception as e:
+ print(e)
+ return written_to_fs, links
+
+ current_url_without_fragment = urlparse(url)._replace(fragment='').geturl()
+
+ # find all links < max_urls_per_site that direct to a different page.
+ for link in soup.find_all('a', href=True):
+ href = link.get('href')
+
+ if href.startswith('#'):
+ continue
+
+ absolute_url = urljoin(url, href)
+ parsed = urlparse(absolute_url)
+
+ url_without_fragment = parsed._replace(fragment='').geturl()
+
+ if url_without_fragment == current_url_without_fragment:
+ continue
+
+ if parsed.scheme in ('http', 'https'):
+ if len(links) < MAX_URLS_PER_SITE:
+ links.add(absolute_url)
+
+ assert written_to_fs == True
+ return written_to_fs, links
+
+def crawl(url):
+ filepath = CACHE_DIRECTORY + "/" + str(uuid.uuid4())
+ success, links = crawl_url(url, filepath)
+ return url, success, filepath, links
+
+# urls: {url: depth}
+def insert_or_increment_urls(conn, urls_dict):
+ cursor = conn.cursor()
+ for url in urls_dict:
+ depth = urls_dict[url]
+ query = """
+ INSERT INTO queued_site (url, depth)
+ VALUES (%s, %s)
+ ON CONFLICT (url) DO UPDATE
+ SET depth = LEAST(queued_site.depth, EXCLUDED.depth),
+ crawl_requests = queued_site.crawl_requests + 1
+ """
+ cursor.execute(query, (url, depth))
+
+ cursor.close()
+ conn.commit()
+
+def move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing):
+ cursor = conn.cursor()
+ delete_query = """
+ DELETE FROM queued_site
+ WHERE url = %s
+ """
+ cursor.execute(delete_query, (url,))
+ cursor.close()
+
+ if success:
+ assert os.path.isfile(filepath)
+
+ cursor_indexing = conn_indexing.cursor()
+ cursor_indexing.execute("SELECT filepath FROM indexing_queue WHERE url = %s", (url,))
+ existing = cursor_indexing.fetchone()
+ old_filepath = existing[0] if existing else None
+ upsert_query = """
+ INSERT INTO indexing_queue (url, filepath) VALUES (%s, %s)
+ ON CONFLICT (url) DO UPDATE SET filepath = EXCLUDED.filepath
+ """
+ cursor_indexing.execute(upsert_query, (url, filepath))
+ if old_filepath and os.path.isfile(old_filepath):
+ os.remove(old_filepath)
+ cursor_indexing.close()
+ conn_indexing.commit()
+
+ conn.commit()
+
+
+def get_k_urls_with_depth_from_db(conn, k):
+ cursor = conn.cursor()
+
+ # TODO: Improve this to make use of all attributes we have.
+ select_top_priority_elements_query = """
+ UPDATE queued_site
+ SET status = 'processing', claimed_at = NOW()
+ WHERE (url) IN (
+ SELECT url
+ FROM queued_site
+ WHERE status = 'pending'
+ ORDER BY depth DESC, creation_timestamp ASC
+ LIMIT %s
+ )
+ RETURNING url, depth;
+ """
+ cursor.execute(select_top_priority_elements_query, (k,))
+ result = cursor.fetchall()
+ conn.commit()
+ result = {res[0] : res[1] for res in result}
+ cursor.close()
+ return result
+
+def ensure_indexing_queue_and_get_connection(db_name, db_user, db_password, db_host, db_port):
+ conn = psycopg2.connect(
+ database=db_name,
+ user=db_user,
+ password=db_password,
+ host=db_host,
+ port=db_port
+ )
+ cursor = conn.cursor()
+ create_table_query = """
+ CREATE TABLE IF NOT EXISTS indexing_queue (
+ url TEXT PRIMARY KEY,
+ filepath TEXT NOT NULL,
+ creation_timestamp TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP NOT NULL
+ );
+ """
+
+ cursor.execute(create_table_query)
+ conn.commit()
+ cursor.close()
+ return conn
+
+def ensure_queued_sites_and_get_connection(db_name, db_user, db_password, db_host, db_port):
+ conn = psycopg2.connect(
+ database=db_name,
+ user=db_user,
+ password=db_password,
+ host=db_host,
+ port=db_port
+ )
+ cursor = conn.cursor()
+
+ # If a site can't be reached it will still be removed from the db.
+ # The status is only for spiders that stop mid execution, and to ensure
+ # multiple spiders don't grab the same url.
+
+ # TODO: See above, add logic to unset claimed status after certain amount of time.
+
+ create_table_query = """
+ CREATE TABLE IF NOT EXISTS queued_site (
+ url TEXT PRIMARY KEY,
+ creation_timestamp TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP NOT NULL,
+ status TEXT DEFAULT 'pending' NOT NULL
+ CHECK (status IN ('pending', 'processing')),
+ claimed_at TIMESTAMPTZ,
+ crawl_requests INTEGER DEFAULT 1 NOT NULL CHECK (crawl_requests >= 1),
+ depth INTEGER NOT NULL CHECK (depth >= 0)
+ );
+ """
+
+ cursor.execute(create_table_query)
+ conn.commit()
+ cursor.close()
+ return conn
+
+def insert_seed_file(filepath, conn):
+ with open(filepath, 'r') as f:
+ urls = f.readlines()
+
+ to_insert = {}
+ for url in urls:
+ to_insert[url.strip()] = 0
+
+ insert_or_increment_urls(conn, to_insert)
+
+def get_crawling_db_connection():
+ password = os.getenv(DB_PASSWORD_ENV_VAR)
+ username = os.getenv(DB_USER_ENV_VAR)
+ conn = ensure_queued_sites_and_get_connection(CRAWLING_DB, username, password, DB_HOST, DB_PORT)
+ return conn
+
+def get_indexing_db_connection():
+ password = os.getenv(DB_PASSWORD_ENV_VAR)
+ username = os.getenv(DB_USER_ENV_VAR)
+ conn_indexing_queue = ensure_indexing_queue_and_get_connection(INDEXING_DB, username, password, DB_HOST, DB_PORT)
+ return conn_indexing_queue
+
+if __name__ == "__main__":
+ if not os.path.exists(CACHE_DIRECTORY):
+ os.makedirs(CACHE_DIRECTORY)
+
+ conn = get_crawling_db_connection()
+ conn_indexing_queue = get_indexing_db_connection()
+
+ if len(sys.argv) > 1:
+ for filepath in sys.argv[1:]:
+ insert_seed_file(filepath, conn)
+ print(f"Inserted urls from {filepath}")
+
+ while True:
+ urls_dict = get_k_urls_with_depth_from_db(conn, LINK_SELECTION_COUNT)
+ if len(urls_dict) == 0:
+ print('No URLs to search... Exiting (if you are just starting, try passing in a seed file)')
+ break
+
+ with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+ futures = {executor.submit(crawl, url): url for url in urls_dict}
+ for future in as_completed(futures):
+ url, success, filepath, links = future.result()
+ # success means the html was written to the filepath
+ move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing_queue)
+ current_urls_depth = urls_dict[url]
+ queue_urls_for_crawling(conn, links, current_urls_depth)
+
+ conn.close()
+ conn_indexing_queue.close()
diff --git a/seeds/code.txt b/seeds/code.txt
@@ -0,0 +1,7 @@
+https://www.die.net/
+https://en.wikipedia.org/wiki/List_of_programming_languages
+https://ziglang.org/
+https://rust-lang.org/
+https://www.python.org/
+https://cppreference.com/
+https://cplusplus.com/reference/
diff --git a/seeds/dictionaries.txt b/seeds/dictionaries.txt
@@ -0,0 +1,4 @@
+https://www.dictionary.com
+https://www.merriam-webster.com
+https://www.oed.com
+https://dictionary.cambridge.org
diff --git a/seeds/music.txt b/seeds/music.txt
@@ -0,0 +1,5 @@
+https://open.spotify.com/
+https://www.tunecore.com/
+https://www.allmusic.com/
+https://musicbrainz.org/
+https://www.google.com/search?q=music+indexing+sites
diff --git a/seeds/otr.txt b/seeds/otr.txt
@@ -0,0 +1,8 @@
+https://laack.co
+https://arstechnica.com
+https://geohot.github.io/blog
+https://suckless.org
+https://blog.laack.co
+https://stevana.github.io
+https://lukesmith.xyz
+https://github.com/sindresorhus/awesome
diff --git a/seeds/piracy.txt b/seeds/piracy.txt
@@ -0,0 +1,6 @@
+https://annas-archive.org/
+https://libgen.ac/
+https://forum.mobilism.me/
+https://github.com/Igglybuff/awesome-piracy
+https://www.reddit.com/r/CuratedTumblr/comments/1e63sew/for_those_too_lazy_to_check_the_rpiracy_megathread/
+https://sci-hub.se/
diff --git a/seeds/research.txt b/seeds/research.txt
@@ -0,0 +1,7 @@
+https://xlinux.nist.gov/dads/
+https://thimbleby.gitlab.io/algorithm-wiki-site/
+https://www.kaggle.com/
+https://arxiv.org/
+https://research.com/journals-rankings/computer-science
+https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=eng
+https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=phy
diff --git a/seeds/wikis.txt b/seeds/wikis.txt
@@ -0,0 +1,6 @@
+https://en.wikipedia.org/wiki/Main_Page
+https://archlinux.org/
+https://wiki.ubuntu.com/
+https://repair.wiki/w/Main_Page#gsc.tab=0
+https://stackoverflow.com
+https://stackexchange.com
diff --git a/setup.sh b/setup.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Creates the data directory used by the crawler (the crawl cache lives
+# underneath it) and hands ownership to the invoking user.
+
+# Stop at the first failing command, on unset variables, and on pipeline
+# errors, so chown never runs against a directory that could not be created.
+set -euo pipefail
+
+sudo mkdir -p /var/lib/search/
+# Quoted so an unusual user name cannot be split into several arguments.
+sudo chown "$USER" /var/lib/search/
+
+# TODO: Automate creation process (should be possible with psycopg2)
+# Sign into postgres
+# Create user
+# createdb crawling
+# createdb indexing
+# createdb search
+# TODO: Also,