Started making this better. No sqlite, better code, all that. Still needs some refactoring though.... - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit 7067745bb56e6e07eabb2574447937fbc0ec3779
parent 0547119084bf9b2df319d703a9e30f116e940c66
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Sat,  3 Jan 2026 02:18:03 -0600

Started making this better. No sqlite, better code, all that. Still needs some refactoring though....

Diffstat:
A Makefile  | 2 ++
A crawling/README.md  | 13 +++++++++++++
A crawling/clean.py  | 32 ++++++++++++++++++++++++++++++++
A crawling/constants.py  | 8 ++++++++
A crawling/spider.py  | 290 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A seeds/code.txt  | 7 +++++++
A seeds/dictionaries.txt  | 4 ++++
A seeds/music.txt  | 5 +++++
A seeds/otr.txt  | 8 ++++++++
A seeds/piracy.txt  | 6 ++++++
A seeds/research.txt  | 7 +++++++
A seeds/wikis.txt  | 6 ++++++
A setup.sh  | 12 ++++++++++++

13 files changed, 400 insertions(+), 0 deletions(-)
diff --git a/Makefile b/Makefile
@@ -0,0 +1,2 @@
+# `clean` never produces a file called "clean". Declaring it phony makes sure
+# the recipe still runs even if a file with that name appears in the repo root.
+.PHONY: clean
+clean:
+	python3 crawling/clean.py
diff --git a/crawling/README.md b/crawling/README.md
@@ -0,0 +1,13 @@
+# Crawling
+
+The purpose of this directory is to facilitate web crawling. This directory should suffice to set up your crawler and your queueing database.
+
+## DB
+
+The backing database for queueing is PostgreSQL for consistency. The database name is 'crawling'.
+
+
+### Schema
+
+crawling:
+    - queued_site(url, creation_timestamp, status, claimed_at, crawl_requests, depth)
diff --git a/crawling/clean.py b/crawling/clean.py
@@ -0,0 +1,32 @@
+from spider import get_crawling_db_connection
+import os
+from spider import get_indexing_db_connection
+
+
+if __name__ == "__main__":
+    try:
+        os.rmdir("/var/lib/search/crawl_cache")
+    except FileNotFoundError as e:
+        print("Crawl cache directory doesn't exist, continuing with cleanup")
+    crawling_conn = get_crawling_db_connection()
+    
+    crawling_cur = crawling_conn.cursor()
+    crawling_cur.execute("""
+        DROP TABLE queued_site;
+    """)
+    crawling_cur.close()
+    crawling_conn.commit()
+    crawling_conn.close()
+
+    print("Crawling datbase cleaned")
+
+    indexing_conn = get_indexing_db_connection()
+    indexing_cur = indexing_conn.cursor()
+    indexing_cur.execute("""
+        DROP TABLE indexing_queue;
+    """)
+    indexing_cur.close()
+    indexing_conn.commit()
+    indexing_conn.close()
+
+    print("Indexing datbase cleaned")
diff --git a/crawling/constants.py b/crawling/constants.py
@@ -0,0 +1,8 @@
+# CONSTANTS FOR CRAWLING, INDEXING, and SEARCHING
+# Directory in which the spider stores each fetched page as a separate file.
+CACHE_DIRECTORY = "/var/lib/search/crawl_cache"
+# Names of the PostgreSQL databases holding the crawl queue and the indexing queue.
+CRAWLING_DB = "crawling"
+INDEXING_DB = "indexing"
+# Name of the environment variable that holds the database password
+# (the variable's name, not the password itself).
+DB_PASSWORD_ENV_VAR = "CRAWLING_DB_PASSWORD"
+# Port and host used when connecting to either database.
+DB_PORT = 5432
+DB_HOST = 'localhost'
+# Name of the environment variable that holds the database user name.
+DB_USER_ENV_VAR = 'CRAWLING_DB_USER'
diff --git a/crawling/spider.py b/crawling/spider.py
@@ -0,0 +1,290 @@
+import urllib.robotparser
+from urllib.parse import urlparse
+import urllib.request
+import requests
+import os
+from urllib.parse import urljoin, urlparse
+from bs4 import BeautifulSoup
+import sys
+import psycopg2
+from constants import CRAWLING_DB 
+from constants import INDEXING_DB
+from constants import CACHE_DIRECTORY
+from constants import DB_PASSWORD_ENV_VAR
+from constants import DB_USER_ENV_VAR
+from constants import DB_HOST
+from constants import DB_PORT
+import urllib
+import uuid
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# Number of queued URLs claimed from the database for each crawl batch.
+LINK_SELECTION_COUNT = 500
+# A page is only cached when its prettified HTML, encoded as UTF-8, is
+# smaller than this many bytes.
+MAX_SITE_SIZE = 2_000_000
+# Upper bound on the number of outgoing links collected from one crawled page.
+MAX_URLS_PER_SITE = 100
+# Number of concurrent workers for the thread pool executor.
+MAX_WORKERS = 50
+
+# TODO: Only queue if we haven't already indexed it recently.
+def queue_urls_for_crawling(conn, urls, prior_depth):
+    current_depth = prior_depth + 1
+    for url in urls:
+        insert_or_increment_urls(conn, {url: current_depth})
+
+def is_allowed(url, user_agent, timeout=1):
+    try:
+        parsed = urlparse(url)
+        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+        rp = urllib.robotparser.RobotFileParser()
+        rp.set_url(robots_url)
+        with urllib.request.urlopen(robots_url, timeout=timeout) as response:
+            rp.parse(response.read().decode('utf-8').splitlines())
+        return rp.can_fetch(user_agent, url)
+    except Exception:
+        return True
+
+# TODO: How can we limit the request size prior to loading it into memory?
+def crawl_url(url, filepath):
+    links = set()
+    written_to_fs = False
+
+    user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'
+    if not is_allowed(url,user_agent):
+        print(f"Can't crawl {url} due to robots.txt violation")
+        return written_to_fs, links
+
+
+    headers = {
+        'User-Agent': user_agent,
+    }
+
+    try:
+        source_code = requests.get(url, headers=headers, timeout=1)
+        if not source_code.ok:
+            print(f'Status code not 2xx for {url}, returning.')
+            return written_to_fs, links
+        
+        content_type = source_code.headers.get('Content-Type', '')
+        if 'text/html' not in content_type:
+            print(f'Content type for {url} not html, returning.')
+            return False, links
+
+        soup = BeautifulSoup(source_code.content, 'html.parser')
+        content = soup.prettify()
+
+        if len(content.encode('utf-8')) < MAX_SITE_SIZE:
+            with open(filepath, 'w') as f:
+                f.write(content)
+                written_to_fs = True
+                print(f'Wrote {url} to {filepath}')
+        else:
+            print(f'skipping fs write for {url}, too large')
+            return written_to_fs, links
+    except Exception as e:
+        print(e)
+        return written_to_fs, links
+
+    current_url_without_fragment = urlparse(url)._replace(fragment='').geturl()
+
+    # find all links < max_urls_per_site that direct to a different page.
+    for link in soup.find_all('a', href=True):
+            href = link.get('href')
+
+            if href.startswith('#'):
+                continue
+
+            absolute_url = urljoin(url, href)
+            parsed = urlparse(absolute_url)
+
+            url_without_fragment = parsed._replace(fragment='').geturl()
+            
+            if url_without_fragment == current_url_without_fragment:
+                continue
+
+            if parsed.scheme in ('http', 'https'):
+                if len(links) < MAX_URLS_PER_SITE:
+                    links.add(absolute_url)
+
+    assert written_to_fs == True
+    return written_to_fs, links
+
+def crawl(url):
+    filepath = CACHE_DIRECTORY + "/" + str(uuid.uuid4())
+    success, links = crawl_url(url, filepath)
+    return url, success, filepath, links
+
+# urls: {url: depth}
+def insert_or_increment_urls(conn, urls_dict):
+    cursor = conn.cursor()
+    for url in urls_dict:
+        depth = urls_dict[url]
+        query = """
+            INSERT INTO queued_site (url, depth)
+            VALUES (%s, %s)
+            ON CONFLICT (url) DO UPDATE
+            SET depth = LEAST(queued_site.depth, EXCLUDED.depth),
+            crawl_requests = queued_site.crawl_requests + 1
+        """
+        cursor.execute(query, (url, depth))
+
+    cursor.close()
+    conn.commit()
+
+def move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing):
+    cursor = conn.cursor()
+    delete_query = """
+        DELETE FROM queued_site
+        WHERE url = %s
+    """
+    cursor.execute(delete_query, (url,))
+    cursor.close()
+
+    if success:
+        assert os.path.isfile(filepath)
+
+        cursor_indexing = conn_indexing.cursor()
+        cursor_indexing.execute("SELECT filepath FROM indexing_queue WHERE url = %s", (url,))
+        existing = cursor_indexing.fetchone()
+        old_filepath = existing[0] if existing else None
+        upsert_query = """
+            INSERT INTO indexing_queue (url, filepath) VALUES (%s, %s)
+            ON CONFLICT (url) DO UPDATE SET filepath = EXCLUDED.filepath
+        """
+        cursor_indexing.execute(upsert_query, (url, filepath))
+        if old_filepath and os.path.isfile(old_filepath):
+            os.remove(old_filepath)
+        cursor_indexing.close()
+        conn_indexing.commit()
+
+    conn.commit()
+
+
+def get_k_urls_with_depth_from_db(conn, k):
+    cursor = conn.cursor()
+
+    # TODO: Improve this to make use of all attributes we have.
+    select_top_priority_elements_query = """
+        UPDATE queued_site 
+        SET status = 'processing', claimed_at = NOW()
+        WHERE (url) IN (
+            SELECT url
+            FROM queued_site 
+            WHERE status = 'pending' 
+            ORDER BY depth DESC, creation_timestamp ASC 
+            LIMIT %s
+        )
+        RETURNING url, depth;
+    """
+    cursor.execute(select_top_priority_elements_query, (k,))
+    result = cursor.fetchall()
+    conn.commit()
+    result = {res[0] : res[1] for res in result}
+    cursor.close()
+    return result
+
+def ensure_indexing_queue_and_get_connection(db_name, db_user, db_password, db_host, db_port):
+    conn = psycopg2.connect(
+        database=db_name,
+        user=db_user,
+        password=db_password,
+        host=db_host,
+        port=db_port
+    )
+    cursor = conn.cursor()
+    create_table_query = """
+        CREATE TABLE IF NOT EXISTS indexing_queue (
+            url TEXT PRIMARY KEY,
+            filepath TEXT NOT NULL,
+            creation_timestamp TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP NOT NULL
+        );
+    """
+
+    cursor.execute(create_table_query)
+    conn.commit()
+    cursor.close()
+    return conn
+
+def ensure_queued_sites_and_get_connection(db_name, db_user, db_password, db_host, db_port):
+    conn = psycopg2.connect(
+        database=db_name,
+        user=db_user,
+        password=db_password,
+        host=db_host,
+        port=db_port
+    )
+    cursor = conn.cursor()
+
+    # If a site can't be reached it will still be removed from the db.
+    # The status is only for spiders that stop mid execution, and to ensure
+    # multiple spiders don't grab the same url.
+
+    # TODO: See above, add logic to unset claimed status after certain amount of time.
+
+    create_table_query = """
+        CREATE TABLE IF NOT EXISTS queued_site (
+            url TEXT PRIMARY KEY,
+            creation_timestamp TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP NOT NULL,
+            status TEXT DEFAULT 'pending' NOT NULL
+                CHECK (status IN ('pending', 'processing')),
+            claimed_at TIMESTAMPTZ,
+            crawl_requests INTEGER DEFAULT 1 NOT NULL CHECK (crawl_requests >= 1),
+            depth INTEGER NOT NULL CHECK (depth >= 0)
+        );
+    """
+
+    cursor.execute(create_table_query)
+    conn.commit()
+    cursor.close()
+    return conn
+
+def insert_seed_file(filepath, conn):
+    with open(filepath, 'r') as f:
+        urls = f.readlines()
+
+    to_insert = {}
+    for url in urls:
+        to_insert[url.strip()] = 0
+
+    insert_or_increment_urls(conn, to_insert)
+
+def get_crawling_db_connection():
+    password = os.getenv(DB_PASSWORD_ENV_VAR)
+    username = os.getenv(DB_USER_ENV_VAR)
+    conn = ensure_queued_sites_and_get_connection(CRAWLING_DB, username, password, DB_HOST, DB_PORT)
+    return conn
+
+def get_indexing_db_connection():
+    password = os.getenv(DB_PASSWORD_ENV_VAR)
+    username = os.getenv(DB_USER_ENV_VAR)
+    conn_indexing_queue = ensure_indexing_queue_and_get_connection(INDEXING_DB, username, password, DB_HOST, DB_PORT)
+    return conn_indexing_queue
+
+if __name__ == "__main__":
+    if not os.path.exists(CACHE_DIRECTORY):
+        os.makedirs(CACHE_DIRECTORY)
+
+    conn = get_crawling_db_connection()
+    conn_indexing_queue = get_indexing_db_connection()
+
+    if len(sys.argv) > 1:
+        for filepath in sys.argv[1:]:
+            insert_seed_file(filepath, conn)
+            print(f"Inserted urls from {filepath}")
+
+    while True:
+        urls_dict = get_k_urls_with_depth_from_db(conn, LINK_SELECTION_COUNT)
+        if len(urls_dict) == 0:
+            print('No URLs to search... Exiting (if you are just starting, try passing in a seed file)')
+            break
+
+        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            futures = {executor.submit(crawl, url): url for url in urls_dict}
+            for future in as_completed(futures):
+                url, success, filepath, links = future.result()
+                # success means the html was written to the filepath
+                move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing_queue)
+                current_urls_depth = urls_dict[url]
+                queue_urls_for_crawling(conn, links, current_urls_depth)
+
+    conn.close()
+    conn_indexing_queue.close()
diff --git a/seeds/code.txt b/seeds/code.txt
@@ -0,0 +1,7 @@
+https://www.die.net/
+https://en.wikipedia.org/wiki/List_of_programming_languages
+https://ziglang.org/
+https://rust-lang.org/
+https://www.python.org/
+https://cppreference.com/
+https://cplusplus.com/reference/
diff --git a/seeds/dictionaries.txt b/seeds/dictionaries.txt
@@ -0,0 +1,4 @@
+https://www.dictionary.com
+https://www.merriam-webster.com
+https://www.oed.com
+https://dictionary.cambridge.org
diff --git a/seeds/music.txt b/seeds/music.txt
@@ -0,0 +1,5 @@
+https://open.spotify.com/
+https://www.tunecore.com/
+https://www.allmusic.com/
+https://musicbrainz.org/
+https://www.google.com/search?q=music+indexing+sites
diff --git a/seeds/otr.txt b/seeds/otr.txt
@@ -0,0 +1,8 @@
+https://laack.co
+https://arstechnica.com
+https://geohot.github.io/blog
+https://suckless.org
+https://blog.laack.co
+https://stevana.github.io
+https://lukesmith.xyz
+https://github.com/sindresorhus/awesome
diff --git a/seeds/piracy.txt b/seeds/piracy.txt
@@ -0,0 +1,6 @@
+https://annas-archive.org/
+https://libgen.ac/
+https://forum.mobilism.me/
+https://github.com/Igglybuff/awesome-piracy
+https://www.reddit.com/r/CuratedTumblr/comments/1e63sew/for_those_too_lazy_to_check_the_rpiracy_megathread/
+https://sci-hub.se/
diff --git a/seeds/research.txt b/seeds/research.txt
@@ -0,0 +1,7 @@
+https://xlinux.nist.gov/dads/
+https://thimbleby.gitlab.io/algorithm-wiki-site/
+https://www.kaggle.com/
+https://arxiv.org/
+https://research.com/journals-rankings/computer-science
+https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=eng
+https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=phy
diff --git a/seeds/wikis.txt b/seeds/wikis.txt
@@ -0,0 +1,6 @@
+https://en.wikipedia.org/wiki/Main_Page
+https://archlinux.org/
+https://wiki.ubuntu.com/
+https://repair.wiki/w/Main_Page#gsc.tab=0
+https://stackoverflow.com
+https://stackexchange.com
diff --git a/setup.sh b/setup.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Create the data directory that holds the crawl cache and hand it to the
+# invoking user, so the crawler can write there without root.
+sudo mkdir -p /var/lib/search/
+# Quote the expansion so the user name is always passed as a single argument.
+sudo chown "$USER" /var/lib/search/
+
+# TODO: Automate creation process (should be possible with psycopg2)
+# Sign into postgres
+# Create user
+# createdb crawling
+# createdb indexing
+# createdb search
+# TODO: Also,

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

A	Makefile	\|	2	++
A	crawling/README.md	\|	13	+++++++++++++
A	crawling/clean.py	\|	32	++++++++++++++++++++++++++++++++
A	crawling/constants.py	\|	8	++++++++
A	crawling/spider.py	\|	290	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	seeds/code.txt	\|	7	+++++++
A	seeds/dictionaries.txt	\|	4	++++
A	seeds/music.txt	\|	5	+++++
A	seeds/otr.txt	\|	8	++++++++
A	seeds/piracy.txt	\|	6	++++++
A	seeds/research.txt	\|	7	+++++++
A	seeds/wikis.txt	\|	6	++++++
A	setup.sh	\|	12	++++++++++++