information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 7067745bb56e6e07eabb2574447937fbc0ec3779
parent 0547119084bf9b2df319d703a9e30f116e940c66
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Sat,  3 Jan 2026 02:18:03 -0600

Started making this better: no SQLite, better code, and so on. Still needs some refactoring, though.

Diffstat:
AMakefile | 2++
Acrawling/README.md | 13+++++++++++++
Acrawling/clean.py | 32++++++++++++++++++++++++++++++++
Acrawling/constants.py | 8++++++++
Acrawling/spider.py | 290+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aseeds/code.txt | 7+++++++
Aseeds/dictionaries.txt | 4++++
Aseeds/music.txt | 5+++++
Aseeds/otr.txt | 8++++++++
Aseeds/piracy.txt | 6++++++
Aseeds/research.txt | 7+++++++
Aseeds/wikis.txt | 6++++++
Asetup.sh | 12++++++++++++
13 files changed, 400 insertions(+), 0 deletions(-)

diff --git a/Makefile b/Makefile @@ -0,0 +1,2 @@ +clean: + python3 crawling/clean.py diff --git a/crawling/README.md b/crawling/README.md @@ -0,0 +1,13 @@ +# Crawling + +The purpose of this directory is to facilitate web crawling. This directory should suffice to setup your crawler and your queueing database. + +## DB + +The backing database for queueing is postgresql for consistency. The database name is 'crawling'. + + +### Schema + +crawling: + - queued_site(url, creation_timestamp, status, claimed_at, crawl_requests, depth) diff --git a/crawling/clean.py b/crawling/clean.py @@ -0,0 +1,32 @@ +from spider import get_crawling_db_connection +import os +from spider import get_indexing_db_connection + + +if __name__ == "__main__": + try: + os.rmdir("/var/lib/search/crawl_cache") + except FileNotFoundError as e: + print("Crawl cache directory doesn't exist, continuing with cleanup") + crawling_conn = get_crawling_db_connection() + + crawling_cur = crawling_conn.cursor() + crawling_cur.execute(""" + DROP TABLE queued_site; + """) + crawling_cur.close() + crawling_conn.commit() + crawling_conn.close() + + print("Crawling datbase cleaned") + + indexing_conn = get_indexing_db_connection() + indexing_cur = indexing_conn.cursor() + indexing_cur.execute(""" + DROP TABLE indexing_queue; + """) + indexing_cur.close() + indexing_conn.commit() + indexing_conn.close() + + print("Indexing datbase cleaned") diff --git a/crawling/constants.py b/crawling/constants.py @@ -0,0 +1,8 @@ +# CONSTANTS FOR CRAWLING, INDEXING, and SEARCHING +CACHE_DIRECTORY = "/var/lib/search/crawl_cache" +CRAWLING_DB = "crawling" +INDEXING_DB = "indexing" +DB_PASSWORD_ENV_VAR = "CRAWLING_DB_PASSWORD" +DB_PORT = 5432 +DB_HOST = 'localhost' +DB_USER_ENV_VAR = 'CRAWLING_DB_USER' diff --git a/crawling/spider.py b/crawling/spider.py @@ -0,0 +1,290 @@ +import urllib.robotparser +from urllib.parse import urlparse +import urllib.request +import requests +import os +from urllib.parse import urljoin, urlparse +from 
bs4 import BeautifulSoup +import sys +import psycopg2 +from constants import CRAWLING_DB +from constants import INDEXING_DB +from constants import CACHE_DIRECTORY +from constants import DB_PASSWORD_ENV_VAR +from constants import DB_USER_ENV_VAR +from constants import DB_HOST +from constants import DB_PORT +import urllib +import uuid +from concurrent.futures import ThreadPoolExecutor, as_completed + +# this is the number of links we take out of the queue +LINK_SELECTION_COUNT = 500 +MAX_SITE_SIZE = 2_000_000 +MAX_URLS_PER_SITE = 100 +# number of concurrent workers for thread pool executor +MAX_WORKERS = 50 + +# TODO: Only queue if we haven't already indexed it recently. +def queue_urls_for_crawling(conn, urls, prior_depth): + current_depth = prior_depth + 1 + for url in urls: + insert_or_increment_urls(conn, {url: current_depth}) + +def is_allowed(url, user_agent, timeout=1): + try: + parsed = urlparse(url) + robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt" + rp = urllib.robotparser.RobotFileParser() + rp.set_url(robots_url) + with urllib.request.urlopen(robots_url, timeout=timeout) as response: + rp.parse(response.read().decode('utf-8').splitlines()) + return rp.can_fetch(user_agent, url) + except Exception: + return True + +# TODO: How can we limit the request size prior to loading it into memory? 
+def crawl_url(url, filepath): + links = set() + written_to_fs = False + + user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0' + if not is_allowed(url,user_agent): + print(f"Can't crawl {url} due to robots.txt violation") + return written_to_fs, links + + + headers = { + 'User-Agent': user_agent, + } + + try: + source_code = requests.get(url, headers=headers, timeout=1) + if not source_code.ok: + print(f'Status code not 2xx for {url}, returning.') + return written_to_fs, links + + content_type = source_code.headers.get('Content-Type', '') + if 'text/html' not in content_type: + print(f'Content type for {url} not html, returning.') + return False, links + + soup = BeautifulSoup(source_code.content, 'html.parser') + content = soup.prettify() + + if len(content.encode('utf-8')) < MAX_SITE_SIZE: + with open(filepath, 'w') as f: + f.write(content) + written_to_fs = True + print(f'Wrote {url} to {filepath}') + else: + print(f'skipping fs write for {url}, too large') + return written_to_fs, links + except Exception as e: + print(e) + return written_to_fs, links + + current_url_without_fragment = urlparse(url)._replace(fragment='').geturl() + + # find all links < max_urls_per_site that direct to a different page. 
+ for link in soup.find_all('a', href=True): + href = link.get('href') + + if href.startswith('#'): + continue + + absolute_url = urljoin(url, href) + parsed = urlparse(absolute_url) + + url_without_fragment = parsed._replace(fragment='').geturl() + + if url_without_fragment == current_url_without_fragment: + continue + + if parsed.scheme in ('http', 'https'): + if len(links) < MAX_URLS_PER_SITE: + links.add(absolute_url) + + assert written_to_fs == True + return written_to_fs, links + +def crawl(url): + filepath = CACHE_DIRECTORY + "/" + str(uuid.uuid4()) + success, links = crawl_url(url, filepath) + return url, success, filepath, links + +# urls: {url: depth} +def insert_or_increment_urls(conn, urls_dict): + cursor = conn.cursor() + for url in urls_dict: + depth = urls_dict[url] + query = """ + INSERT INTO queued_site (url, depth) + VALUES (%s, %s) + ON CONFLICT (url) DO UPDATE + SET depth = LEAST(queued_site.depth, EXCLUDED.depth), + crawl_requests = queued_site.crawl_requests + 1 + """ + cursor.execute(query, (url, depth)) + + cursor.close() + conn.commit() + +def move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing): + cursor = conn.cursor() + delete_query = """ + DELETE FROM queued_site + WHERE url = %s + """ + cursor.execute(delete_query, (url,)) + cursor.close() + + if success: + assert os.path.isfile(filepath) + + cursor_indexing = conn_indexing.cursor() + cursor_indexing.execute("SELECT filepath FROM indexing_queue WHERE url = %s", (url,)) + existing = cursor_indexing.fetchone() + old_filepath = existing[0] if existing else None + upsert_query = """ + INSERT INTO indexing_queue (url, filepath) VALUES (%s, %s) + ON CONFLICT (url) DO UPDATE SET filepath = EXCLUDED.filepath + """ + cursor_indexing.execute(upsert_query, (url, filepath)) + if old_filepath and os.path.isfile(old_filepath): + os.remove(old_filepath) + cursor_indexing.close() + conn_indexing.commit() + + conn.commit() + + +def get_k_urls_with_depth_from_db(conn, k): + 
cursor = conn.cursor() + + # TODO: Improve this to make use of all attributes we have. + select_top_priority_elements_query = """ + UPDATE queued_site + SET status = 'processing', claimed_at = NOW() + WHERE (url) IN ( + SELECT url + FROM queued_site + WHERE status = 'pending' + ORDER BY depth DESC, creation_timestamp ASC + LIMIT %s + ) + RETURNING url, depth; + """ + cursor.execute(select_top_priority_elements_query, (k,)) + result = cursor.fetchall() + conn.commit() + result = {res[0] : res[1] for res in result} + cursor.close() + return result + +def ensure_indexing_queue_and_get_connection(db_name, db_user, db_password, db_host, db_port): + conn = psycopg2.connect( + database=db_name, + user=db_user, + password=db_password, + host=db_host, + port=db_port + ) + cursor = conn.cursor() + create_table_query = """ + CREATE TABLE IF NOT EXISTS indexing_queue ( + url TEXT PRIMARY KEY, + filepath TEXT NOT NULL, + creation_timestamp TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP NOT NULL + ); + """ + + cursor.execute(create_table_query) + conn.commit() + cursor.close() + return conn + +def ensure_queued_sites_and_get_connection(db_name, db_user, db_password, db_host, db_port): + conn = psycopg2.connect( + database=db_name, + user=db_user, + password=db_password, + host=db_host, + port=db_port + ) + cursor = conn.cursor() + + # If a site can't be reached it will still be removed from the db. + # The status is only for spiders that stop mid execution, and to ensure + # multiple spiders don't grab the same url. + + # TODO: See above, add logic to unset claimed status after certain amount of time. 
+ + create_table_query = """ + CREATE TABLE IF NOT EXISTS queued_site ( + url TEXT PRIMARY KEY, + creation_timestamp TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP NOT NULL, + status TEXT DEFAULT 'pending' NOT NULL + CHECK (status IN ('pending', 'processing')), + claimed_at TIMESTAMPTZ, + crawl_requests INTEGER DEFAULT 1 NOT NULL CHECK (crawl_requests >= 1), + depth INTEGER NOT NULL CHECK (depth >= 0) + ); + """ + + cursor.execute(create_table_query) + conn.commit() + cursor.close() + return conn + +def insert_seed_file(filepath, conn): + with open(filepath, 'r') as f: + urls = f.readlines() + + to_insert = {} + for url in urls: + to_insert[url.strip()] = 0 + + insert_or_increment_urls(conn, to_insert) + +def get_crawling_db_connection(): + password = os.getenv(DB_PASSWORD_ENV_VAR) + username = os.getenv(DB_USER_ENV_VAR) + conn = ensure_queued_sites_and_get_connection(CRAWLING_DB, username, password, DB_HOST, DB_PORT) + return conn + +def get_indexing_db_connection(): + password = os.getenv(DB_PASSWORD_ENV_VAR) + username = os.getenv(DB_USER_ENV_VAR) + conn_indexing_queue = ensure_indexing_queue_and_get_connection(INDEXING_DB, username, password, DB_HOST, DB_PORT) + return conn_indexing_queue + +if __name__ == "__main__": + if not os.path.exists(CACHE_DIRECTORY): + os.makedirs(CACHE_DIRECTORY) + + conn = get_crawling_db_connection() + conn_indexing_queue = get_indexing_db_connection() + + if len(sys.argv) > 1: + for filepath in sys.argv[1:]: + insert_seed_file(filepath, conn) + print(f"Inserted urls from {filepath}") + + while True: + urls_dict = get_k_urls_with_depth_from_db(conn, LINK_SELECTION_COUNT) + if len(urls_dict) == 0: + print('No URLs to search... 
Exiting (if you are just starting, try passing in a seed file)') + break + + with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: + futures = {executor.submit(crawl, url): url for url in urls_dict} + for future in as_completed(futures): + url, success, filepath, links = future.result() + # success means the html was written to the filepath + move_url_to_indexing_if_success(conn, url, filepath, success, conn_indexing_queue) + current_urls_depth = urls_dict[url] + queue_urls_for_crawling(conn, links, current_urls_depth) + + conn.close() + conn_indexing_queue.close() diff --git a/seeds/code.txt b/seeds/code.txt @@ -0,0 +1,7 @@ +https://www.die.net/ +https://en.wikipedia.org/wiki/List_of_programming_languages +https://ziglang.org/ +https://rust-lang.org/ +https://www.python.org/ +https://cppreference.com/ +https://cplusplus.com/reference/ diff --git a/seeds/dictionaries.txt b/seeds/dictionaries.txt @@ -0,0 +1,4 @@ +https://www.dictionary.com +https://www.merriam-webster.com +https://www.oed.com +https://dictionary.cambridge.org diff --git a/seeds/music.txt b/seeds/music.txt @@ -0,0 +1,5 @@ +https://open.spotify.com/ +https://www.tunecore.com/ +https://www.allmusic.com/ +https://musicbrainz.org/ +https://www.google.com/search?q=music+indexing+sites diff --git a/seeds/otr.txt b/seeds/otr.txt @@ -0,0 +1,8 @@ +https://laack.co +https://arstechnica.com +https://geohot.github.io/blog +https://suckless.org +https://blog.laack.co +https://stevana.github.io +https://lukesmith.xyz +https://github.com/sindresorhus/awesome diff --git a/seeds/piracy.txt b/seeds/piracy.txt @@ -0,0 +1,6 @@ +https://annas-archive.org/ +https://libgen.ac/ +https://forum.mobilism.me/ +https://github.com/Igglybuff/awesome-piracy +https://www.reddit.com/r/CuratedTumblr/comments/1e63sew/for_those_too_lazy_to_check_the_rpiracy_megathread/ +https://sci-hub.se/ diff --git a/seeds/research.txt b/seeds/research.txt @@ -0,0 +1,7 @@ +https://xlinux.nist.gov/dads/ 
+https://thimbleby.gitlab.io/algorithm-wiki-site/ +https://www.kaggle.com/ +https://arxiv.org/ +https://research.com/journals-rankings/computer-science +https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=eng +https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=phy diff --git a/seeds/wikis.txt b/seeds/wikis.txt @@ -0,0 +1,6 @@ +https://en.wikipedia.org/wiki/Main_Page +https://archlinux.org/ +https://wiki.ubuntu.com/ +https://repair.wiki/w/Main_Page#gsc.tab=0 +https://stackoverflow.com +https://stackexchange.com diff --git a/setup.sh b/setup.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +sudo mkdir -p /var/lib/search/ +sudo chown $USER /var/lib/search/ + +# TODO: Automate creation process (should be possible with psycopg2) +# Sign into postgres +# Create user +# createdb crawling +# createdb indexing +# createdb search +# TODO: Also,