information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 1b5bb8f087a72d206aa5b50669676e519509a755
parent 6d3b8448a30a10843c1f3d016516729060da39ad
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Wed, 31 Dec 2025 00:18:13 -0600

Improved crawling

Diffstat:
M .gitignore | 2 ++
M collection/spider.py | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M metrics/tf-idf.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M seeds/code.txt | 2 ++
A seeds/music.txt | 5 +++++
A seeds/otr.txt | 8 ++++++++
A seeds/piracy.txt | 6 ++++++
M seeds/research.txt | 6 +++++-
M seeds/wikis.txt | 5 ++++-
9 files changed, 162 insertions(+), 32 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1,2 +1,4 @@ documents/ parsed/ +sites/ +database/ diff --git a/collection/spider.py b/collection/spider.py @@ -1,12 +1,35 @@ import requests +import time +import os +import datetime +import uuid +from urllib.parse import urljoin, urlparse from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup import sys import base64 +import sqlite3 + +# Layout: +# - sites + # - date + # - each file is one site with a UUID as the filename +# - database + # - manifest.db + # this is the database that maps UUIDs with urls and dates + # the urls might not be unique as we could have multiple copies of the same site from different times + # no, this is not 3nf, no I don't care, this is faster. + # tables: + # site + # url, filepath, date + # + # bytes -MAX_SIZE = 1_000_000 -MAX_WORKERS = 1000 +MAX_SIZE = 2_000_000 +MAX_WORKERS = 5 +MAX_URLS_PER_LEVEL = 100_000 +MAX_URLS_PER_SITE = 500 def url_to_filename(url): return base64.urlsafe_b64encode(url.encode()).decode() + ".html" @@ -20,25 +43,41 @@ def search_url(url, filepath): 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0', } try: - source_code = requests.get(url, headers=headers, timeout=1) + source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory soup = BeautifulSoup(source_code.content, 'html.parser') content = soup.prettify() if len(content.encode('utf-8')) < MAX_SIZE: with open(filepath, 'w') as f: f.write(content) + print(f'Wrote {url} to {filepath}') else: - print(f'skipping {url}, too large') + print(f'skipping fs write for {url}, too large') except Exception as e: print(e) - return url, [] + return "", "", [] + + base_domain = urlparse(url).netloc for link in soup.find_all('a', href=True): - if link.get('href').startswith('https://'): - links.append(link.get('href')) - return url, links + href = link.get('href') + absolute_url = urljoin(url, href) + parsed = 
urlparse(absolute_url) + if parsed.scheme in ('http', 'https') and parsed.netloc == base_domain: + if len(links) < MAX_URLS_PER_SITE: + links.append(absolute_url) + + return filepath, url, links if __name__ == "__main__": seed_filename = sys.argv[1] + con = sqlite3.connect('database/manifest.db', timeout=60) + con.execute('PRAGMA journal_mode=WAL') + cur = con.cursor() + + cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date)") + cur.execute("CREATE INDEX IF NOT EXISTS idx_site_url ON site(url)") + cur.execute("CREATE INDEX IF NOT EXISTS idx_site_filepath ON site(filepath)") + urls = [] with open(seed_filename, 'r') as f: urls = f.readlines() @@ -48,25 +87,47 @@ if __name__ == "__main__": searched = set() - save_location = sys.argv[2] + save_location = 'sites/' depth = 0 - max_depth = int(sys.argv[3]) + max_depth = int(sys.argv[2]) + # TODO: How can we only search sites we haven't seen recently? while len(urls) != 0 and depth < max_depth: + print(f"Depth {depth}: processing {len(urls)} URLs") next_urls = set() with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: + now = datetime.datetime.now() + pth = save_location + now.strftime("%Y-%m-%d") + "/" + os.makedirs(pth, exist_ok=True) futures = { - executor.submit(search_url, url, save_location + url_to_filename(url)): url + executor.submit(search_url, url, pth+ str(uuid.uuid4())): url for url in urls } for future in as_completed(futures): - url, links = future.result() - searched.add(url) + filepath, url, links = future.result() + for link in links: - if link not in searched and link not in urls: + if link not in searched and link not in urls and len(next_urls) < MAX_URLS_PER_LEVEL: next_urls.add(link) + + time = datetime.datetime.now().timestamp() + if filepath != '' and url != '': + cur.execute(""" + INSERT INTO site VALUES (?, ?, ?) 
+ """, (url, filepath, time)) + con.commit() - urls = next_urls + one_day_ago = (datetime.datetime.now() - datetime.timedelta(days=1)).timestamp() + fresh_urls = [] + for url in next_urls: + cur.execute("SELECT 1 FROM site WHERE url = ? AND date > ?", (url, one_day_ago)) + if cur.fetchone() is None: + fresh_urls.append(url) + else: + print('Already traversed today, skipping') + + urls = fresh_urls # haven't traversed in the last day. + depth += 1 diff --git a/metrics/tf-idf.py b/metrics/tf-idf.py @@ -1,4 +1,5 @@ import os +import random import math import re import sys @@ -15,10 +16,16 @@ def get_words(filename): final.append(lines[i].lower()) return final -def tfs(prefix, filenames, word): +def tfs(prefix, filenames, word, to_sample=-1): tfs = {} - for filename in filenames: - tfs[filename] = tf(prefix, filename, word) + + if to_sample == -1: + for filename in filenames: + tfs[filename] = tf(prefix, filename, word) + else: + for _ in range(0, to_sample): + filename = random.choice(filenames) + tfs[filename] = tf(prefix, filename, word) return tfs def tf(prefix, filename, word): @@ -47,33 +54,65 @@ def idf(prefix, filenames): idf[word] = math.log(len(filenames) / idf[word]) return idf +def idf_word(prefix, filenames, word, to_sample=-1): + + frequency = 0 + sampled = 0 + + if to_sample == -1: + sampled = len(filenames) + for filename in filenames: + words = get_words(prefix + filename) + if word in words: + frequency += 1 + else: + for _ in range(0,to_sample): + sampled = to_sample + filename = random.choice(filenames) + words = get_words(prefix + filename) + if word in words: + frequency += 1 + if frequency != 0: + idf = math.log + + if frequency != 0: + idf = math.log(sampled / frequency) + else: + idf = 0.0 + return idf + + + + if __name__ == "__main__": + document_directory = sys.argv[1] + if(document_directory[-1] != '/'): + document_directory += '/' + user_input = True # continually prompt if the user is in interactive mode while user_input: word = "" top_k = 
1 - if len(sys.argv) == 2: - word = sys.argv[1] - top_k = int(sys.argv[2]) + if len(sys.argv) == 4: + word = sys.argv[2] + top_k = int(sys.argv[3]) user_input = False else: user_input = True word = input("Word to find: ") top_k = int(input("Top k elements to show: ")) - filenames = os.listdir('parsed') - idf_dict = idf('parsed/', filenames) - - if word not in idf_dict: - print('Word does not appear in any documents') - exit() + filenames = os.listdir(document_directory) + print('calculating idf') + idf_of_word = idf_word(document_directory, filenames, word, 1000) - tf_dict = tfs('parsed/', filenames, word) + print('calculating tf') + tf_dict = tfs(document_directory, filenames, word, 1000) tfidf = {} - for filename in filenames: - tfidf[filename] = idf_dict[word] * tf_dict[filename] + for filename in tf_dict: + tfidf[filename] = idf_of_word * tf_dict[filename] sorted_items = sorted(tfidf.items(), key=lambda kv: (kv[1], kv[0])) sorted_items.reverse() diff --git a/seeds/code.txt b/seeds/code.txt @@ -3,3 +3,5 @@ https://codeberg.org/ https://about.gitlab.com/ https://github.com/ https://github.com/topics/awesome +https://www.reddit.com/r/programming/ +https://www.reddit.com/r/ProgrammingLanguages/ diff --git a/seeds/music.txt b/seeds/music.txt @@ -0,0 +1,5 @@ +https://open.spotify.com/ +https://www.tunecore.com/ +https://www.allmusic.com/ +https://musicbrainz.org/ +https://www.google.com/search?q=music+indexing+sites diff --git a/seeds/otr.txt b/seeds/otr.txt @@ -0,0 +1,8 @@ +https://laack.co +https://arstechnica.com +https://geohot.github.io/blog +https://suckless.org +https://blog.laack.co +https://stevana.github.io +https://lukesmith.xyz +https://github.com/sindresorhus/awesome diff --git a/seeds/piracy.txt b/seeds/piracy.txt @@ -0,0 +1,6 @@ +https://annas-archive.org/ +https://libgen.ac/ +https://forum.mobilism.me/ +https://github.com/Igglybuff/awesome-piracy 
+https://www.reddit.com/r/CuratedTumblr/comments/1e63sew/for_those_too_lazy_to_check_the_rpiracy_megathread/ +https://sci-hub.se/ diff --git a/seeds/research.txt b/seeds/research.txt @@ -1,3 +1,7 @@ +https://xlinux.nist.gov/dads/ +https://thimbleby.gitlab.io/algorithm-wiki-site/ +https://www.kaggle.com/ https://arxiv.org/ https://research.com/journals-rankings/computer-science -https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:llm +https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=eng +https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=phy diff --git a/seeds/wikis.txt b/seeds/wikis.txt @@ -1,3 +1,6 @@ -https://wikipedia.org +https://en.wikipedia.org/wiki/Main_Page https://archlinux.org/ https://wiki.ubuntu.com/ +https://repair.wiki/w/Main_Page#gsc.tab=0 +https://stackoverflow.com +https://stackexchange.com