commit 1b5bb8f087a72d206aa5b50669676e519509a755
parent 6d3b8448a30a10843c1f3d016516729060da39ad
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Wed, 31 Dec 2025 00:18:13 -0600
Improved crawling
Diffstat:
9 files changed, 162 insertions(+), 32 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
documents/
parsed/
+sites/
+database/
diff --git a/collection/spider.py b/collection/spider.py
@@ -1,12 +1,35 @@
import requests
+import time
+import os
+import datetime
+import uuid
+from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
import sys
import base64
+import sqlite3
+
+# Layout:
+# - sites
+ # - date
+ # - each file is one site with a UUID as the filename
+# - database
+ # - manifest.db
+ # this is the database that maps UUIDs to URLs and dates
+ # the URLs might not be unique, as we could have multiple copies of the same site from different times
+ # no, this is not 3NF; no, I don't care; this is faster.
+ # tables:
+ # site
+ # url, filepath, date
+ #
+
# bytes
-MAX_SIZE = 1_000_000
-MAX_WORKERS = 1000
+MAX_SIZE = 2_000_000
+MAX_WORKERS = 5
+MAX_URLS_PER_LEVEL = 100_000
+MAX_URLS_PER_SITE = 500
def url_to_filename(url):
return base64.urlsafe_b64encode(url.encode()).decode() + ".html"
@@ -20,25 +43,41 @@ def search_url(url, filepath):
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0',
}
try:
- source_code = requests.get(url, headers=headers, timeout=1)
+ source_code = requests.get(url, headers=headers, timeout=1) # NOTE: timeout bounds connect/read waits only; it does not cap the response size held in memory
soup = BeautifulSoup(source_code.content, 'html.parser')
content = soup.prettify()
if len(content.encode('utf-8')) < MAX_SIZE:
with open(filepath, 'w') as f:
f.write(content)
+ print(f'Wrote {url} to {filepath}')
else:
- print(f'skipping {url}, too large')
+ print(f'skipping fs write for {url}, too large')
except Exception as e:
print(e)
- return url, []
+ return "", "", []
+
+ base_domain = urlparse(url).netloc
for link in soup.find_all('a', href=True):
- if link.get('href').startswith('https://'):
- links.append(link.get('href'))
- return url, links
+ href = link.get('href')
+ absolute_url = urljoin(url, href)
+ parsed = urlparse(absolute_url)
+ if parsed.scheme in ('http', 'https') and parsed.netloc == base_domain:
+ if len(links) < MAX_URLS_PER_SITE:
+ links.append(absolute_url)
+
+ return filepath, url, links
if __name__ == "__main__":
seed_filename = sys.argv[1]
+ con = sqlite3.connect('database/manifest.db', timeout=60)
+ con.execute('PRAGMA journal_mode=WAL')
+ cur = con.cursor()
+
+ cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date)")
+ cur.execute("CREATE INDEX IF NOT EXISTS idx_site_url ON site(url)")
+ cur.execute("CREATE INDEX IF NOT EXISTS idx_site_filepath ON site(filepath)")
+
urls = []
with open(seed_filename, 'r') as f:
urls = f.readlines()
@@ -48,25 +87,47 @@ if __name__ == "__main__":
searched = set()
- save_location = sys.argv[2]
+ save_location = 'sites/'
depth = 0
- max_depth = int(sys.argv[3])
+ max_depth = int(sys.argv[2])
+ # TODO: How can we only search sites we haven't seen recently?
while len(urls) != 0 and depth < max_depth:
+
print(f"Depth {depth}: processing {len(urls)} URLs")
next_urls = set()
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+ now = datetime.datetime.now()
+ pth = save_location + now.strftime("%Y-%m-%d") + "/"
+ os.makedirs(pth, exist_ok=True)
futures = {
- executor.submit(search_url, url, save_location + url_to_filename(url)): url
+ executor.submit(search_url, url, pth+ str(uuid.uuid4())): url
for url in urls
}
for future in as_completed(futures):
- url, links = future.result()
- searched.add(url)
+ filepath, url, links = future.result()
+
for link in links:
- if link not in searched and link not in urls:
+ if link not in searched and link not in urls and len(next_urls) < MAX_URLS_PER_LEVEL:
next_urls.add(link)
+
+ time = datetime.datetime.now().timestamp()
+ if filepath != '' and url != '':
+ cur.execute("""
+ INSERT INTO site VALUES (?, ?, ?)
+ """, (url, filepath, time))
+ con.commit()
- urls = next_urls
+ one_day_ago = (datetime.datetime.now() - datetime.timedelta(days=1)).timestamp()
+ fresh_urls = []
+ for url in next_urls:
+ cur.execute("SELECT 1 FROM site WHERE url = ? AND date > ?", (url, one_day_ago))
+ if cur.fetchone() is None:
+ fresh_urls.append(url)
+ else:
+ print('Already traversed today, skipping')
+
+ urls = fresh_urls # haven't traversed in the last day.
+
depth += 1
diff --git a/metrics/tf-idf.py b/metrics/tf-idf.py
@@ -1,4 +1,5 @@
import os
+import random
import math
import re
import sys
@@ -15,10 +16,16 @@ def get_words(filename):
final.append(lines[i].lower())
return final
-def tfs(prefix, filenames, word):
+def tfs(prefix, filenames, word, to_sample=-1):
tfs = {}
- for filename in filenames:
- tfs[filename] = tf(prefix, filename, word)
+
+ if to_sample == -1:
+ for filename in filenames:
+ tfs[filename] = tf(prefix, filename, word)
+ else:
+ for _ in range(0, to_sample):
+ filename = random.choice(filenames)
+ tfs[filename] = tf(prefix, filename, word)
return tfs
def tf(prefix, filename, word):
@@ -47,33 +54,65 @@ def idf(prefix, filenames):
idf[word] = math.log(len(filenames) / idf[word])
return idf
+def idf_word(prefix, filenames, word, to_sample=-1):
+    """Return the inverse document frequency of ``word``.
+
+    Each name in ``filenames`` is appended to ``prefix`` and read with
+    ``get_words``. When ``to_sample`` is -1 every file is scanned; otherwise
+    ``to_sample`` files are drawn at random (with replacement) and the IDF is
+    estimated from that sample.
+
+    Returns ``math.log(scanned / matches)``, or ``0.0`` when no scanned file
+    contains ``word`` (including when there is nothing to scan).
+    """
+    if to_sample == -1:
+        candidates = filenames
+    elif filenames:
+        candidates = [random.choice(filenames) for _ in range(to_sample)]
+    else:
+        # random.choice() raises IndexError on an empty sequence; an empty
+        # corpus simply has no matches.
+        candidates = []
+
+    # Number of scanned documents that contain the word at least once.
+    frequency = sum(
+        1 for filename in candidates if word in get_words(prefix + filename)
+    )
+    if frequency == 0:
+        return 0.0
+    return math.log(len(candidates) / frequency)
+
+
+
+
if __name__ == "__main__":
+ document_directory = sys.argv[1]
+ if(document_directory[-1] != '/'):
+ document_directory += '/'
+
user_input = True # continually prompt if the user is in interactive mode
while user_input:
word = ""
top_k = 1
- if len(sys.argv) == 2:
- word = sys.argv[1]
- top_k = int(sys.argv[2])
+ if len(sys.argv) == 4:
+ word = sys.argv[2]
+ top_k = int(sys.argv[3])
user_input = False
else:
user_input = True
word = input("Word to find: ")
top_k = int(input("Top k elements to show: "))
- filenames = os.listdir('parsed')
- idf_dict = idf('parsed/', filenames)
-
- if word not in idf_dict:
- print('Word does not appear in any documents')
- exit()
+ filenames = os.listdir(document_directory)
+ print('calculating idf')
+ idf_of_word = idf_word(document_directory, filenames, word, 1000)
- tf_dict = tfs('parsed/', filenames, word)
+ print('calculating tf')
+ tf_dict = tfs(document_directory, filenames, word, 1000)
tfidf = {}
- for filename in filenames:
- tfidf[filename] = idf_dict[word] * tf_dict[filename]
+ for filename in tf_dict:
+ tfidf[filename] = idf_of_word * tf_dict[filename]
sorted_items = sorted(tfidf.items(), key=lambda kv: (kv[1], kv[0]))
sorted_items.reverse()
diff --git a/seeds/code.txt b/seeds/code.txt
@@ -3,3 +3,5 @@ https://codeberg.org/
https://about.gitlab.com/
https://github.com/
https://github.com/topics/awesome
+https://www.reddit.com/r/programming/
+https://www.reddit.com/r/ProgrammingLanguages/
diff --git a/seeds/music.txt b/seeds/music.txt
@@ -0,0 +1,5 @@
+https://open.spotify.com/
+https://www.tunecore.com/
+https://www.allmusic.com/
+https://musicbrainz.org/
+https://www.google.com/search?q=music+indexing+sites
diff --git a/seeds/otr.txt b/seeds/otr.txt
@@ -0,0 +1,8 @@
+https://laack.co
+https://arstechnica.com
+https://geohot.github.io/blog
+https://suckless.org
+https://blog.laack.co
+https://stevana.github.io
+https://lukesmith.xyz
+https://github.com/sindresorhus/awesome
diff --git a/seeds/piracy.txt b/seeds/piracy.txt
@@ -0,0 +1,6 @@
+https://annas-archive.org/
+https://libgen.ac/
+https://forum.mobilism.me/
+https://github.com/Igglybuff/awesome-piracy
+https://www.reddit.com/r/CuratedTumblr/comments/1e63sew/for_those_too_lazy_to_check_the_rpiracy_megathread/
+https://sci-hub.se/
diff --git a/seeds/research.txt b/seeds/research.txt
@@ -1,3 +1,7 @@
+https://xlinux.nist.gov/dads/
+https://thimbleby.gitlab.io/algorithm-wiki-site/
+https://www.kaggle.com/
https://arxiv.org/
https://research.com/journals-rankings/computer-science
-https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:llm
+https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=eng
+https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=phy
diff --git a/seeds/wikis.txt b/seeds/wikis.txt
@@ -1,3 +1,6 @@
-https://wikipedia.org
+https://en.wikipedia.org/wiki/Main_Page
https://archlinux.org/
https://wiki.ubuntu.com/
+https://repair.wiki/w/Main_Page#gsc.tab=0
+https://stackoverflow.com
+https://stackexchange.com