Improved crawling - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit 1b5bb8f087a72d206aa5b50669676e519509a755
parent 6d3b8448a30a10843c1f3d016516729060da39ad
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Wed, 31 Dec 2025 00:18:13 -0600

Improved crawling

Diffstat:
M .gitignore  | 2 ++
M collection/spider.py  | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M metrics/tf-idf.py  | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M seeds/code.txt  | 2 ++
A seeds/music.txt  | 5 +++++
A seeds/otr.txt  | 8 ++++++++
A seeds/piracy.txt  | 6 ++++++
M seeds/research.txt  | 6 +++++-
M seeds/wikis.txt  | 5 ++++-

9 files changed, 162 insertions(+), 32 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 documents/
 parsed/
+sites/
+database/
diff --git a/collection/spider.py b/collection/spider.py
@@ -1,12 +1,35 @@
 import requests
+import time
+import os
+import datetime
+import uuid
+from urllib.parse import urljoin, urlparse
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
 import sys
 import base64
+import sqlite3
+
+# Layout:
+# - sites
+    # - date
+        # - each file is one site with a UUID as the filename
+# - database
+    # - manifest.db
+        # this is the database that maps UUIDs with urls and dates
+        # the urls might not be unique as we could have multiple copies of the same site from different times
+            # no, this is not 3nf, no I don't care, this is faster.
+        # tables:
+            # site
+                # url, filepath, date
+            # 
+
 
 # bytes
-MAX_SIZE = 1_000_000
-MAX_WORKERS = 1000
+MAX_SIZE = 2_000_000
+MAX_WORKERS = 5
+MAX_URLS_PER_LEVEL = 100_000
+MAX_URLS_PER_SITE = 500
 
 def url_to_filename(url):
     return base64.urlsafe_b64encode(url.encode()).decode() + ".html"
@@ -20,25 +43,41 @@ def search_url(url, filepath):
         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0',
     }
     try:
-        source_code = requests.get(url, headers=headers, timeout=1)
+        source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory
         soup = BeautifulSoup(source_code.content, 'html.parser')
         content = soup.prettify()
 
         if len(content.encode('utf-8')) < MAX_SIZE:
             with open(filepath, 'w') as f:
                 f.write(content)
+                print(f'Wrote {url} to {filepath}')
         else:
-            print(f'skipping {url}, too large')
+            print(f'skipping fs write for {url}, too large')
     except Exception as e:
         print(e)
-        return url, []
+        return "", "", []
+
+    base_domain = urlparse(url).netloc
     for link in soup.find_all('a', href=True):
-        if link.get('href').startswith('https://'):
-            links.append(link.get('href'))
-    return url, links
+            href = link.get('href')
+            absolute_url = urljoin(url, href)
+            parsed = urlparse(absolute_url)
+            if parsed.scheme in ('http', 'https') and parsed.netloc == base_domain:
+                if len(links) < MAX_URLS_PER_SITE:
+                    links.append(absolute_url)
+
+    return filepath, url, links
 
 if __name__ == "__main__":
     seed_filename = sys.argv[1]
+    con = sqlite3.connect('database/manifest.db', timeout=60)
+    con.execute('PRAGMA journal_mode=WAL')
+    cur = con.cursor()
+
+    cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_site_url ON site(url)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_site_filepath ON site(filepath)")
+
     urls = []
     with open(seed_filename, 'r') as f:
         urls = f.readlines()
@@ -48,25 +87,47 @@ if __name__ == "__main__":
 
     searched = set()
 
-    save_location = sys.argv[2]
+    save_location = 'sites/'
     depth = 0
-    max_depth = int(sys.argv[3])
+    max_depth = int(sys.argv[2])
 
+    # TODO: How can we only search sites we haven't seen recently?
     while len(urls) != 0 and depth < max_depth:
+
         print(f"Depth {depth}: processing {len(urls)} URLs")
         next_urls = set()
         with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            now = datetime.datetime.now()
+            pth = save_location + now.strftime("%Y-%m-%d") + "/" 
+            os.makedirs(pth, exist_ok=True)
             futures = {
-                executor.submit(search_url, url, save_location + url_to_filename(url)): url
+                executor.submit(search_url, url, pth+ str(uuid.uuid4())): url
                 for url in urls
             }
             
             for future in as_completed(futures):
-                url, links = future.result()
-                searched.add(url)
+                filepath, url, links = future.result()
+
                 for link in links:
-                    if link not in searched and link not in urls:
+                    if link not in searched and link not in urls and len(next_urls) < MAX_URLS_PER_LEVEL:
                         next_urls.add(link)
+
+                time = datetime.datetime.now().timestamp()
+                if filepath != '' and url != '':
+                    cur.execute("""
+                        INSERT INTO site VALUES (?, ?, ?)
+                    """, (url, filepath, time))
+                    con.commit()
         
-        urls = next_urls
+        one_day_ago = (datetime.datetime.now() - datetime.timedelta(days=1)).timestamp()
+        fresh_urls = []
+        for url in next_urls:
+            cur.execute("SELECT 1 FROM site WHERE url = ? AND date > ?", (url, one_day_ago))
+            if cur.fetchone() is None:
+                fresh_urls.append(url)
+            else:
+                print('Already traversed today, skipping')
+
+        urls = fresh_urls # haven't traversed in the last day.
+
         depth += 1
diff --git a/metrics/tf-idf.py b/metrics/tf-idf.py
@@ -1,4 +1,5 @@
 import os
+import random
 import math
 import re
 import sys
@@ -15,10 +16,16 @@ def get_words(filename):
             final.append(lines[i].lower())
     return final
 
-def tfs(prefix, filenames, word):
+def tfs(prefix, filenames, word, to_sample=-1):
     tfs = {}
-    for filename in filenames:
-        tfs[filename] = tf(prefix, filename, word)
+
+    if to_sample == -1:
+        for filename in filenames:
+            tfs[filename] = tf(prefix, filename, word)
+    else:
+        for _ in range(0, to_sample):
+            filename = random.choice(filenames)
+            tfs[filename] = tf(prefix, filename, word)
     return tfs
 
 def tf(prefix, filename, word):
@@ -47,33 +54,65 @@ def idf(prefix, filenames):
         idf[word] = math.log(len(filenames) / idf[word])
     return idf
 
+def idf_word(prefix, filenames, word, to_sample=-1):
+
+    frequency = 0
+    sampled = 0
+
+    if to_sample == -1:
+        sampled = len(filenames)
+        for filename in filenames:
+            words = get_words(prefix + filename)
+            if word in words:
+                frequency += 1
+    else:
+        for _ in range(0,to_sample):
+            sampled = to_sample
+            filename = random.choice(filenames)
+            words = get_words(prefix + filename)
+            if word in words:
+                frequency += 1
+            if frequency != 0:
+                idf = math.log
+
+    if frequency != 0:
+        idf = math.log(sampled / frequency)
+    else:
+        idf = 0.0
+    return idf
+
+
+
+
 if __name__ == "__main__":
+    document_directory = sys.argv[1]
+    if(document_directory[-1] != '/'):
+        document_directory += '/'
+
     user_input = True # continually prompt if the user is in interactive mode
     while user_input:
         word = ""
         top_k = 1
-        if len(sys.argv) == 2:
-            word = sys.argv[1]
-            top_k = int(sys.argv[2])
+        if len(sys.argv) == 4:
+            word = sys.argv[2]
+            top_k = int(sys.argv[3])
             user_input = False
         else:
             user_input = True
             word = input("Word to find: ")
             top_k = int(input("Top k elements to show: "))
 
-        filenames = os.listdir('parsed')
-        idf_dict = idf('parsed/', filenames)
-
-        if word not in idf_dict:
-            print('Word does not appear in any documents')
-            exit()
+        filenames = os.listdir(document_directory)
+        print('calculating idf')
+        idf_of_word = idf_word(document_directory, filenames, word, 1000)
 
-        tf_dict = tfs('parsed/', filenames, word)
+        print('calculating tf')
+        tf_dict = tfs(document_directory, filenames, word, 1000)
 
         tfidf = {}
 
-        for filename in filenames:
-            tfidf[filename] = idf_dict[word] * tf_dict[filename]
+        for filename in tf_dict:
+            tfidf[filename] = idf_of_word * tf_dict[filename]
 
         sorted_items = sorted(tfidf.items(), key=lambda kv: (kv[1], kv[0]))
         sorted_items.reverse()
diff --git a/seeds/code.txt b/seeds/code.txt
@@ -3,3 +3,5 @@ https://codeberg.org/
 https://about.gitlab.com/
 https://github.com/
 https://github.com/topics/awesome
+https://www.reddit.com/r/programming/
+https://www.reddit.com/r/ProgrammingLanguages/
diff --git a/seeds/music.txt b/seeds/music.txt
@@ -0,0 +1,5 @@
+https://open.spotify.com/
+https://www.tunecore.com/
+https://www.allmusic.com/
+https://musicbrainz.org/
+https://www.google.com/search?q=music+indexing+sites
diff --git a/seeds/otr.txt b/seeds/otr.txt
@@ -0,0 +1,8 @@
+https://laack.co
+https://arstechnica.com
+https://geohot.github.io/blog
+https://suckless.org
+https://blog.laack.co
+https://stevana.github.io
+https://lukesmith.xyz
+https://github.com/sindresorhus/awesome
diff --git a/seeds/piracy.txt b/seeds/piracy.txt
@@ -0,0 +1,6 @@
+https://annas-archive.org/
+https://libgen.ac/
+https://forum.mobilism.me/
+https://github.com/Igglybuff/awesome-piracy
+https://www.reddit.com/r/CuratedTumblr/comments/1e63sew/for_those_too_lazy_to_check_the_rpiracy_megathread/
+https://sci-hub.se/
diff --git a/seeds/research.txt b/seeds/research.txt
@@ -1,3 +1,7 @@
+https://xlinux.nist.gov/dads/
+https://thimbleby.gitlab.io/algorithm-wiki-site/
+https://www.kaggle.com/
 https://arxiv.org/
 https://research.com/journals-rankings/computer-science
-https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:llm
+https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=eng
+https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=phy
diff --git a/seeds/wikis.txt b/seeds/wikis.txt
@@ -1,3 +1,6 @@
-https://wikipedia.org
+https://en.wikipedia.org/wiki/Main_Page
 https://archlinux.org/
 https://wiki.ubuntu.com/
+https://repair.wiki/w/Main_Page#gsc.tab=0
+https://stackoverflow.com
+https://stackexchange.com

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log | Files | Refs

M	.gitignore	|	2	++
M	collection/spider.py	|	91	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M	metrics/tf-idf.py	|	69	++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M	seeds/code.txt	|	2	++
A	seeds/music.txt	|	5	+++++
A	seeds/otr.txt	|	8	++++++++
A	seeds/piracy.txt	|	6	++++++
M	seeds/research.txt	|	6	+++++-
M	seeds/wikis.txt	|	5	++++-