commit 6d3b8448a30a10843c1f3d016516729060da39ad
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Tue, 30 Dec 2025 17:17:12 -0600
Implemented tf-idf and created a basic spider
Diffstat:
6 files changed, 167 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+documents/
+parsed/
diff --git a/collection/spider.py b/collection/spider.py
@@ -0,0 +1,72 @@
+import requests
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from bs4 import BeautifulSoup
+import sys
+import base64
+
+# Size limit in bytes: search_url() only saves a page whose prettified HTML,
+# encoded as UTF-8, is smaller than this.
+MAX_SIZE = 1_000_000
+# max_workers value handed to the ThreadPoolExecutor that fetches each depth.
+MAX_WORKERS = 1000
+
+def url_to_filename(url):
+ return base64.urlsafe_b64encode(url.encode()).decode() + ".html"
+
+def filename_to_url(filename):
+ return base64.urlsafe_b64decode(filename[:-5]).decode()
+
+def search_url(url, filepath):
+ links = []
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0',
+ }
+ try:
+ source_code = requests.get(url, headers=headers, timeout=1)
+ soup = BeautifulSoup(source_code.content, 'html.parser')
+ content = soup.prettify()
+
+ if len(content.encode('utf-8')) < MAX_SIZE:
+ with open(filepath, 'w') as f:
+ f.write(content)
+ else:
+ print(f'skipping {url}, too large')
+ except Exception as e:
+ print(e)
+ return url, []
+ for link in soup.find_all('a', href=True):
+ if link.get('href').startswith('https://'):
+ links.append(link.get('href'))
+ return url, links
+
+if __name__ == "__main__":
+ seed_filename = sys.argv[1]
+ urls = []
+ with open(seed_filename, 'r') as f:
+ urls = f.readlines()
+ for i in range(len(urls)):
+ urls[i] = urls[i].strip()
+ print(urls)
+
+ searched = set()
+
+ save_location = sys.argv[2]
+ depth = 0
+ max_depth = int(sys.argv[3])
+
+ while len(urls) != 0 and depth < max_depth:
+ print(f"Depth {depth}: processing {len(urls)} URLs")
+ next_urls = set()
+ with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+ futures = {
+ executor.submit(search_url, url, save_location + url_to_filename(url)): url
+ for url in urls
+ }
+
+ for future in as_completed(futures):
+ url, links = future.result()
+ searched.add(url)
+ for link in links:
+ if link not in searched and link not in urls:
+ next_urls.add(link)
+
+ urls = next_urls
+ depth += 1
diff --git a/metrics/tf-idf.py b/metrics/tf-idf.py
@@ -0,0 +1,82 @@
+import os
+import math
+import re
+import sys
+
+# get all words
+def get_words(filename):
+ with open(filename, 'r') as f:
+ lines = f.read()
+ lines = re.sub('[^0-9a-zA-Z]+', ' ', lines)
+ lines = lines.split(' ')
+ final = []
+ for i in range(0, len(lines)):
+ if lines[i] != '':
+ final.append(lines[i].lower())
+ return final
+
+def tfs(prefix, filenames, word):
+ tfs = {}
+ for filename in filenames:
+ tfs[filename] = tf(prefix, filename, word)
+ return tfs
+
+def tf(prefix, filename, word):
+ words = get_words(prefix + filename)
+ count_of_word = 0
+ for cw in words:
+ if cw == word:
+ count_of_word += 1
+ if len(words) != 0:
+ return count_of_word / len(words)
+ return 0 # empty documents
+
+
+# technically, we might just want the one word and output the value for that
+def idf(prefix, filenames):
+ word_document_frequency = {}
+ for filename in filenames:
+ words = get_words(prefix + filename)
+ for word in words:
+ if word in word_document_frequency:
+ word_document_frequency[word] += 1
+ else:
+ word_document_frequency[word] = 1
+ idf = word_document_frequency.copy()
+ for word in idf:
+ idf[word] = math.log(len(filenames) / idf[word])
+ return idf
+
+if __name__ == "__main__":
+ user_input = True # continually prompt if the user is in interactive mode
+ while user_input:
+ word = ""
+ top_k = 1
+ if len(sys.argv) == 2:
+ word = sys.argv[1]
+ top_k = int(sys.argv[2])
+ user_input = False
+ else:
+ user_input = True
+ word = input("Word to find: ")
+ top_k = int(input("Top k elements to show: "))
+
+ filenames = os.listdir('parsed')
+ idf_dict = idf('parsed/', filenames)
+
+ if word not in idf_dict:
+ print('Word does not appear in any documents')
+ exit()
+
+ tf_dict = tfs('parsed/', filenames, word)
+
+ tfidf = {}
+
+ for filename in filenames:
+ tfidf[filename] = idf_dict[word] * tf_dict[filename]
+
+ sorted_items = sorted(tfidf.items(), key=lambda kv: (kv[1], kv[0]))
+ sorted_items.reverse()
+
+ for i in range(top_k):
+ print(sorted_items[i])
diff --git a/seeds/code.txt b/seeds/code.txt
@@ -0,0 +1,5 @@
+https://github.com/trending
+https://codeberg.org/
+https://about.gitlab.com/
+https://github.com/
+https://github.com/topics/awesome
diff --git a/seeds/research.txt b/seeds/research.txt
@@ -0,0 +1,3 @@
+https://arxiv.org/
+https://research.com/journals-rankings/computer-science
+https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:llm
diff --git a/seeds/wikis.txt b/seeds/wikis.txt
@@ -0,0 +1,3 @@
+https://wikipedia.org
+https://archlinux.org/
+https://wiki.ubuntu.com/