Implemented tf-idf and created a basic spider - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit 6d3b8448a30a10843c1f3d016516729060da39ad
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Tue, 30 Dec 2025 17:17:12 -0600

Implemented tf-idf and created a basic spider

Diffstat:
A .gitignore  | 2 ++
A collection/spider.py  | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A metrics/tf-idf.py  | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A seeds/code.txt  | 5 +++++
A seeds/research.txt  | 3 +++
A seeds/wikis.txt  | 3 +++

6 files changed, 167 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+documents/
+parsed/
diff --git a/collection/spider.py b/collection/spider.py
@@ -0,0 +1,72 @@
+import requests
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from bs4 import BeautifulSoup
+import sys
+import base64
+
+# Size limit in bytes: search_url only saves a page whose prettified HTML,
+# encoded as UTF-8, is smaller than this.
+MAX_SIZE = 1_000_000
+# Passed as max_workers to the ThreadPoolExecutor that fetches each depth level.
+MAX_WORKERS = 1000
+
+def url_to_filename(url):
+    """Return the file name under which the page at *url* is stored.
+
+    The name is the URL-safe base64 encoding of the URL followed by
+    ".html", so it can be turned back into the URL by filename_to_url.
+    """
+    encoded = base64.urlsafe_b64encode(url.encode())
+    return f"{encoded.decode()}.html"
+
+def filename_to_url(filename):
+    """Return the URL encoded in a file name produced by url_to_filename.
+
+    The last five characters (the ".html" suffix) are dropped and the rest
+    is decoded as URL-safe base64.
+    """
+    stem = filename[:-len(".html")]
+    return base64.urlsafe_b64decode(stem).decode()
+
+def search_url(url, filepath):
+    """Fetch *url*, save its prettified HTML to *filepath*, and collect links.
+
+    The page is only written when its prettified, UTF-8 encoded form is
+    smaller than MAX_SIZE bytes; larger pages are skipped with a message
+    but their links are still returned.
+
+    Returns a ``(url, links)`` tuple where *links* is the list of ``href``
+    values of ``<a>`` tags that start with ``https://``. Any exception
+    raised while fetching, parsing or writing is printed and results in
+    ``(url, [])`` so one bad page does not stop the crawl.
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0',
+    }
+    try:
+        source_code = requests.get(url, headers=headers, timeout=1)
+        soup = BeautifulSoup(source_code.content, 'html.parser')
+        content = soup.prettify()
+
+        if len(content.encode('utf-8')) < MAX_SIZE:
+            # Write with the same encoding the size check uses. Relying on
+            # the platform default can raise UnicodeEncodeError on
+            # non-UTF-8 locales, which would drop the page and its links.
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(content)
+        else:
+            print(f'skipping {url}, too large')
+    except Exception as e:
+        # Best-effort crawl: report the failure and return no links.
+        print(e)
+        return url, []
+    links = [
+        link['href']
+        for link in soup.find_all('a', href=True)
+        if link['href'].startswith('https://')
+    ]
+    return url, links
+
+if __name__ == "__main__":
+    # Imported here because this script block is the only user of os.
+    import os
+
+    # Usage: spider.py <seed_file> <save_location> <max_depth>
+    if len(sys.argv) < 4:
+        sys.exit(f"usage: {sys.argv[0]} <seed_file> <save_location> <max_depth>")
+    seed_filename = sys.argv[1]
+    save_location = sys.argv[2]
+    max_depth = int(sys.argv[3])
+
+    with open(seed_filename, 'r') as f:
+        # Drop blank lines (they are not URLs) and duplicate seeds while
+        # keeping the order in which the seeds were listed.
+        urls = list(dict.fromkeys(line.strip() for line in f if line.strip()))
+    print(urls)
+
+    # The output directory may not exist yet (it is git-ignored); without it
+    # every page write would fail and the crawl would find no links.
+    os.makedirs(save_location, exist_ok=True)
+
+    searched = set()
+    depth = 0
+
+    # Breadth-first crawl: each pass fetches one depth level in parallel and
+    # gathers the not-yet-seen links for the next level.
+    while len(urls) != 0 and depth < max_depth:
+        print(f"Depth {depth}: processing {len(urls)} URLs")
+        current_urls = set(urls)  # O(1) membership tests for this level
+        next_urls = set()
+        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            futures = {
+                executor.submit(
+                    search_url,
+                    url,
+                    # join works with or without a trailing slash
+                    os.path.join(save_location, url_to_filename(url)),
+                ): url
+                for url in urls
+            }
+
+            for future in as_completed(futures):
+                url, links = future.result()
+                searched.add(url)
+                for link in links:
+                    if link not in searched and link not in current_urls:
+                        next_urls.add(link)
+
+        urls = next_urls
+        depth += 1
diff --git a/metrics/tf-idf.py b/metrics/tf-idf.py
@@ -0,0 +1,82 @@
+import os
+import math
+import re
+import sys
+
+# get all words
+def get_words(filename):
+    """Return the lowercased alphanumeric tokens of the file at *filename*.
+
+    A token is a maximal run of ASCII letters and digits; everything else
+    acts as a separator. Tokens are returned in document order, duplicates
+    included. An empty file yields an empty list.
+    """
+    # Decode explicitly as UTF-8 instead of the platform default, and replace
+    # undecodable bytes so a single bad file cannot abort the ranking.
+    # Replacement characters are non-alphanumeric, so they only act as
+    # separators.
+    with open(filename, 'r', encoding='utf-8', errors='replace') as f:
+        text = f.read()
+    return [token.lower() for token in re.findall('[0-9a-zA-Z]+', text)]
+
+def tfs(prefix, filenames, word):
+    """Map each name in *filenames* to the term frequency of *word* in it.
+
+    *prefix* is prepended to every file name to form the path that is read.
+    """
+    return {name: tf(prefix, name, word) for name in filenames}
+
+def tf(prefix, filename, word):
+    """Return the term frequency of *word* in the file ``prefix + filename``.
+
+    This is the number of tokens equal to *word* divided by the total
+    number of tokens; a document without tokens gives 0.
+    """
+    tokens = get_words(prefix + filename)
+    if not tokens:
+        return 0 # empty documents
+    return tokens.count(word) / len(tokens)
+
+
+# technically, we might just want the one word and output the value for that
+def idf(prefix, filenames):
+    """Return the inverse document frequency of every word in the corpus.
+
+    *prefix* is prepended to each name in *filenames* to form the path that
+    is read. The result maps each word to
+    ``log(len(filenames) / number_of_documents_containing_the_word)``.
+    """
+    document_frequency = {}
+    for filename in filenames:
+        # Count each word once per document. Counting every occurrence would
+        # give the total term count, which can exceed the number of
+        # documents and make the logarithm negative. dict.fromkeys
+        # de-duplicates while keeping first-seen order.
+        for word in dict.fromkeys(get_words(prefix + filename)):
+            document_frequency[word] = document_frequency.get(word, 0) + 1
+    total_documents = len(filenames)
+    return {
+        word: math.log(total_documents / count)
+        for word, count in document_frequency.items()
+    }
+
+if __name__ == "__main__":
+    # Command-line mode needs both arguments: tf-idf.py <word> <top_k>.
+    # With the script name that makes three argv entries; checking for two
+    # would read sys.argv[2] out of range. Any other count means
+    # interactive mode.
+    cli_mode = len(sys.argv) == 3
+    user_input = True # continually prompt if the user is in interactive mode
+    while user_input:
+        if cli_mode:
+            word = sys.argv[1]
+            top_k = int(sys.argv[2])
+            user_input = False
+        else:
+            word = input("Word to find: ")
+            top_k = int(input("Top k elements to show: "))
+
+        # get_words lowercases every token, so the query has to be
+        # lowercased as well or capitalised queries could never match.
+        word = word.strip().lower()
+
+        filenames = os.listdir('parsed')
+        idf_dict = idf('parsed/', filenames)
+
+        if word not in idf_dict:
+            print('Word does not appear in any documents')
+            sys.exit()
+
+        tf_dict = tfs('parsed/', filenames, word)
+
+        tfidf = {
+            filename: idf_dict[word] * tf_dict[filename]
+            for filename in filenames
+        }
+
+        # Highest score first; ties are ordered by file name, descending.
+        sorted_items = sorted(
+            tfidf.items(), key=lambda kv: (kv[1], kv[0]), reverse=True
+        )
+
+        # Slicing tolerates top_k larger than the number of documents;
+        # max() keeps a negative top_k printing nothing.
+        for item in sorted_items[:max(top_k, 0)]:
+            print(item)
diff --git a/seeds/code.txt b/seeds/code.txt
@@ -0,0 +1,5 @@
+https://github.com/trending
+https://codeberg.org/
+https://about.gitlab.com/
+https://github.com/
+https://github.com/topics/awesome
diff --git a/seeds/research.txt b/seeds/research.txt
@@ -0,0 +1,3 @@
+https://arxiv.org/
+https://research.com/journals-rankings/computer-science
+https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:llm
diff --git a/seeds/wikis.txt b/seeds/wikis.txt
@@ -0,0 +1,3 @@
+https://wikipedia.org
+https://archlinux.org/
+https://wiki.ubuntu.com/

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

A	.gitignore	\|	2	++
A	collection/spider.py	\|	72	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	metrics/tf-idf.py	\|	82	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	seeds/code.txt	\|	5	+++++
A	seeds/research.txt	\|	3	+++
A	seeds/wikis.txt	\|	3	+++