information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 6d3b8448a30a10843c1f3d016516729060da39ad
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Tue, 30 Dec 2025 17:17:12 -0600

Implemented tf-idf and created a basic spider

Diffstat:
A .gitignore | 2 ++
A collection/spider.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A metrics/tf-idf.py | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A seeds/code.txt | 5 +++++
A seeds/research.txt | 3 +++
A seeds/wikis.txt | 3 +++
6 files changed, 167 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+documents/
+parsed/
diff --git a/collection/spider.py b/collection/spider.py
@@ -0,0 +1,72 @@
+import requests
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from bs4 import BeautifulSoup
+import sys
+import base64
+
+# bytes
+MAX_SIZE = 1_000_000
+MAX_WORKERS = 1000
+
+def url_to_filename(url):
+    return base64.urlsafe_b64encode(url.encode()).decode() + ".html"
+
+def filename_to_url(filename):
+    return base64.urlsafe_b64decode(filename[:-5]).decode()
+
+def search_url(url, filepath):
+    links = []
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0',
+    }
+    try:
+        source_code = requests.get(url, headers=headers, timeout=1)
+        soup = BeautifulSoup(source_code.content, 'html.parser')
+        content = soup.prettify()
+
+        if len(content.encode('utf-8')) < MAX_SIZE:
+            with open(filepath, 'w') as f:
+                f.write(content)
+        else:
+            print(f'skipping {url}, too large')
+    except Exception as e:
+        print(e)
+        return url, []
+    for link in soup.find_all('a', href=True):
+        if link.get('href').startswith('https://'):
+            links.append(link.get('href'))
+    return url, links
+
+if __name__ == "__main__":
+    seed_filename = sys.argv[1]
+    urls = []
+    with open(seed_filename, 'r') as f:
+        urls = f.readlines()
+    for i in range(len(urls)):
+        urls[i] = urls[i].strip()
+    print(urls)
+
+    searched = set()
+
+    save_location = sys.argv[2]
+    depth = 0
+    max_depth = int(sys.argv[3])
+
+    while len(urls) != 0 and depth < max_depth:
+        print(f"Depth {depth}: processing {len(urls)} URLs")
+        next_urls = set()
+        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            futures = {
+                executor.submit(search_url, url, save_location + url_to_filename(url)): url
+                for url in urls
+            }
+
+            for future in as_completed(futures):
+                url, links = future.result()
+                searched.add(url)
+                for link in links:
+                    if link not in searched and link not in urls:
+                        next_urls.add(link)
+
+        urls = next_urls
+        depth += 1
diff --git a/metrics/tf-idf.py b/metrics/tf-idf.py
@@ -0,0 +1,82 @@
+import os
+import math
+import re
+import sys
+
+# get all words
+def get_words(filename):
+    with open(filename, 'r') as f:
+        lines = f.read()
+    lines = re.sub('[^0-9a-zA-Z]+', ' ', lines)
+    lines = lines.split(' ')
+    final = []
+    for i in range(0, len(lines)):
+        if lines[i] != '':
+            final.append(lines[i].lower())
+    return final
+
+def tfs(prefix, filenames, word):
+    tfs = {}
+    for filename in filenames:
+        tfs[filename] = tf(prefix, filename, word)
+    return tfs
+
+def tf(prefix, filename, word):
+    words = get_words(prefix + filename)
+    count_of_word = 0
+    for cw in words:
+        if cw == word:
+            count_of_word += 1
+    if len(words) != 0:
+        return count_of_word / len(words)
+    return 0 # empty documents
+
+
+# technically, we might just want the one word and output the value for that
+def idf(prefix, filenames):
+    word_document_frequency = {}
+    for filename in filenames:
+        words = get_words(prefix + filename)
+        for word in words:
+            if word in word_document_frequency:
+                word_document_frequency[word] += 1
+            else:
+                word_document_frequency[word] = 1
+    idf = word_document_frequency.copy()
+    for word in idf:
+        idf[word] = math.log(len(filenames) / idf[word])
+    return idf
+
+if __name__ == "__main__":
+    user_input = True # continually prompt if the user is in interactive mode
+    while user_input:
+        word = ""
+        top_k = 1
+        if len(sys.argv) == 2:
+            word = sys.argv[1]
+            top_k = int(sys.argv[2])
+            user_input = False
+        else:
+            user_input = True
+            word = input("Word to find: ")
+            top_k = int(input("Top k elements to show: "))
+
+        filenames = os.listdir('parsed')
+        idf_dict = idf('parsed/', filenames)
+
+        if word not in idf_dict:
+            print('Word does not appear in any documents')
+            exit()
+
+        tf_dict = tfs('parsed/', filenames, word)
+
+        tfidf = {}
+
+        for filename in filenames:
+            tfidf[filename] = idf_dict[word] * tf_dict[filename]
+
+        sorted_items = sorted(tfidf.items(), key=lambda kv: (kv[1], kv[0]))
+        sorted_items.reverse()
+
+        for i in range(top_k):
+            print(sorted_items[i])
diff --git a/seeds/code.txt b/seeds/code.txt
@@ -0,0 +1,5 @@
+https://github.com/trending
+https://codeberg.org/
+https://about.gitlab.com/
+https://github.com/
+https://github.com/topics/awesome
diff --git a/seeds/research.txt b/seeds/research.txt
@@ -0,0 +1,3 @@
+https://arxiv.org/
+https://research.com/journals-rankings/computer-science
+https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:llm
diff --git a/seeds/wikis.txt b/seeds/wikis.txt
@@ -0,0 +1,3 @@
+https://wikipedia.org
+https://archlinux.org/
+https://wiki.ubuntu.com/