information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit edd197026c4213a9b7479de73a77d0f3944df0cf
parent 01a14abfa7d688731f7220f16dde22042bb068eb
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Wed, 31 Dec 2025 17:44:39 -0600

Continued working, created todo, set up idf, played with language detection, set up project.

Diffstat:
M .gitignore | 2 ++
A TODO.md | 28 ++++++++++++++++++++++++++++
A collection/lang-detect.py | 20 ++++++++++++++++++++
A collection/prune.py | 35 +++++++++++++++++++++++++++++++++++
D indexing/__pycache__/__init__.cpython-313.pyc | 0
D indexing/__pycache__/tf.cpython-313.pyc | 0
D indexing/__pycache__/utils.cpython-313.pyc | 0
M indexing/idf.py | 2 --
M indexing/utils.py | 46 +++++++++++++++++++++++++++++++++++++++-------
A pyproject.toml | 8 ++++++++
10 files changed, 132 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -2,3 +2,5 @@ documents/ parsed/ sites/ database/ +*/__pycache__/ +information_retrieval.egg-info/ diff --git a/TODO.md b/TODO.md @@ -0,0 +1,28 @@ +- should we have a term count table for documents? +- tf indexing + - for each document go through each term within it and calculate the tf value directly + - we then save something to this table: + - tf(document-path, term, value) + - indexed: document-path and term + - combined index? + - probably not + - in general we are interested in the terms being indexed, but it also probably makes sense to lookup documents too for derived values. +- add linking support? + - how to do this? + - key value db where keys are urls and values are outlinks? +- language detection + - this should be included as part of the sites lookup + - or should this be its own db per lang? +- should this be authority based + - on one hand, I hate centralization and authority + - on the other hand, is there any way around this if there are llms online + - should we just do some sort of llm prediction? 
+ - rank domains based on llm suspicion +- distance from authority +- ensure pruning logic is used during spider crawling so we don't write useless stuff to begin with +- update deletion of documents to also update the db + - this will be used to apply rules backwards +- improve pruning + - currently just based on length, but I could see information content being useful too + - since we have words, the information content could be computed based on word frequency + - actually, that'd be a bit different because we don't have word frequency globally diff --git a/collection/lang-detect.py b/collection/lang-detect.py @@ -0,0 +1,20 @@ +from langdetect import detect +from indexing.utils import get_plaintext +from indexing.utils import get_filepaths +import tqdm + +def detect_language(text): + return detect(text) + +if __name__ == "__main__": + filepaths = get_filepaths("sites/") + languages = {} + for i in tqdm.tqdm(range(0, len(filepaths))): + filepath = filepaths[i] + current = detect_language(get_plaintext(filepath)) + if current not in languages: + languages[current] = 1 + else: + languages[current] += 1 + print(languages) + diff --git a/collection/prune.py b/collection/prune.py @@ -0,0 +1,35 @@ +# prune documents that aren't useful / we don't want. +import os +from indexing.utils import get_filepaths +from indexing.utils import get_plaintext +from concurrent.futures import ThreadPoolExecutor, as_completed +import tqdm + +# this can happen from rate limiting, requiring js, and some other things too. 
+def non_substantive(plaintext): + if len(plaintext) < 1_000: + return True + return False + +def drop(plaintext): + if non_substantive(plaintext): + return True + return False +def process_file(filepath): + plaintext = get_plaintext(filepath) + if drop(plaintext): + os.remove(filepath) + return True + return False + +if __name__ == "__main__": + filepaths = get_filepaths("sites") + print(f'{len(filepaths)} filepaths found.') + deleted = 0 + + with ThreadPoolExecutor(max_workers=50) as executor: + futures = {executor.submit(process_file, fp): fp for fp in filepaths} + for future in tqdm.tqdm(as_completed(futures), total=len(filepaths)): + if future.result(): + deleted += 1 + print(f'Deleted {deleted} files') diff --git a/indexing/__pycache__/__init__.cpython-313.pyc b/indexing/__pycache__/__init__.cpython-313.pyc Binary files differ. diff --git a/indexing/__pycache__/tf.cpython-313.pyc b/indexing/__pycache__/tf.cpython-313.pyc Binary files differ. diff --git a/indexing/__pycache__/utils.cpython-313.pyc b/indexing/__pycache__/utils.cpython-313.pyc Binary files differ. 
diff --git a/indexing/idf.py b/indexing/idf.py @@ -6,7 +6,6 @@ if __name__ == "__main__": filepaths = get_filepaths('sites') print(f'Found {len(filepaths)} files') log_doc_freq = get_log_doc_freqs(filepaths) - print(log_doc_freq) print(f"There are {len(log_doc_freq)}") con = sqlite3.connect('database/manifest.db', timeout=60) @@ -16,7 +15,6 @@ if __name__ == "__main__": cur.execute("DELETE FROM term") for term in log_doc_freq: - print(f"Term: {str(term)} \tLog Freq: {log_doc_freq[term]}") cur.execute("INSERT INTO term VALUES (?, ?)", (term, log_doc_freq[term])) con.commit() print(f"All {len(log_doc_freq)} terms inserted into the db.") diff --git a/indexing/utils.py b/indexing/utils.py @@ -1,14 +1,40 @@ import glob -import time +import nltk +from nltk.corpus import stopwords +import subprocess import math import tqdm import os import re -def get_words(filepath): +nltk.download('stopwords') + +# use lynx to render the raw html. This will give us exactly what a user +# of the best browser, lynx, would see if they went to the site. 
+def get_plaintext(filepath): + result = subprocess.run( + ['lynx', '--dump', '--force_html', '--nolist', filepath], + capture_output=True, + ) + + plaintext = result.stdout.decode('utf-8', errors='replace') + + plaintext = re.sub(r'\(BUTTON\)\s*', '', plaintext) + plaintext = re.sub(r'_{3,}', '', plaintext) + + lines = [line for line in plaintext.splitlines() + if 'REFRESH' not in line + and 'file://' not in line + and not re.match(r'^\s+\d{2}-\d{2}/', line)] + return '\n'.join(lines) + +def get_plaintext_words(filepath): + plaintext = get_plaintext(filepath) + return get_words(plaintext) + +def get_words(text): current = set() - with open(filepath, 'r') as f: - lines = f.read() + lines = text lines = re.sub('[^0-9a-zA-Z]+', ' ', lines) lines = lines.split(' ') for i in range(0, len(lines)): @@ -23,11 +49,15 @@ def get_filepaths(document_directory): filepaths.append(filepath) return filepaths -def get_log_doc_freqs(filepaths): +def get_log_doc_freqs(filepaths, lower_bound_percentage=.0001, upper_bound_percentage=.85, filter_stop_words=True): + stop_words = set() + if filter_stop_words: + stop_words = set(stopwords.words('english')) + current = {} for i in tqdm.tqdm(range(len(filepaths))): filepath = filepaths[i] - words = get_words(filepath) + words = get_plaintext_words(filepath) for word in words: if word in current: current[word] += 1 @@ -35,5 +65,7 @@ def get_log_doc_freqs(filepaths): current[word] = 1 log_doc_freqs = {} for word in current: - log_doc_freqs[word] = math.log(len(filepaths) / current[word]) + # random word or stop word? 
+ if current[word] > int(lower_bound_percentage * len(filepaths)) and current[word] < int(len(filepaths) * upper_bound_percentage) and word not in stop_words: + log_doc_freqs[word] = math.log(len(filepaths) / current[word]) return log_doc_freqs diff --git a/pyproject.toml b/pyproject.toml @@ -0,0 +1,8 @@ +[project] +name = "information_retrieval" +version = "0.1.0" +[tool.setuptools.packages.find] +where = ["."] +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta"