commit edd197026c4213a9b7479de73a77d0f3944df0cf
parent 01a14abfa7d688731f7220f16dde22042bb068eb
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Wed, 31 Dec 2025 17:44:39 -0600
Continued working, created todo, setup idf, played with language detection, setup project.
Diffstat:
10 files changed, 132 insertions(+), 9 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,5 @@ documents/
parsed/
sites/
database/
+*/__pycache__/
+information_retrieval.egg-info/
diff --git a/TODO.md b/TODO.md
@@ -0,0 +1,28 @@
+- should we have a term count table for documents?
+- tf indexing
+ - for each document go through each term within it and calculate the tf value directly
+ - we then save something to this table:
+ - tf(document-path, term, value)
+ - indexed: document-path and term
+ - combined index?
+ - probably not
+ - in general we are interested in the terms being indexed, but it probably also makes sense to look up documents for derived values.
+- add linking support?
+ - how to do this?
+ - key value db where keys are urls and values are outlinks?
+- language detection
+ - this should be included as part of the sites lookup
+ - or should this be its own db per lang?
+- should this be authority-based?
+ - on one hand, I hate centralization and authority
+ - on the other hand, is there any way around this if there are LLMs online?
+ - should we just do some sort of LLM prediction?
+ - rank domains based on LLM suspicion
+- distance from authority
+- ensure pruning logic is used during spider crawling so we don't write useless stuff to begin with
+- update deletion of documents to also update the db
+ - this will be used to apply rules backwards
+- improve pruning
+ - currently just based on length, but I could see information content being useful too
+ - since we have words, the information content could be computed based on word frequency
+ - actually, that'd be a bit different because we don't have word frequency globally
diff --git a/collection/lang-detect.py b/collection/lang-detect.py
@@ -0,0 +1,20 @@
+from langdetect import detect
+from indexing.utils import get_plaintext
+from indexing.utils import get_filepaths
+import tqdm
+
+def detect_language(text):
+    """Return whatever langdetect's ``detect`` reports for ``text``."""
+    language = detect(text)
+    return language
+
+if __name__ == "__main__":
+    # Tally how many documents under sites/ are detected as each language.
+    # Imports are local so this entry point stays self-contained.
+    from collections import Counter
+    from langdetect.lang_detect_exception import LangDetectException
+
+    filepaths = get_filepaths("sites/")
+    languages = Counter()
+    for filepath in tqdm.tqdm(filepaths):
+        try:
+            current = detect_language(get_plaintext(filepath))
+        except LangDetectException:
+            # langdetect raises when the text gives it nothing to work with
+            # (e.g. an empty render); count it instead of aborting the scan
+            # and losing the tally gathered so far.
+            current = "unknown"
+        languages[current] += 1
+    # print a plain dict so the output format matches a hand-built tally
+    print(dict(languages))
+
diff --git a/collection/prune.py b/collection/prune.py
@@ -0,0 +1,35 @@
+# prune documents that aren't useful / we don't want.
+import os
+from indexing.utils import get_filepaths
+from indexing.utils import get_plaintext
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import tqdm
+
+# this can happen from rate limiting, requiring js, and some other things too.
+def non_substantive(plaintext, min_length=1_000):
+    """Report whether ``plaintext`` is too short to be worth keeping.
+
+    Args:
+        plaintext: rendered text of a document.
+        min_length: number of characters a document needs in order to count
+            as substantive. Defaults to 1_000, the previously hard-coded
+            threshold.
+
+    Returns:
+        True when ``len(plaintext)`` is below ``min_length``, otherwise False.
+    """
+    return len(plaintext) < min_length
+
+def drop(plaintext):
+    """Decide whether a document with this plaintext should be pruned.
+
+    The only rule applied is the non_substantive() check; the result is
+    always a plain bool.
+    """
+    should_drop = bool(non_substantive(plaintext))
+    return should_drop
+def process_file(filepath):
+    """Render ``filepath`` to plaintext and delete the file if drop() rejects it.
+
+    Returns True when the file was removed and False when it was kept.
+    """
+    if not drop(get_plaintext(filepath)):
+        return False
+    os.remove(filepath)
+    return True
+
+if __name__ == "__main__":
+    # gather every candidate document under sites/ up front
+    filepaths = get_filepaths("sites")
+    print(f'{len(filepaths)} filepaths found.')
+
+    # fan process_file() out over a thread pool and count how many calls
+    # reported that they deleted their file
+    with ThreadPoolExecutor(max_workers=50) as executor:
+        futures = {executor.submit(process_file, fp): fp for fp in filepaths}
+        progress = tqdm.tqdm(as_completed(futures), total=len(filepaths))
+        deleted = sum(1 for future in progress if future.result())
+    print(f'Deleted {deleted} files')
diff --git a/indexing/__pycache__/__init__.cpython-313.pyc b/indexing/__pycache__/__init__.cpython-313.pyc
Binary files differ.
diff --git a/indexing/__pycache__/tf.cpython-313.pyc b/indexing/__pycache__/tf.cpython-313.pyc
Binary files differ.
diff --git a/indexing/__pycache__/utils.cpython-313.pyc b/indexing/__pycache__/utils.cpython-313.pyc
Binary files differ.
diff --git a/indexing/idf.py b/indexing/idf.py
@@ -6,7 +6,6 @@ if __name__ == "__main__":
filepaths = get_filepaths('sites')
print(f'Found {len(filepaths)} files')
log_doc_freq = get_log_doc_freqs(filepaths)
- print(log_doc_freq)
print(f"There are {len(log_doc_freq)}")
con = sqlite3.connect('database/manifest.db', timeout=60)
@@ -16,7 +15,6 @@ if __name__ == "__main__":
cur.execute("DELETE FROM term")
for term in log_doc_freq:
- print(f"Term: {str(term)} \tLog Freq: {log_doc_freq[term]}")
cur.execute("INSERT INTO term VALUES (?, ?)", (term, log_doc_freq[term]))
con.commit()
print(f"All {len(log_doc_freq)} terms inserted into the db.")
diff --git a/indexing/utils.py b/indexing/utils.py
@@ -1,14 +1,40 @@
import glob
-import time
+import nltk
+from nltk.corpus import stopwords
+import subprocess
import math
import tqdm
import os
import re
-def get_words(filepath):
+# Make sure the stopwords corpus is available, but only invoke the downloader
+# when it is missing: an unconditional nltk.download() call would run on
+# every import of this module, including from scripts that never use stopwords.
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('stopwords')
+
+# use lynx to render the raw html. This will give us exactly what a user
+# of the best browser, lynx, would see if they went to the site.
+def get_plaintext(filepath):
+    """Render the HTML file at ``filepath`` to text via ``lynx --dump``.
+
+    The lynx output is decoded as UTF-8 (undecodable bytes are replaced),
+    ``(BUTTON)`` markers and runs of three or more underscores are removed,
+    and lines that contain ``REFRESH`` or ``file://``, or that start with
+    whitespace followed by an ``NN-NN/`` pattern, are dropped.
+
+    NOTE(review): the lynx exit status is not inspected, so a failed render
+    returns whatever lynx wrote to stdout (possibly an empty string).
+    """
+    command = ['lynx', '--dump', '--force_html', '--nolist', filepath]
+    rendered = subprocess.run(command, capture_output=True).stdout
+    text = rendered.decode('utf-8', errors='replace')
+
+    # remove the '(BUTTON)' markers first, then the underscore runs
+    for pattern in (r'\(BUTTON\)\s*', r'_{3,}'):
+        text = re.sub(pattern, '', text)
+
+    kept = []
+    for line in text.splitlines():
+        if 'REFRESH' in line or 'file://' in line:
+            continue
+        if re.match(r'^\s+\d{2}-\d{2}/', line):
+            continue
+        kept.append(line)
+    return '\n'.join(kept)
+
+def get_plaintext_words(filepath):
+    """Return get_words() applied to the lynx-rendered text of ``filepath``."""
+    return get_words(get_plaintext(filepath))
+
+def get_words(text):
current = set()
- with open(filepath, 'r') as f:
- lines = f.read()
+ lines = text
lines = re.sub('[^0-9a-zA-Z]+', ' ', lines)
lines = lines.split(' ')
for i in range(0, len(lines)):
@@ -23,11 +49,15 @@ def get_filepaths(document_directory):
filepaths.append(filepath)
return filepaths
-def get_log_doc_freqs(filepaths):
+def get_log_doc_freqs(filepaths, lower_bound_percentage=.0001, upper_bound_percentage=.85, filter_stop_words=True):
+ stop_words = set()
+ if filter_stop_words:
+ stop_words = set(stopwords.words('english'))
+
current = {}
for i in tqdm.tqdm(range(len(filepaths))):
filepath = filepaths[i]
- words = get_words(filepath)
+ words = get_plaintext_words(filepath)
for word in words:
if word in current:
current[word] += 1
@@ -35,5 +65,7 @@ def get_log_doc_freqs(filepaths):
current[word] = 1
log_doc_freqs = {}
for word in current:
- log_doc_freqs[word] = math.log(len(filepaths) / current[word])
+ # random word or stop word?
+ if current[word] > int(lower_bound_percentage * len(filepaths)) and current[word] < int(len(filepaths) * upper_bound_percentage) and word not in stop_words:
+ log_doc_freqs[word] = math.log(len(filepaths) / current[word])
return log_doc_freqs
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,8 @@
+[project]
+name = "information_retrieval"
+version = "0.1.0"
+[tool.setuptools.packages.find]
+where = ["."]
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"