Continued working, created todo, setup idf, played with language detection, setup project. - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit edd197026c4213a9b7479de73a77d0f3944df0cf
parent 01a14abfa7d688731f7220f16dde22042bb068eb
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Wed, 31 Dec 2025 17:44:39 -0600

Continued working, created todo, setup idf, played with language detection, setup project.

Diffstat:
M .gitignore  | 2 ++
A TODO.md  | 28 ++++++++++++++++++++++++++++
A collection/lang-detect.py  | 20 ++++++++++++++++++++
A collection/prune.py  | 35 +++++++++++++++++++++++++++++++++++
D indexing/__pycache__/__init__.cpython-313.pyc  | 0 
D indexing/__pycache__/tf.cpython-313.pyc  | 0 
D indexing/__pycache__/utils.cpython-313.pyc  | 0 
M indexing/idf.py  | 2 --
M indexing/utils.py  | 46 +++++++++++++++++++++++++++++++++++++++-------
A pyproject.toml  | 8 ++++++++

10 files changed, 132 insertions(+), 9 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,5 @@ documents/
 parsed/
 sites/
 database/
+*/__pycache__/
+information_retrieval.egg-info/
diff --git a/TODO.md b/TODO.md
@@ -0,0 +1,28 @@
+- should we have a term count table for documents?
+- tf indexing
+    - for each document go through each term within it and calculate the tf value directly
+        - we then save something to this table:
+            - tf(document-path, term, value)
+                - indexed: document-path and term
+                    - combined index?
+                        - probably not
+                            - in general we are interested in the terms being indexed, but it probably also makes sense to look up documents for derived values.
+- add linking support?
+    - how to do this?
+        - key value db where keys are urls and values are outlinks?
+- language detection
+    - this should be included as part of the sites lookup
+        - or should this be its own db per lang?
+- should this be authority-based?
+    - on one hand, I hate centralization and authority
+    - on the other hand, is there any way around this if there are LLMs online?
+    - should we just do some sort of LLM prediction?
+        - rank domains based on LLM suspicion
+- distance from authority
+- ensure pruning logic is used during spider crawling so we don't write useless stuff to begin with
+- update deletion of documents to also update the db
+    - this will be used to apply rules backwards
+- improve pruning
+    - currently just based on length, but I could see information content being useful too
+    - since we have words, the information content could be computed based on word frequency
+        - actually, that'd be a bit different because we don't have word frequency globally
diff --git a/collection/lang-detect.py b/collection/lang-detect.py
@@ -0,0 +1,20 @@
+from langdetect import detect
+from indexing.utils import get_plaintext
+from indexing.utils import get_filepaths
+import tqdm
+
+def detect_language(text):
+    return detect(text)
+
+if __name__ == "__main__":
+    filepaths = get_filepaths("sites/")
+    languages = {}
+    for i in tqdm.tqdm(range(0, len(filepaths))):
+        filepath = filepaths[i]
+        current = detect_language(get_plaintext(filepath))
+        if current not in languages:
+            languages[current] = 1
+        else:
+            languages[current] += 1
+    print(languages)
+        
diff --git a/collection/prune.py b/collection/prune.py
@@ -0,0 +1,35 @@
+# prune documents that aren't useful / we don't want.
+import os
+from indexing.utils import get_filepaths
+from indexing.utils import get_plaintext
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import tqdm
+
+# this can happen from rate limiting, requiring js, and some other things too.
+def non_substantive(plaintext):
+    if len(plaintext) < 1_000:
+        return True
+    return False
+
+def drop(plaintext):
+    if non_substantive(plaintext):
+        return True
+    return False
+def process_file(filepath):
+    plaintext = get_plaintext(filepath)
+    if drop(plaintext):
+        os.remove(filepath)
+        return True
+    return False
+
+if __name__ == "__main__":
+    filepaths = get_filepaths("sites")
+    print(f'{len(filepaths)} filepaths found.')
+    deleted = 0
+
+    with ThreadPoolExecutor(max_workers=50) as executor:
+        futures = {executor.submit(process_file, fp): fp for fp in filepaths}
+        for future in tqdm.tqdm(as_completed(futures), total=len(filepaths)):
+            if future.result():
+                deleted += 1
+    print(f'Deleted {deleted} files')
diff --git a/indexing/__pycache__/__init__.cpython-313.pyc b/indexing/__pycache__/__init__.cpython-313.pyc
Binary files differ.
diff --git a/indexing/__pycache__/tf.cpython-313.pyc b/indexing/__pycache__/tf.cpython-313.pyc
Binary files differ.
diff --git a/indexing/__pycache__/utils.cpython-313.pyc b/indexing/__pycache__/utils.cpython-313.pyc
Binary files differ.
diff --git a/indexing/idf.py b/indexing/idf.py
@@ -6,7 +6,6 @@ if __name__ == "__main__":
     filepaths = get_filepaths('sites')
     print(f'Found {len(filepaths)} files')
     log_doc_freq = get_log_doc_freqs(filepaths)
-    print(log_doc_freq)
     print(f"There are {len(log_doc_freq)}")
 
     con = sqlite3.connect('database/manifest.db', timeout=60)
@@ -16,7 +15,6 @@ if __name__ == "__main__":
     cur.execute("DELETE FROM term")
 
     for term in log_doc_freq:
-        print(f"Term: {str(term)} \tLog Freq: {log_doc_freq[term]}")
         cur.execute("INSERT INTO term VALUES (?, ?)", (term, log_doc_freq[term]))
         con.commit()
     print(f"All {len(log_doc_freq)} terms inserted into the db.")
diff --git a/indexing/utils.py b/indexing/utils.py
@@ -1,14 +1,40 @@
 import glob
-import time
+import nltk
+from nltk.corpus import stopwords
+import subprocess
 import math
 import tqdm
 import os
 import re
 
-def get_words(filepath):
+nltk.download('stopwords')
+
+# use lynx to render the raw html. This will give us exactly what a user
+# of the best browser, lynx, would see if they went to the site.
+def get_plaintext(filepath):
+    result = subprocess.run(
+        ['lynx', '--dump', '--force_html', '--nolist', filepath],
+        capture_output=True,
+    )
+
+    plaintext = result.stdout.decode('utf-8', errors='replace')
+
+    plaintext = re.sub(r'\(BUTTON\)\s*', '', plaintext)
+    plaintext = re.sub(r'_{3,}', '', plaintext)
+
+    lines = [line for line in plaintext.splitlines() 
+             if 'REFRESH' not in line 
+             and 'file://' not in line
+             and not re.match(r'^\s+\d{2}-\d{2}/', line)] 
+    return '\n'.join(lines)
+
+def get_plaintext_words(filepath):
+    plaintext = get_plaintext(filepath)
+    return get_words(plaintext)
+
+def get_words(text):
     current = set()
-    with open(filepath, 'r') as f:
-        lines = f.read()
+    lines = text
     lines = re.sub('[^0-9a-zA-Z]+', ' ', lines)
     lines = lines.split(' ')
     for i in range(0, len(lines)):
@@ -23,11 +49,15 @@ def get_filepaths(document_directory):
              filepaths.append(filepath)
     return filepaths
 
-def get_log_doc_freqs(filepaths):
+def get_log_doc_freqs(filepaths, lower_bound_percentage=.0001, upper_bound_percentage=.85, filter_stop_words=True):
+    stop_words = set()
+    if filter_stop_words:
+        stop_words = set(stopwords.words('english'))
+
     current = {}
     for i in tqdm.tqdm(range(len(filepaths))):
         filepath = filepaths[i]
-        words = get_words(filepath)
+        words = get_plaintext_words(filepath)
         for word in words:
             if word in current:
                 current[word] += 1
@@ -35,5 +65,7 @@ def get_log_doc_freqs(filepaths):
                 current[word] = 1
     log_doc_freqs = {}
     for word in current:
-        log_doc_freqs[word] = math.log(len(filepaths) / current[word])
+        # random word or stop word?
+        if current[word] > int(lower_bound_percentage * len(filepaths)) and current[word] < int(len(filepaths) * upper_bound_percentage) and word not in stop_words:
+            log_doc_freqs[word] = math.log(len(filepaths) / current[word])
     return log_doc_freqs
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,8 @@
+[project]
+name = "information_retrieval"
+version = "0.1.0"
+[tool.setuptools.packages.find]
+where = ["."]
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log | Files | Refs

M	.gitignore	|	2	++
A	TODO.md	|	28	++++++++++++++++++++++++++++
A	collection/lang-detect.py	|	20	++++++++++++++++++++
A	collection/prune.py	|	35	+++++++++++++++++++++++++++++++++++
D	indexing/__pycache__/__init__.cpython-313.pyc	|	0
D	indexing/__pycache__/tf.cpython-313.pyc	|	0
D	indexing/__pycache__/utils.cpython-313.pyc	|	0
M	indexing/idf.py	|	2	--
M	indexing/utils.py	|	46	+++++++++++++++++++++++++++++++++++++++-------
A	pyproject.toml	|	8	++++++++