commit 01a14abfa7d688731f7220f16dde22042bb068eb
parent 1b5bb8f087a72d206aa5b50669676e519509a755
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Wed, 31 Dec 2025 02:21:06 -0600
Setup larger idf
Diffstat:
7 files changed, 62 insertions(+), 2 deletions(-)
diff --git a/collection/spider.py b/collection/spider.py
@@ -22,8 +22,6 @@ import sqlite3
# tables:
# site
# url, filepath, date
- #
-
# bytes
MAX_SIZE = 2_000_000
diff --git a/indexing/__init__.py b/indexing/__init__.py
diff --git a/indexing/__pycache__/__init__.cpython-313.pyc b/indexing/__pycache__/__init__.cpython-313.pyc
Binary files differ.
diff --git a/indexing/__pycache__/tf.cpython-313.pyc b/indexing/__pycache__/tf.cpython-313.pyc
Binary files differ.
diff --git a/indexing/__pycache__/utils.cpython-313.pyc b/indexing/__pycache__/utils.cpython-313.pyc
Binary files differ.
diff --git a/indexing/idf.py b/indexing/idf.py
@@ -0,0 +1,23 @@
+import sqlite3
+from utils import get_log_doc_freqs
+from utils import get_filepaths
+
+if __name__ == "__main__":
+    # Rebuild the `term` table of database/manifest.db with one row per
+    # distinct word found under ./sites, storing the value returned for it by
+    # get_log_doc_freqs().
+    filepaths = get_filepaths('sites')
+    print(f'Found {len(filepaths)} files')
+    log_doc_freq = get_log_doc_freqs(filepaths)
+    print(log_doc_freq)
+    print(f"There are {len(log_doc_freq)}")
+
+    con = sqlite3.connect('database/manifest.db', timeout=60)
+    try:
+        con.execute('PRAGMA journal_mode=WAL')
+        con.execute("CREATE TABLE IF NOT EXISTS term(name, idf)")
+
+        # Replace the table contents inside a single transaction: `with con`
+        # commits when the block finishes and rolls back if it raises, so a
+        # failure part-way through does not leave the table half-filled.
+        with con:
+            con.execute("DELETE FROM term")
+            for term, idf in log_doc_freq.items():
+                print(f"Term: {str(term)} \tLog Freq: {idf}")
+                con.execute("INSERT INTO term VALUES (?, ?)", (term, idf))
+        print(f"All {len(log_doc_freq)} terms inserted into the db.")
+    finally:
+        # Always release the database handle, even when indexing fails.
+        con.close()
diff --git a/indexing/utils.py b/indexing/utils.py
@@ -0,0 +1,39 @@
+import glob
+import time
+import math
+import tqdm
+import os
+import re
+
+def get_words(filepath):
+    """Return the set of distinct lowercased words contained in a file.
+
+    A word is a maximal run of ASCII letters and digits; every other
+    character acts as a separator.
+
+    Args:
+        filepath: Path of the text file to read.
+
+    Returns:
+        A set of lowercased words. Empty when the file has no letters or
+        digits.
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+    """
+    # Decode explicitly instead of relying on the locale default. Bytes that
+    # are not valid UTF-8 become U+FFFD, which the pattern below treats as a
+    # separator, so one badly encoded file cannot abort the whole run.
+    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
+        text = f.read()
+    return {word.lower() for word in re.findall('[0-9a-zA-Z]+', text)}
+
+def get_filepaths(document_directory):
+    """Return the paths of all regular files beneath a directory.
+
+    The directory is searched recursively. Entries whose names start with a
+    dot are skipped, which is the default behaviour of ``glob``.
+
+    Args:
+        document_directory: Directory to search, with or without a trailing
+            path separator.
+
+    Returns:
+        A list of file paths, each prefixed with ``document_directory``.
+        Directories themselves are not included.
+    """
+    # Join with a real path separator so that '**' is its own path component.
+    # Plain concatenation ('sites' + '**/**') yields 'sites**/**', which also
+    # matches sibling directories such as 'sites_old'. Escaping keeps glob
+    # metacharacters in the directory name from being read as a pattern.
+    pattern = os.path.join(glob.escape(document_directory), '**')
+    return [
+        path
+        for path in glob.iglob(pattern, recursive=True)
+        if os.path.isfile(path)
+    ]
+
+def get_log_doc_freqs(filepaths):
+    """Compute log(N / df) for every word found in the given files.
+
+    N is ``len(filepaths)`` and df is the number of files whose word set
+    (as returned by ``get_words``) contains the word.
+
+    Args:
+        filepaths: Sequence of file paths to scan; a progress bar is shown
+            while they are read.
+
+    Returns:
+        A dict mapping each word to ``math.log(N / df)``.
+    """
+    doc_counts = {}
+    for path in tqdm.tqdm(filepaths):
+        # get_words() returns a set, so each file counts a word at most once.
+        for word in get_words(path):
+            doc_counts[word] = doc_counts.get(word, 0) + 1
+
+    total_docs = len(filepaths)
+    return {
+        word: math.log(total_docs / count)
+        for word, count in doc_counts.items()
+    }