information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 01a14abfa7d688731f7220f16dde22042bb068eb
parent 1b5bb8f087a72d206aa5b50669676e519509a755
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Wed, 31 Dec 2025 02:21:06 -0600

Setup larger idf

Diffstat:
M collection/spider.py | 2 --
A indexing/__init__.py | 0
A indexing/__pycache__/__init__.cpython-313.pyc | 0
A indexing/__pycache__/tf.cpython-313.pyc | 0
A indexing/__pycache__/utils.cpython-313.pyc | 0
A indexing/idf.py | 23 +++++++++++++++++++++++
A indexing/utils.py | 39 +++++++++++++++++++++++++++++++++++++++
7 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/collection/spider.py b/collection/spider.py
@@ -22,8 +22,6 @@ import sqlite3
 # tables:
 # site
 # url, filepath, date
-
 #
-
 # bytes
 MAX_SIZE = 2_000_000
diff --git a/indexing/__init__.py b/indexing/__init__.py
diff --git a/indexing/__pycache__/__init__.cpython-313.pyc b/indexing/__pycache__/__init__.cpython-313.pyc
Binary files differ.
diff --git a/indexing/__pycache__/tf.cpython-313.pyc b/indexing/__pycache__/tf.cpython-313.pyc
Binary files differ.
diff --git a/indexing/__pycache__/utils.cpython-313.pyc b/indexing/__pycache__/utils.cpython-313.pyc
Binary files differ.
diff --git a/indexing/idf.py b/indexing/idf.py
@@ -0,0 +1,23 @@
+import sqlite3
+from utils import get_log_doc_freqs
+from utils import get_filepaths
+
+if __name__ == "__main__":
+    filepaths = get_filepaths('sites')
+    print(f'Found {len(filepaths)} files')
+    log_doc_freq = get_log_doc_freqs(filepaths)
+    print(log_doc_freq)
+    print(f"There are {len(log_doc_freq)}")
+
+    con = sqlite3.connect('database/manifest.db', timeout=60)
+    con.execute('PRAGMA journal_mode=WAL')
+    cur = con.cursor()
+    cur.execute("CREATE TABLE IF NOT EXISTS term(name, idf)")
+    cur.execute("DELETE FROM term")
+
+    for term in log_doc_freq:
+        print(f"Term: {str(term)} \tLog Freq: {log_doc_freq[term]}")
+        cur.execute("INSERT INTO term VALUES (?, ?)", (term, log_doc_freq[term]))
+    con.commit()
+    print(f"All {len(log_doc_freq)} terms inserted into the db.")
+    cur.close()
diff --git a/indexing/utils.py b/indexing/utils.py
@@ -0,0 +1,39 @@
+import glob
+import time
+import math
+import tqdm
+import os
+import re
+
+def get_words(filepath):
+    current = set()
+    with open(filepath, 'r') as f:
+        lines = f.read()
+        lines = re.sub('[^0-9a-zA-Z]+', ' ', lines)
+        lines = lines.split(' ')
+        for i in range(0, len(lines)):
+            if lines[i] != '':
+                current.add(lines[i].lower())
+    return current
+
+def get_filepaths(document_directory):
+    filepaths = []
+    for filepath in glob.iglob(document_directory + '**/**', recursive=True):
+        if os.path.isfile(filepath):
+            filepaths.append(filepath)
+    return filepaths
+
+def get_log_doc_freqs(filepaths):
+    current = {}
+    for i in tqdm.tqdm(range(len(filepaths))):
+        filepath = filepaths[i]
+        words = get_words(filepath)
+        for word in words:
+            if word in current:
+                current[word] += 1
+            else:
+                current[word] = 1
+    log_doc_freqs = {}
+    for word in current:
+        log_doc_freqs[word] = math.log(len(filepaths) / current[word])
+    return log_doc_freqs