Sent my first queriesgit status! - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit 677346e211c3c9544f3708ab117f2f61e2b27f01
parent 6c374cfe6d87c097ed0f3fe655d4b5ce0cc68837
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Thu,  1 Jan 2026 17:48:23 -0600

Sent my first queriesgit status!

Diffstat:
M TODO.md  | 5 +++++
M collection/spider.py  | 18 +++++++++---------
M indexing/utils.py  | 2 +-
A search/query.py  | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

4 files changed, 78 insertions(+), 10 deletions(-)
diff --git a/TODO.md b/TODO.md
@@ -31,6 +31,11 @@
 
 ---
 
+- update idf to be incrementally calculated
+    - constantly updating as things change in the corpus
+        - hmm, what about when we remove old stuff though? that stuff will throw off the stats more and more over time...
+        - might not want this to be incremental after all...
+- indexing should include adding language
 - add centralized indexing
     - i added an indexed field to support this idea for incremental indexing
 - smarter queueing
diff --git a/collection/spider.py b/collection/spider.py
@@ -46,7 +46,7 @@ import time
 
 # bytes
 MAX_SIZE = 2_000_000
-MAX_WORKERS = 250
+MAX_WORKERS = 50
 MAX_URLS_PER_SITE = 100
 NOT_INDEXED = 0
 INDEXED = 1
@@ -59,7 +59,7 @@ def should_queue(url, cur):
     """, (url, cutoff))
     return cur.fetchone() is None
 
-def is_allowed(url, user_agent, timeout=.1):
+def is_allowed(url, user_agent, timeout=.5):
     try:
         parsed = urlparse(url)
         robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
@@ -72,14 +72,14 @@ def is_allowed(url, user_agent, timeout=.1):
         return True
 
 # you should always repect robots.txt, but if you are trying to do something with this spider I guess you can
-# disable it. please don't do this enmasse though, that's naughty.
-
+# disable it. please don't do this en-masse though, that's naughty.
+# TODO: Check the size with a HEAD request prior to reading into memory. 
 def search_url(url, filepath, respect_robots_txt=True):
     user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'
     if respect_robots_txt:
         if not is_allowed(url,user_agent):
             print(f"Can't crawl {url} due to robots.txt violation")
-            return "", "", []
+            return "", "", set()
 
     links = set()
 
@@ -91,12 +91,12 @@ def search_url(url, filepath, respect_robots_txt=True):
         source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory
         if not source_code.ok:
             print(f'Status code not 2xx for {url}, returning.')
-            return "", "", []
+            return "", "", set()
         
         content_type = source_code.headers.get('Content-Type', '')
         if 'text/html' not in content_type:
             print(f'Content type for {url} not html, returning.')
-            return "", "", []
+            return "", "", set()
 
         soup = BeautifulSoup(source_code.content, 'html.parser')
         content = soup.prettify()
@@ -110,12 +110,12 @@ def search_url(url, filepath, respect_robots_txt=True):
             # we don't want to spider from bad sites.
             # process_file does some regexp checks on the site to see if it is short / bad in some other way.
             if deleted:
-                return "", "", []
+                return "", "", set()
         else:
             print(f'skipping fs write for {url}, too large')
     except Exception as e:
         print(e)
-        return "", "", []
+        return "", "", set()
 
     current_url_without_fragment = urlparse(url)._replace(fragment='').geturl()
 
diff --git a/indexing/utils.py b/indexing/utils.py
@@ -8,7 +8,6 @@ import tqdm
 import os
 import re
 
-nltk.download('stopwords')
 
 # get term frequencies for file given our list of terms (we only care about indexed terms which doesn't include certain terms like stop words)
 def get_tfs(filepath, terms):
@@ -104,6 +103,7 @@ def get_filepaths(document_directory):
 def get_log_doc_freqs(filepaths, lower_bound_percentage=0, upper_bound_percentage=1, filter_stop_words=False):
     stop_words = set()
     if filter_stop_words:
+        nltk.download('stopwords')
         stop_words = set(stopwords.words('english'))
 
     current = {}
diff --git a/search/query.py b/search/query.py
@@ -0,0 +1,63 @@
+import sqlite3
+import sys
+from indexing.utils import get_words
+
+if __name__ == "__main__":
+    # Command-line search over the tf / term / site tables in the manifest db.
+    # sys.argv[1] is the query text, sys.argv[2] the maximum number of results.
+    # Documents are ranked by the sum of tf * idf over the query terms and
+    # printed as "<score> - <url>", best score first.
+    query = sys.argv[1]
+    limit = int(sys.argv[2])
+
+    query = get_words(query)
+    con = sqlite3.connect('database/manifest.db', timeout=60)
+    cur = con.cursor()
+
+    try:
+        idfs = {}
+
+        for term in query:
+            # term(name, idf);
+            cur.execute("SELECT idf from term where name = ?", (term,))
+            idf = cur.fetchone()
+            # fetchone() returns None (not an empty row) when nothing matches,
+            # so test for None instead of calling len() on the result.
+            if idf is not None and idf[0] is not None:
+                idfs[term] = idf[0]
+            else:
+                # Should this be a really large value????
+                # there is a lot of information in a value that doesn't have an idf, unless we messed
+                # something up....
+                idfs[term] = 0
+                print("Couldn't find idf...")
+
+        tfidf = {}
+
+        for term in query:
+            # tf(document_path, term, value)
+            cur.execute("SELECT document_path, value from tf where term = ? ORDER BY value desc", (term,))
+            # Accumulate each document's score across all query terms.
+            for doc_path, value in cur.fetchall():
+                tfidf[doc_path] = tfidf.get(doc_path, 0.0) + float(value) * idfs[term]
+
+        sorted_results = sorted(tfidf.items(), key=lambda x: x[1], reverse=True)
+
+        # site(url, filepath, date, indexed)
+        count = 0
+        for doc_path, score in sorted_results:
+            # ">=" so that at most `limit` documents are processed; the old
+            # "count > limit" test let limit + 1 through.
+            if count >= limit:
+                break
+            cur.execute("SELECT url from site where filepath = ? ORDER BY date LIMIT 1", (doc_path,))
+            rows = cur.fetchall()
+            # TODO: Add check for safety
+            if len(rows) > 0 and len(rows[0]) > 0:
+                url = rows[0][0]
+                print(f"{score:.4f} - {url}")
+            count += 1
+    finally:
+        # Release the database handles even if a query above raised.
+        cur.close()
+        con.close()

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

M	TODO.md	\|	5	+++++
M	collection/spider.py	\|	18	+++++++++---------
M	indexing/utils.py	\|	2	+-
A	search/query.py	\|	63	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++