commit 677346e211c3c9544f3708ab117f2f61e2b27f01
parent 6c374cfe6d87c097ed0f3fe655d4b5ce0cc68837
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Thu, 1 Jan 2026 17:48:23 -0600
Sent my first queries!
Diffstat:
4 files changed, 78 insertions(+), 10 deletions(-)
diff --git a/TODO.md b/TODO.md
@@ -31,6 +31,11 @@
---
+- update idf to be incrementally calculated
+ - constantly updating as things change in the corpus
+ - hmm, what about when we remove old stuff though? that stuff will throw off the stats more and more over time...
+ - might not want this to be incremental after all...
+- indexing should include adding language
- add centralized indexing
- i added an indexed field to support this idea for incremental indexing
- smarter queueing
diff --git a/collection/spider.py b/collection/spider.py
@@ -46,7 +46,7 @@ import time
# bytes
MAX_SIZE = 2_000_000
-MAX_WORKERS = 250
+MAX_WORKERS = 50
MAX_URLS_PER_SITE = 100
NOT_INDEXED = 0
INDEXED = 1
@@ -59,7 +59,7 @@ def should_queue(url, cur):
""", (url, cutoff))
return cur.fetchone() is None
-def is_allowed(url, user_agent, timeout=.1):
+def is_allowed(url, user_agent, timeout=.5):
try:
parsed = urlparse(url)
robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
@@ -72,14 +72,14 @@ def is_allowed(url, user_agent, timeout=.1):
return True
# you should always respect robots.txt, but if you are trying to do something with this spider I guess you can
-# disable it. please don't do this enmasse though, that's naughty.
-
+# disable it. please don't do this en-masse though, that's naughty.
+# TODO: Check the size with a HEAD request prior to reading into memory.
def search_url(url, filepath, respect_robots_txt=True):
user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'
if respect_robots_txt:
if not is_allowed(url,user_agent):
print(f"Can't crawl {url} due to robots.txt violation")
- return "", "", []
+ return "", "", set()
links = set()
@@ -91,12 +91,12 @@ def search_url(url, filepath, respect_robots_txt=True):
source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory
if not source_code.ok:
print(f'Status code not 2xx for {url}, returning.')
- return "", "", []
+ return "", "", set()
content_type = source_code.headers.get('Content-Type', '')
if 'text/html' not in content_type:
print(f'Content type for {url} not html, returning.')
- return "", "", []
+ return "", "", set()
soup = BeautifulSoup(source_code.content, 'html.parser')
content = soup.prettify()
@@ -110,12 +110,12 @@ def search_url(url, filepath, respect_robots_txt=True):
# we don't want to spider from bad sites.
# process_file does some regexp checks on the site to see if it is short / bad in some other way.
if deleted:
- return "", "", []
+ return "", "", set()
else:
print(f'skipping fs write for {url}, too large')
except Exception as e:
print(e)
- return "", "", []
+ return "", "", set()
current_url_without_fragment = urlparse(url)._replace(fragment='').geturl()
diff --git a/indexing/utils.py b/indexing/utils.py
@@ -8,7 +8,6 @@ import tqdm
import os
import re
-nltk.download('stopwords')
# get term frequencies for file given our list of terms (we only care about indexed terms which doesn't include certain terms like stop words)
def get_tfs(filepath, terms):
@@ -104,6 +103,7 @@ def get_filepaths(document_directory):
def get_log_doc_freqs(filepaths, lower_bound_percentage=0, upper_bound_percentage=1, filter_stop_words=False):
stop_words = set()
if filter_stop_words:
+ nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
current = {}
diff --git a/search/query.py b/search/query.py
@@ -0,0 +1,63 @@
+import sqlite3
+import sys
+from indexing.utils import get_words
+
+if __name__ == "__main__":
+
+ query = sys.argv[1]
+ limit = int(sys.argv[2])
+
+ query = get_words(query)
+ con = sqlite3.connect('database/manifest.db', timeout=60)
+ #tf(document_path, term, value)
+ cur = con.cursor()
+
+ idfs = {}
+
+ for term in query:
+ # term(name, idf);
+ cur.execute("SELECT idf from term where name = ?", (term,))
+ idf = cur.fetchone()
+ if len(idf) > 0:
+ idfs[term] = idf[0]
+ else:
+
+ # Should this be a really large value????
+ # there is a lot of information in a value that doesn't have an idf, unless we messed
+ # something up....
+
+ idfs[term] = 0
+ print("Couldn't find idf...")
+
+
+ tfidf = {}
+
+ for term in query:
+ cur.execute("SELECT document_path, value from tf where term = ? ORDER BY value desc", (term,))
+ rows = cur.fetchall()
+ for row in rows:
+ if tfidf.get(row[0]) is None:
+ tfidf[row[0]] = float(row[1]) * idfs[term]
+ else:
+ tfidf[row[0]] += float(row[1]) * idfs[term]
+
+ sorted_results = sorted(tfidf.items(), key=lambda x: x[1], reverse=True)
+
+
+# site(url, filepath, date, indexed)
+
+
+ count = 0
+ for doc_path, score in sorted_results:
+ if count > limit:
+ break
+ cur.execute("SELECT url from site where filepath = ? ORDER BY date LIMIT 1", (doc_path,))
+ rows = cur.fetchall()
+ # TODO: Add check for safety
+ if len(rows) > 0 and len(rows[0]) > 0:
+ url = rows[0][0]
+ print(f"{score:.4f} - {url}")
+ count += 1
+
+ cur.close()
+ con.close()