information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 677346e211c3c9544f3708ab117f2f61e2b27f01
parent 6c374cfe6d87c097ed0f3fe655d4b5ce0cc68837
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Thu,  1 Jan 2026 17:48:23 -0600

Sent my first queries!

Diffstat:
M TODO.md | 5 +++++
M collection/spider.py | 18 +++++++++---------
M indexing/utils.py | 2 +-
A search/query.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 78 insertions(+), 10 deletions(-)

diff --git a/TODO.md b/TODO.md @@ -31,6 +31,11 @@ --- +- update idf to be incrementally calculated + - constantly updating as things change in the corpus + - hmm, what about when we remove old stuff though? that stuff will throw off the stats more and more over time... + - might not want this to be incremental after all... +- indexing should include adding language - add centralized indexing - i added an indexed field to support this idea for incremental indexing - smarter queueing diff --git a/collection/spider.py b/collection/spider.py @@ -46,7 +46,7 @@ import time # bytes MAX_SIZE = 2_000_000 -MAX_WORKERS = 250 +MAX_WORKERS = 50 MAX_URLS_PER_SITE = 100 NOT_INDEXED = 0 INDEXED = 1 @@ -59,7 +59,7 @@ def should_queue(url, cur): """, (url, cutoff)) return cur.fetchone() is None -def is_allowed(url, user_agent, timeout=.1): +def is_allowed(url, user_agent, timeout=.5): try: parsed = urlparse(url) robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt" @@ -72,14 +72,14 @@ def is_allowed(url, user_agent, timeout=.1): return True # you should always repect robots.txt, but if you are trying to do something with this spider I guess you can -# disable it. please don't do this enmasse though, that's naughty. - +# disable it. please don't do this en-masse though, that's naughty. +# TODO: Check the size with a HEAD request prior to reading into memory. 
def search_url(url, filepath, respect_robots_txt=True): user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0' if respect_robots_txt: if not is_allowed(url,user_agent): print(f"Can't crawl {url} due to robots.txt violation") - return "", "", [] + return "", "", set() links = set() @@ -91,12 +91,12 @@ def search_url(url, filepath, respect_robots_txt=True): source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory if not source_code.ok: print(f'Status code not 2xx for {url}, returning.') - return "", "", [] + return "", "", set() content_type = source_code.headers.get('Content-Type', '') if 'text/html' not in content_type: print(f'Content type for {url} not html, returning.') - return "", "", [] + return "", "", set() soup = BeautifulSoup(source_code.content, 'html.parser') content = soup.prettify() @@ -110,12 +110,12 @@ def search_url(url, filepath, respect_robots_txt=True): # we don't want to spider from bad sites. # process_file does some regexp checks on the site to see if it is short / bad in some other way. 
if deleted: - return "", "", [] + return "", "", set() else: print(f'skipping fs write for {url}, too large') except Exception as e: print(e) - return "", "", [] + return "", "", set() current_url_without_fragment = urlparse(url)._replace(fragment='').geturl() diff --git a/indexing/utils.py b/indexing/utils.py @@ -8,7 +8,6 @@ import tqdm import os import re -nltk.download('stopwords') # get term frequencies for file given our list of terms (we only care about indexed terms which doesn't include certain terms like stop words) def get_tfs(filepath, terms): @@ -104,6 +103,7 @@ def get_filepaths(document_directory): def get_log_doc_freqs(filepaths, lower_bound_percentage=0, upper_bound_percentage=1, filter_stop_words=False): stop_words = set() if filter_stop_words: + nltk.download('stopwords') stop_words = set(stopwords.words('english')) current = {} diff --git a/search/query.py b/search/query.py @@ -0,0 +1,63 @@ +import sqlite3 +import sys +from indexing.utils import get_words + +if __name__ == "__main__": + + query = sys.argv[1] + limit = int(sys.argv[2]) + + query = get_words(query) + con = sqlite3.connect('database/manifest.db', timeout=60) + #tf(document_path, term, value) + cur = con.cursor() + + idfs = {} + + for term in query: + # term(name, idf); + cur.execute("SELECT idf from term where name = ?", (term,)) + idf = cur.fetchone() + if len(idf) > 0: + idfs[term] = idf[0] + else: + + # Should this be a really large value???? + # there is a lot of information in a value that doesn't have an idf, unless we messed + # something up.... + + idfs[term] = 0 + print("Couldn't find idf...") + + + tfidf = {} + + for term in query: + cur.execute("SELECT document_path, value from tf where term = ? 
ORDER BY value desc", (term,)) + rows = cur.fetchall() + for row in rows: + if tfidf.get(row[0]) is None: + tfidf[row[0]] = float(row[1]) * idfs[term] + else: + tfidf[row[0]] += float(row[1]) * idfs[term] + + sorted_results = sorted(tfidf.items(), key=lambda x: x[1], reverse=True) + + +# site(url, filepath, date, indexed) + + + count = 0 + for doc_path, score in sorted_results: + if count > limit: + break + cur.execute("SELECT url from site where filepath = ? ORDER BY date LIMIT 1", (doc_path,)) + rows = cur.fetchall() + # TODO: Add check for safety + if len(rows) > 0 and len(rows[0]) > 0: + url = rows[0][0] + print(f"{score:.4f} - {url}") + count += 1 + + cur.close() + con.close()