information-retrieval

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit eb447c513be89e86ffa878fe0e9239a9978cd1e9
parent e709ee4038cceeb0574cdfc1e8999502ecb5c960
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Sat, 17 Jan 2026 16:31:51 -0600

AI content detection

Diffstat:
M .gitignore | 2 ++
A embeddings/ai_detection/process.py | 36 ++++++++++++++++++++++++++++++++++++
A embeddings/ai_detection/train.py | 30 ++++++++++++++++++++++++++++++
A embeddings/docs/index.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
A embeddings/docs/similarity.py | 39 +++++++++++++++++++++++++++++++++++++++
5 files changed, 151 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -6,3 +6,5 @@ database/ information_retrieval.egg-info/ llm-search/build/* web-research/build/* +embeddings/docs/embeddings.json +*.csv diff --git a/embeddings/ai_detection/process.py b/embeddings/ai_detection/process.py @@ -0,0 +1,35 @@ +import pandas as pd +import json +from sentence_transformers import SentenceTransformer +import os +from pathlib import Path + +NAME = 'sentence-transformers/all-MiniLM-L6-v2' +# expand '~' +PATH = Path(str(os.path.expanduser('~')) + '/models/' + NAME) +EMBEDDING_PATH = 'embeddings.json' +DATASET_PATH = os.path.expanduser('~/datasets/ai_slop.csv') + +model = None + +if not Path.exists(PATH): + os.makedirs(PATH) + model = SentenceTransformer(NAME) + print('downloaded model') + model.save(str(PATH)) + print('saved model') + +model = SentenceTransformer(str(PATH)) +print('loaded model') + +def embed_doc(text): + return model.encode(text) + +docs = pd.read_csv(DATASET_PATH) +print(f"Loaded {len(docs)} records") + +docs['embedding'] = docs.iloc[:, 0].apply(embed_doc) + +docs = docs.drop('text', axis = 1) +print(docs) +docs.to_csv('embeddings.csv', index=False)+ \ No newline at end of file diff --git a/embeddings/ai_detection/train.py b/embeddings/ai_detection/train.py @@ -0,0 +1,29 @@ +import pandas as pd +import numpy as np +from sklearn.tree import DecisionTreeClassifier +from sklearn.model_selection import StratifiedKFold +from statistics import mean, stdev + +df = pd.read_csv("embeddings.csv") + +def parse_embedding(embedding_str): + cleaned = embedding_str.strip("[]").split() + return np.array([float(x) for x in cleaned]) + + +df["embedding"] = df["embedding"].apply(parse_embedding) + +x_scaled = np.stack(df["embedding"].values) +y = df["generated"].values + +tree_clf = DecisionTreeClassifier() + +skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True) +lst_accu_stratified = [] + +for train_index, test_index in skf.split(x_scaled, y): + x_train_fold, x_test_fold = 
x_scaled[train_index], x_scaled[test_index] + y_train_fold, y_test_fold = y[train_index], y[test_index] + tree_clf.fit(x_train_fold, y_train_fold) + lst_accu_stratified.append(tree_clf.score(x_test_fold, y_test_fold)) +print(f'Mean Accuracy: { mean(lst_accu_stratified)*100}')+ \ No newline at end of file diff --git a/embeddings/docs/index.py b/embeddings/docs/index.py @@ -0,0 +1,44 @@ +import json +from sentence_transformers import SentenceTransformer +import os +from pathlib import Path + +NAME = 'Qwen/Qwen3-Embedding-0.6B' +# expand '~' +PATH = Path(str(os.path.expanduser('~')) + '/models/' + NAME) +EMBEDDING_PATH = 'embeddings.json' +DOCUMENT_PATH = Path(os.path.expanduser('~/gitRepos/notes/docs')) + +model = None + +if not Path.exists(PATH): + os.makedirs(PATH) + model = SentenceTransformer(NAME) + print('downloaded model') + model.save(str(PATH)) + print('saved model') + +model = SentenceTransformer(str(PATH)) +print('loaded model') + + +docs = os.listdir(DOCUMENT_PATH) +print(f"Found {len(docs)} docs") +abs_path = [str(DOCUMENT_PATH) + '/' + name for name in docs] + + +# load this all in mem. might be large depending on corpus... 
+documents = {} +count = 1 + +# we assume each document fits within context length for sentence embedding +for path in abs_path: + with open(path, 'r') as f: + documents[path] = model.encode(f.read()).tolist() + print(f"{count}: Indexed {path}") + count += 1 + +dumped = json.dumps(documents) + +with open(EMBEDDING_PATH, 'w') as f: + f.write(dumped) diff --git a/embeddings/docs/similarity.py b/embeddings/docs/similarity.py @@ -0,0 +1,39 @@ +import os +from pathlib import Path +from sentence_transformers import SentenceTransformer +import json + +EMBEDDING_PATH = 'embeddings.json' +NAME = 'Qwen/Qwen3-Embedding-0.6B' +PATH = Path(str(os.path.expanduser('~')) + '/models/' + NAME) + +doc_dict = {} + +with open(EMBEDDING_PATH, 'r') as f: + doc_dict = json.load(f) + +model = SentenceTransformer(str(PATH)) +print('loaded model') + +query = input('Enter your query: ') + +embedded_query = model.encode([query]).tolist()[0] + + +similarities = {} + +current_idx = 0 +for pth in doc_dict: + document_embedding = doc_dict[pth] + similarity = model.similarity(document_embedding, embedded_query) + similarities[pth] = similarity + + +k = 10 +count = 0 + +for path in sorted(similarities, key=similarities.get, reverse=True): + if count == k: + break + print(f"Location: {path}\t\t\tScore {similarities[path][0][0]}") + count += 1