AI content detection - information-retrieval - Unnamed repository; edit this file 'description' to name the repository.

commit eb447c513be89e86ffa878fe0e9239a9978cd1e9
parent e709ee4038cceeb0574cdfc1e8999502ecb5c960
Author: Andrew Laack <andrew.laack@imbue.com>
Date:   Sat, 17 Jan 2026 16:31:51 -0600

AI content detection

Diffstat:
M .gitignore  | 2 ++
A embeddings/ai_detection/process.py  | 36 ++++++++++++++++++++++++++++++++++++
A embeddings/ai_detection/train.py  | 30 ++++++++++++++++++++++++++++++
A embeddings/docs/index.py  | 44 ++++++++++++++++++++++++++++++++++++++++++++
A embeddings/docs/similarity.py  | 39 +++++++++++++++++++++++++++++++++++++++

5 files changed, 151 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,5 @@ database/
 information_retrieval.egg-info/
 llm-search/build/*
 web-research/build/*
+embeddings/docs/embeddings.json
+*.csv
diff --git a/embeddings/ai_detection/process.py b/embeddings/ai_detection/process.py
@@ -0,0 +1,35 @@
+import pandas as pd
+import json
+from sentence_transformers import SentenceTransformer
+import os
+from pathlib import Path
+
+# Hugging Face identifier of the sentence-embedding model.
+NAME = 'sentence-transformers/all-MiniLM-L6-v2'
+# Local copy of the model lives under ~/models/<NAME>.
+PATH = Path(os.path.expanduser('~')) / 'models' / NAME
+EMBEDDING_PATH = 'embeddings.json'
+DATASET_PATH = os.path.expanduser('~/datasets/ai_slop.csv')
+
+# Treat the local copy as usable only if the directory exists and is not
+# empty; an empty directory can be left behind by an interrupted first run.
+if PATH.is_dir() and any(PATH.iterdir()):
+    model = SentenceTransformer(str(PATH))
+else:
+    # Download first and create the directory only afterwards, so a failed
+    # download never leaves an empty directory that a later run would mistake
+    # for a saved model.
+    model = SentenceTransformer(NAME)
+    print('downloaded model')
+    PATH.mkdir(parents=True, exist_ok=True)
+    model.save(str(PATH))
+    print('saved model')
+
+print('loaded model')
+
+def embed_doc(text):
+    """Encode ``text`` with the module-level ``model`` and return the result."""
+    embedding = model.encode(text)
+    return embedding
+
+# Name of the column holding the raw text. The same column is embedded and
+# then dropped, so it is selected by name in both places.
+TEXT_COLUMN = 'text'
+
+docs = pd.read_csv(DATASET_PATH)
+print(f"Loaded {len(docs)} records")
+
+docs['embedding'] = docs[TEXT_COLUMN].apply(embed_doc)
+
+# The raw text is not written to the output; only the remaining columns and
+# the new 'embedding' column are kept.
+docs = docs.drop(columns=[TEXT_COLUMN])
+print(docs)
+# NOTE(review): to_csv writes each embedding in its string form; train.py's
+# parse_embedding strips the brackets and splits on whitespace to read it
+# back. Keep the two in sync if the storage format changes.
+docs.to_csv('embeddings.csv', index=False)
\ No newline at end of file
diff --git a/embeddings/ai_detection/train.py b/embeddings/ai_detection/train.py
@@ -0,0 +1,29 @@
+import pandas as pd
+import numpy as np
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.model_selection import StratifiedKFold
+from statistics import mean, stdev
+
+# Load the embeddings table from the working directory (presumably the
+# 'embeddings.csv' written by process.py). The columns read further down are
+# "embedding" and "generated".
+df = pd.read_csv("embeddings.csv")
+
+def parse_embedding(embedding_str):
+    """Parse the text form of a vector into a 1-D float array.
+
+    Accepts the whitespace-separated form ``[0.1 0.2 ...]`` (which may span
+    several lines) as well as the comma-separated form ``[0.1, 0.2, ...]``.
+    Whitespace around the brackets is ignored.
+
+    Raises:
+        ValueError: if any element cannot be converted to ``float``.
+    """
+    # Strip outer whitespace first; otherwise a trailing newline would stop
+    # the closing bracket from being removed.
+    body = embedding_str.strip().strip("[]")
+    tokens = body.replace(",", " ").split()
+    return np.array([float(token) for token in tokens])
+
+
+df["embedding"] = df["embedding"].apply(parse_embedding)
+
+# Feature matrix with one row per sample. No scaling is applied here; the
+# name is kept as-is.
+x_scaled = np.stack(df["embedding"].values)
+y = df["generated"].values
+
+skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
+lst_accu_stratified = []
+
+for train_index, test_index in skf.split(x_scaled, y):
+    x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
+    y_train_fold, y_test_fold = y[train_index], y[test_index]
+    # A fresh, seeded tree per fold: without random_state the tree can differ
+    # between runs, so the reported accuracy would not be reproducible even
+    # though the folds are seeded.
+    tree_clf = DecisionTreeClassifier(random_state=1)
+    tree_clf.fit(x_train_fold, y_train_fold)
+    lst_accu_stratified.append(tree_clf.score(x_test_fold, y_test_fold))
+print(f'Mean Accuracy: { mean(lst_accu_stratified)*100}')
+# Report the spread across folds as well as the mean.
+print(f'Standard Deviation: { stdev(lst_accu_stratified)*100}')
\ No newline at end of file
diff --git a/embeddings/docs/index.py b/embeddings/docs/index.py
@@ -0,0 +1,44 @@
+import json
+from sentence_transformers import SentenceTransformer
+import os
+from pathlib import Path
+
+# Hugging Face identifier of the embedding model.
+NAME = 'Qwen/Qwen3-Embedding-0.6B'
+# Local copy of the model lives under ~/models/<NAME>.
+PATH = Path(os.path.expanduser('~')) / 'models' / NAME
+EMBEDDING_PATH = 'embeddings.json'
+DOCUMENT_PATH = Path(os.path.expanduser('~/gitRepos/notes/docs'))
+
+# Treat the local copy as usable only if the directory exists and is not
+# empty; an empty directory can be left behind by an interrupted first run.
+if PATH.is_dir() and any(PATH.iterdir()):
+    model = SentenceTransformer(str(PATH))
+else:
+    # Download first and create the directory only afterwards, so a failed
+    # download never leaves an empty directory that a later run would mistake
+    # for a saved model.
+    model = SentenceTransformer(NAME)
+    print('downloaded model')
+    PATH.mkdir(parents=True, exist_ok=True)
+    model.save(str(PATH))
+    print('saved model')
+
+print('loaded model')
+
+
+# Index regular files only: os.listdir() also returns sub-directories, and
+# open() on a directory raises IsADirectoryError. Sorting makes the indexing
+# order the same on every run.
+docs = sorted(
+    name for name in os.listdir(DOCUMENT_PATH)
+    if (DOCUMENT_PATH / name).is_file()
+)
+print(f"Found {len(docs)} docs")
+abs_path = [str(DOCUMENT_PATH / name) for name in docs]
+
+
+# load this all in mem. might be large depending on corpus...
+# Maps each document's absolute path to its embedding as a plain list.
+documents = {}
+
+# we assume each document fits within context length for sentence embedding
+for count, path in enumerate(abs_path, start=1):
+    # Explicit encoding so the result does not depend on the locale.
+    with open(path, 'r', encoding='utf-8') as f:
+        text = f.read()
+    documents[path] = model.encode(text).tolist()
+    print(f"{count}: Indexed {path}")
+
+with open(EMBEDDING_PATH, 'w') as f:
+    json.dump(documents, f)
diff --git a/embeddings/docs/similarity.py b/embeddings/docs/similarity.py
@@ -0,0 +1,39 @@
+import os
+from pathlib import Path
+from sentence_transformers import SentenceTransformer
+import json
+
+EMBEDDING_PATH = 'embeddings.json'
+NAME = 'Qwen/Qwen3-Embedding-0.6B'
+# Local model directory: <home>/models/<NAME>
+PATH = Path(os.path.expanduser('~') + '/models/' + NAME)
+
+# Stored index: each key is used below as a document location, each value as
+# that document's embedding.
+with open(EMBEDDING_PATH, 'r') as embedding_file:
+    doc_dict = json.load(embedding_file)
+
+model = SentenceTransformer(str(PATH))
+print('loaded model')
+
+query = input('Enter your query: ')
+
+# The query is encoded as a one-element batch; keep its single vector as a
+# plain list.
+query_batch = model.encode([query])
+embedded_query = query_batch.tolist()[0]
+
+
+# Score every stored document against the query in a single similarity call
+# instead of one call per document.
+doc_paths = list(doc_dict)
+if doc_paths:
+    doc_matrix = [doc_dict[pth] for pth in doc_paths]
+    # The result has one row (the query) and one column per document; take
+    # that row as plain Python floats so sorting and printing need no tensors.
+    scores = model.similarity([embedded_query], doc_matrix)[0].tolist()
+else:
+    # Nothing indexed: skip the call rather than pass an empty matrix.
+    scores = []
+
+# Maps document path -> similarity score (float).
+similarities = dict(zip(doc_paths, scores))
+
+
+# Number of best matches to print.
+k = 10
+
+# Highest score first; the slice keeps at most k entries.
+for path in sorted(similarities, key=similarities.get, reverse=True)[:k]:
+    print(f"Location: {path}\t\t\tScore {similarities[path]}")

	information-retrieval Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

M	.gitignore	\|	2	++
A	embeddings/ai_detection/process.py	\|	36	++++++++++++++++++++++++++++++++++++
A	embeddings/ai_detection/train.py	\|	30	++++++++++++++++++++++++++++++
A	embeddings/docs/index.py	\|	44	++++++++++++++++++++++++++++++++++++++++++++
A	embeddings/docs/similarity.py	\|	39	+++++++++++++++++++++++++++++++++++++++