commit eb447c513be89e86ffa878fe0e9239a9978cd1e9
parent e709ee4038cceeb0574cdfc1e8999502ecb5c960
Author: Andrew Laack <andrew.laack@imbue.com>
Date: Sat, 17 Jan 2026 16:31:51 -0600
AI content detection
Diffstat:
5 files changed, 151 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,5 @@ database/
information_retrieval.egg-info/
llm-search/build/*
web-research/build/*
+embeddings/docs/embeddings.json
+*.csv
diff --git a/embeddings/ai_detection/process.py b/embeddings/ai_detection/process.py
@@ -0,0 +1,35 @@
+import pandas as pd
+import json
+from sentence_transformers import SentenceTransformer
+import os
+from pathlib import Path
+
+NAME = 'sentence-transformers/all-MiniLM-L6-v2'
+# Local cache directory for the model: <home>/models/<NAME>
+PATH = Path(os.path.expanduser('~')) / 'models' / NAME
+EMBEDDING_PATH = 'embeddings.json'
+DATASET_PATH = os.path.expanduser('~/datasets/ai_slop.csv')
+
+# Treat the cache as usable only when the directory exists AND has content.
+# An empty directory (e.g. left behind by an interrupted first run) is
+# handled like a missing one so the script can recover by itself.
+if PATH.is_dir() and any(PATH.iterdir()):
+    model = SentenceTransformer(str(PATH))
+else:
+    # Download first and create the directory only afterwards, so a failed
+    # download never leaves an empty cache directory behind.
+    model = SentenceTransformer(NAME)
+    print('downloaded model')
+    PATH.mkdir(parents=True, exist_ok=True)
+    model.save(str(PATH))
+    print('saved model')
+    # The freshly downloaded instance is reused; reloading it from disk
+    # would only repeat the (slow) model initialisation.
+
+print('loaded model')
+
+def embed_doc(text):
+    """Return the encoding the module-level ``model`` produces for ``text``."""
+    embedding = model.encode(text)
+    return embedding
+
+docs = pd.read_csv(DATASET_PATH)
+print(f"Loaded {len(docs)} records")
+
+# Read the text column by name: it is dropped by name below, so selecting it
+# by position could embed one column and then drop a different one.
+texts = docs['text'].fillna('').astype(str).tolist()
+
+# Encode all rows in one call so the model can batch them internally instead
+# of being invoked once per row.
+vectors = model.encode(texts)
+
+# Serialise each vector explicitly as "[v0 v1 ... vn]" (whitespace separated,
+# in brackets). Relying on NumPy's default str() would round the values and,
+# for long vectors, abbreviate them with "...", which cannot be parsed back.
+docs['embedding'] = [
+    '[' + ' '.join(repr(float(value)) for value in vector) + ']'
+    for vector in vectors
+]
+
+docs = docs.drop(columns='text')
+print(docs)
+docs.to_csv('embeddings.csv', index=False)
\ No newline at end of file
diff --git a/embeddings/ai_detection/train.py b/embeddings/ai_detection/train.py
@@ -0,0 +1,29 @@
+import pandas as pd
+import numpy as np
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.model_selection import StratifiedKFold
+from statistics import mean, stdev
+
+# Load the table of embeddings from the current working directory. The code
+# below reads two columns from it: "embedding" (a bracketed vector stored as
+# text) and "generated" (the class label).
+df = pd.read_csv("embeddings.csv")
+
+def parse_embedding(embedding_str):
+    """Convert a bracketed vector stored as text back into a NumPy array.
+
+    Accepts values separated by whitespace (including line breaks) and/or
+    commas, e.g. ``"[0.1 0.2\n 0.3]"`` or ``"[0.1, 0.2, 0.3]"``. Surrounding
+    whitespace is ignored.
+
+    Args:
+        embedding_str: Textual form of the vector, optionally wrapped in
+            square brackets.
+
+    Returns:
+        A one-dimensional ``numpy.ndarray`` of floats.
+
+    Raises:
+        ValueError: If the text contains an ellipsis (an abbreviated array
+            dump whose values are lost) or a token that is not a number.
+    """
+    # Strip whitespace first: otherwise a leading/trailing blank would shield
+    # the brackets from being removed.
+    tokens = embedding_str.strip().strip("[]").replace(",", " ").split()
+    if "..." in tokens:
+        raise ValueError(
+            "embedding was abbreviated with '...' when it was written; "
+            "regenerate it with every value included"
+        )
+    return np.array([float(token) for token in tokens])
+
+
+df["embedding"] = df["embedding"].apply(parse_embedding)
+
+# Feature matrix: one row per record, built by stacking the parsed vectors.
+# (No scaling is applied; the values are used exactly as parsed.)
+features = np.stack(df["embedding"].values)
+y = df["generated"].values
+
+# Shuffled, stratified 5-fold split with a fixed seed so folds are repeatable.
+skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
+lst_accu_stratified = []
+
+for train_index, test_index in skf.split(features, y):
+    # A fresh, seeded classifier per fold keeps the folds independent and
+    # makes the reported numbers reproducible from run to run.
+    tree_clf = DecisionTreeClassifier(random_state=1)
+    tree_clf.fit(features[train_index], y[train_index])
+    lst_accu_stratified.append(
+        tree_clf.score(features[test_index], y[test_index])
+    )
+
+print(f'Mean Accuracy: { mean(lst_accu_stratified)*100}')
+# Report the spread across folds as well, so the mean can be judged.
+print(f'Std Dev: {stdev(lst_accu_stratified)*100}')
\ No newline at end of file
diff --git a/embeddings/docs/index.py b/embeddings/docs/index.py
@@ -0,0 +1,44 @@
+import json
+from sentence_transformers import SentenceTransformer
+import os
+from pathlib import Path
+
+NAME = 'Qwen/Qwen3-Embedding-0.6B'
+# Local cache directory for the model: <home>/models/<NAME>
+PATH = Path(os.path.expanduser('~')) / 'models' / NAME
+EMBEDDING_PATH = 'embeddings.json'
+DOCUMENT_PATH = Path(os.path.expanduser('~/gitRepos/notes/docs'))
+
+# Treat the cache as usable only when the directory exists AND has content.
+# An empty directory (e.g. left behind by an interrupted first run) is
+# handled like a missing one so the script can recover by itself.
+if PATH.is_dir() and any(PATH.iterdir()):
+    model = SentenceTransformer(str(PATH))
+else:
+    # Download first and create the directory only afterwards, so a failed
+    # download never leaves an empty cache directory behind.
+    model = SentenceTransformer(NAME)
+    print('downloaded model')
+    PATH.mkdir(parents=True, exist_ok=True)
+    model.save(str(PATH))
+    print('saved model')
+    # The freshly downloaded instance is reused; reloading it from disk
+    # would only repeat the (slow) model initialisation.
+
+print('loaded model')
+
+
+# Index regular files only: a sub-directory inside DOCUMENT_PATH cannot be
+# opened as a file and would abort the run. Sorting keeps the processing
+# order (and therefore the output file) stable between runs.
+doc_paths = sorted(p for p in DOCUMENT_PATH.iterdir() if p.is_file())
+print(f"Found {len(doc_paths)} docs")
+
+
+# load this all in mem. might be large depending on corpus...
+documents = {}
+
+# we assume each document fits within context length for sentence embedding
+for count, path in enumerate(doc_paths, start=1):
+    # Decode explicitly as UTF-8 instead of the platform's default encoding.
+    with open(path, 'r', encoding='utf-8') as f:
+        # Keyed by the document's path; the vector is stored as a plain list
+        # so it can be written out as JSON.
+        documents[str(path)] = model.encode(f.read()).tolist()
+    print(f"{count}: Indexed {path}")
+
+# Stream the mapping straight into the output file.
+with open(EMBEDDING_PATH, 'w') as f:
+    json.dump(documents, f)
diff --git a/embeddings/docs/similarity.py b/embeddings/docs/similarity.py
@@ -0,0 +1,39 @@
+import os
+from pathlib import Path
+from sentence_transformers import SentenceTransformer
+import json
+
+EMBEDDING_PATH = 'embeddings.json'
+NAME = 'Qwen/Qwen3-Embedding-0.6B'
+# Directory the model is loaded from: <home>/models/<NAME>
+PATH = Path(os.path.expanduser('~')) / 'models' / NAME
+
+# Contents of EMBEDDING_PATH; the code below treats it as a mapping from a
+# document location to that document's embedding.
+with open(EMBEDDING_PATH, 'r') as f:
+    doc_dict = json.load(f)
+
+model = SentenceTransformer(str(PATH))
+print('loaded model')
+
+query = input('Enter your query: ')
+
+# Encode the query as a batch of one and keep its single row as a plain list.
+query_batch = model.encode([query])
+embedded_query = query_batch[0].tolist()
+
+
+# Maps each document location to its similarity score (a plain float).
+similarities = {}
+
+# An empty index has nothing to score; skipping the call avoids handing the
+# model an empty batch.
+if doc_dict:
+    doc_paths = list(doc_dict)
+    # Score every document in ONE call: comparing the single query vector
+    # against the stacked document vectors yields a (1, n_docs) matrix, whose
+    # only row holds one score per document in the order of doc_paths.
+    score_row = model.similarity(
+        [embedded_query],
+        [doc_dict[pth] for pth in doc_paths],
+    )[0]
+    similarities = dict(zip(doc_paths, score_row.tolist()))
+
+
+# Number of best matches to print.
+k = 10
+
+# Highest score first; the slice caps the output at k entries.
+for path in sorted(similarities, key=similarities.get, reverse=True)[:k]:
+    print(f"Location: {path}\t\t\tScore {similarities[path]}")